IKEA_сбор данных по ссылке на товар в текстовом файле.
This commit is contained in:
parent
20bd54dd3c
commit
790abe8b95
1
Парсер_IKEA/links.txt
Normal file
1
Парсер_IKEA/links.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
https://www.ikea.com/pl/pl/p/indira-narzuta-zoltobezowy-20582629/#content
|
||||||
200
Парсер_IKEA/main.py
Normal file
200
Парсер_IKEA/main.py
Normal file
@ -0,0 +1,200 @@
|
|||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import html
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from openpyxl import Workbook
|
||||||
|
|
||||||
|
# Resolve all paths relative to this script so it runs from any CWD.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_FILE = os.path.join(BASE_DIR, "links.txt")    # one product URL per line
OUTPUT_FILE = os.path.join(BASE_DIR, "result.xlsx")  # Excel report written by main()

# CSS selector of the product container carrying the hydration JSON payload.
CSS_SELECTOR = ".pip-product__subgrid.product-pip.js-product-pip"
# Top-level keys of the hydration JSON that get flattened into columns.
BLOCKS = [
    "buyModule",
    "productSummary",
    "pipPricePackage",
    "productInformationSection",
    "keyFacts",
    "stockcheckSection",
    "availabilityGroup",
    "productGallery"
]
# ── which columns are kept in the final spreadsheet ─────────────────
KEEP_COLUMNS = [
    "availabilityGroup.serverOnlineSellable",
    "availabilityGroup.storeHeader",
    "buyModule.onlineSellable",
    "buyModule.productName",
    "buyModule.productPrice",
    "buyModule.productType",
    "keyFacts.ariaLabels",
    "keyFacts.gaLabel",
    "keyFacts.keyFacts",
    "pipPricePackage.measurementText",
    "pipPricePackage.productDescription",
    "productGallery.urls",
    "productInformationSection.dimensionProps",
    "productInformationSection.productDetailsProps",
    "productSummary.description",
    "productSummary.visibleItemNo",
    "stockcheckSection.packagingProps",
    "stockcheckSection.typeName",
    "url",
    "categoryBreadcrumb",
]
|
||||||
|
|
||||||
|
def flatten_block(block_name, data):
    """Flatten one hydration-JSON block into ``{"<block>.<key>": value}`` pairs.

    Args:
        block_name: Name of the top-level block (e.g. ``"buyModule"``).
        data: The block's payload; anything other than a dict yields ``{}``.

    Returns:
        A flat dict keyed ``"<block_name>.<key>"``.  Special case: for the
        ``"productGallery"`` block, ``mediaList[*].content.url`` is collapsed
        into a single ``"productGallery.urls"`` key (newline-joined URLs) and
        every other gallery field is discarded.
    """
    if not isinstance(data, dict):
        return {}

    flat = {}
    for key, value in data.items():
        # === mediaList.content.url → productGallery.urls ===
        if block_name == "productGallery" and key == "mediaList":
            if isinstance(value, list):
                urls = []
                for item in value:
                    if not isinstance(item, dict):  # skip malformed entries
                        continue
                    content = item.get("content", {})
                    if isinstance(content, dict) and "url" in content:
                        urls.append(content["url"])
                # Only the URLs matter for the gallery; drop everything else.
                return {"productGallery.urls": "\n".join(urls)}
            continue

        # === all remaining fields — default handling ===
        flat[f"{block_name}.{key}"] = value

    return flat
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def extract_data(url):
    """Fetch an IKEA product page and return a dict of selected fields.

    The product data is read from the ``data-hydration-props`` attribute of
    the main product container (located via ``CSS_SELECTOR``); the blocks
    listed in ``BLOCKS`` are flattened through ``flatten_block()``.
    Additionally the page's JSON-LD ``BreadcrumbList`` is joined into a
    ``'categoryBreadcrumb'`` key of the form
    ``'Produkty/Tekstylia/Tekstylia do sypialni/Narzuty na łóżko'``.

    Never raises: on any failure a dict ``{"url": url, "error": <msg>}``
    is returned so one bad link cannot abort the whole run.
    """
    try:
        response = requests.get(url, timeout=10,
                                headers={"User-Agent": "Mozilla/5.0"})
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # ── main JSON from data-hydration-props ────────────────────
        target = soup.select_one(CSS_SELECTOR)
        if not target:
            return {"url": url, "error": "CSS selector not found"}

        raw = target.get("data-hydration-props")
        if not raw:
            return {"url": url, "error": "data-hydration-props not found"}

        # Attribute value is HTML-escaped JSON.
        full_json = json.loads(html.unescape(raw))
        result = {"url": url}

        # Pull out the blocks we care about.
        for block in BLOCKS:
            result.update(flatten_block(block, full_json.get(block, {})))

        # ── JSON-LD BreadcrumbList → categoryBreadcrumb ────────────
        breadcrumb = None
        for tag in soup.find_all("script",
                                 attrs={"type": lambda t: t and "ld+json" in t}):
            if not tag.string:  # empty <script> — nothing to parse
                continue
            try:
                data = json.loads(tag.string)
            except Exception:
                continue  # malformed JSON-LD: skip, don't abort

            # JSON-LD may be an array; pick the BreadcrumbList object.
            if isinstance(data, list):
                data = next((d for d in data if isinstance(d, dict)
                             and d.get("@type") == "BreadcrumbList"), None)

            if isinstance(data, dict) and data.get("@type") == "BreadcrumbList":
                items = data.get("itemListElement", [])
                names = [it.get("name", "") for it in items
                         if isinstance(it, dict)]
                breadcrumb = "/".join(names)
                break  # found the block we need — stop scanning

        if breadcrumb:
            result["categoryBreadcrumb"] = breadcrumb

        return result

    except Exception as e:
        # Network / HTTP / parsing errors are reported per-URL.
        return {"url": url, "error": str(e)}
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Read product URLs from INPUT_FILE, scrape each one, write OUTPUT_FILE.

    Columns are fixed by KEEP_COLUMNS (see top of file); any field a page
    yields outside that whitelist is dropped before writing the workbook.
    """
    # ── read the links ───────────────────────────────────────────
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        links = [line.strip() for line in f if line.strip()]

    rows = []

    # Fixed column layout (see KEEP_COLUMNS at the top of the file).
    all_columns = KEEP_COLUMNS
    keep = set(KEEP_COLUMNS)  # O(1) membership test inside the loop

    print("🔍 Извлечение данных...")
    for idx, link in enumerate(links, 1):
        print(f"[{idx}/{len(links)}] {link}")
        row = extract_data(link)

        # Keep only the whitelisted fields.
        rows.append({k: v for k, v in row.items() if k in keep})

    def safe(val):
        """Serialize dict/list cells to JSON text, None → '' (Excel cells
        cannot hold Python containers)."""
        if isinstance(val, (dict, list)):
            return json.dumps(val, ensure_ascii=False)
        return "" if val is None else val

    print("📤 Сохраняем Excel...")
    wb = Workbook()
    ws = wb.active
    ws.title = "IKEA Products"
    ws.append(all_columns)  # header row

    for row in rows:
        ws.append([safe(row.get(col, "")) for col in all_columns])

    wb.save(OUTPUT_FILE)
    print(f"\n✅ Готово: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
|
||||||
BIN
Парсер_IKEA/result.xlsx
Normal file
BIN
Парсер_IKEA/result.xlsx
Normal file
Binary file not shown.
BIN
Парсер_IKEA/~$result.xlsx
Normal file
BIN
Парсер_IKEA/~$result.xlsx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user