import html
import json
import os

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_FILE = os.path.join(BASE_DIR, "links.txt")
OUTPUT_FILE = os.path.join(BASE_DIR, "result.xlsx")

CSS_SELECTOR = ".pip-product__subgrid.product-pip.js-product-pip"

BLOCKS = [
    "buyModule",
    "productSummary",
    "pipPricePackage",
    "productInformationSection",
    "keyFacts",
    "stockcheckSection",
    "availabilityGroup",
    "productGallery",
]

# ── which columns to keep ────────────────────────────────────────────
KEEP_COLUMNS = [
    "availabilityGroup.serverOnlineSellable",
    "availabilityGroup.storeHeader",
    "buyModule.onlineSellable",
    "buyModule.productName",
    "buyModule.productPrice",
    "buyModule.productType",
    "keyFacts.ariaLabels",
    "keyFacts.gaLabel",
    "keyFacts.keyFacts",
    "pipPricePackage.measurementText",
    "pipPricePackage.productDescription",
    "productGallery.urls",
    "productInformationSection.dimensionProps",
    "productInformationSection.productDetailsProps",
    "productSummary.description",
    "productSummary.visibleItemNo",
    "stockcheckSection.packagingProps",
    "stockcheckSection.typeName",
    "url",
    "categoryBreadcrumb",
]


def flatten_block(block_name, data):
    """Flattens one hydration block into 'blockName.key' entries."""
    if not isinstance(data, dict):
        return {}
    flat = {}
    for k, v in data.items():
        # === 1. dimensionProps.images (disabled) ===
        # if block_name == "productInformationSection" and k == "dimensionProps":
        #     if isinstance(v, dict):
        #         urls = []
        #         for img in v.get("images", []):
        #             if isinstance(img, dict):
        #                 url = img.get("url")
        #                 if url:
        #                     urls.append(url)
        #         flat[f"{block_name}.{k}.images_urls"] = "\n".join(urls)
        #     continue

        # === 2. mediaList.content.url → productGallery.urls ===
        if block_name == "productGallery" and k == "mediaList":
            if isinstance(v, list):
                urls = []
                for item in v:
                    content = item.get("content", {}) if isinstance(item, dict) else {}
                    if isinstance(content, dict) and "url" in content:
                        urls.append(content["url"])
                flat["productGallery.urls"] = "\n".join(urls)
                return flat  # ⬅ return only the urls; ignore the remaining fields
            continue

        # === All other fields: passed through as-is ===
        key = f"{block_name}.{k}"
        flat[key] = v
    return flat


def extract_data(url):
    """
    Returns a dict with the required IKEA product fields.
    NEW: also adds a 'categoryBreadcrumb' key such as
    'Produkty/Tekstylia/Tekstylia do sypialni/Narzuty na łóżko'
    (taken from the JSON-LD BreadcrumbList).
    """
    try:
        response = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # ── main JSON from data-hydration-props ──────────────────────
        target = soup.select_one(CSS_SELECTOR)
        if not target:
            return {"url": url, "error": "CSS selector not found"}
        raw = target.get("data-hydration-props")
        if not raw:
            return {"url": url, "error": "data-hydration-props not found"}
        decoded = html.unescape(raw)
        full_json = json.loads(decoded)

        result = {"url": url}
        # pull out the blocks we need
        for block in BLOCKS:
            result.update(flatten_block(block, full_json.get(block, {})))

        # ── NEW: extract BreadcrumbList → categoryBreadcrumb ─────────
        breadcrumb = None
        for tag in soup.find_all("script", attrs={"type": lambda t: t and "ld+json" in t}):
            try:
                data = json.loads(tag.string)
            except Exception:
                continue
            # if this is a JSON-LD array, look for the BreadcrumbList object in it
            if isinstance(data, list):
                data = next(
                    (d for d in data if isinstance(d, dict) and d.get("@type") == "BreadcrumbList"),
                    None,
                )
            if isinstance(data, dict) and data.get("@type") == "BreadcrumbList":
                items = data.get("itemListElement", [])
                names = [it.get("name", "") for it in items]
                breadcrumb = "/".join(names)
                break  # found the block we need, leave the loop
        if breadcrumb:
            result["categoryBreadcrumb"] = breadcrumb

        return result
    except Exception as e:
        return {"url": url, "error": str(e)}


def main():
    # ── read the links ───────────────────────────────────────────
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        links = [line.strip() for line in f if line.strip()]

    rows = []

    # ---- COLUMN MODE --------------------------------------------
    # NEW: fixed column list (see KEEP_COLUMNS above)
    all_columns = KEEP_COLUMNS
    # OLD (restore if needed):
    # all_columns = set()  # ← accumulated every field
    # --------------------------------------------------------------

    print("🔍 Extracting data...")
    for idx, link in enumerate(links, 1):
        print(f"[{idx}/{len(links)}] {link}")
        row = extract_data(link)
        # NEW: keep only the fields listed in KEEP_COLUMNS
        row = {k: v for k, v in row.items() if k in KEEP_COLUMNS}
        # OLD (restore if needed):
        # all_columns.update(row.keys())  # ← collected every key
        rows.append(row)

    # OLD (restore if needed):
    # if isinstance(all_columns, set):
    #     all_columns = sorted(all_columns)  # sorted the full set

    def safe(val):
        """Converts dict / list to a JSON string; None → ''."""
        if isinstance(val, (dict, list)):
            return json.dumps(val, ensure_ascii=False)
        return "" if val is None else val

    print("📤 Saving Excel...")
    wb = Workbook()
    ws = wb.active
    ws.title = "IKEA Products"
    ws.append(all_columns)
    for row in rows:
        ws.append([safe(row.get(col, "")) for col in all_columns])
    wb.save(OUTPUT_FILE)
    print(f"\n✅ Done: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()