IKEA: collect product data from the links in a text file.

This commit is contained in:
va1is 2025-08-01 12:05:02 +03:00
parent 20bd54dd3c
commit 790abe8b95
4 changed files with 201 additions and 0 deletions

Парсер_IKEA/links.txt Normal file

@@ -0,0 +1 @@
https://www.ikea.com/pl/pl/p/indira-narzuta-zoltobezowy-20582629/#content

Парсер_IKEA/main.py Normal file

@@ -0,0 +1,200 @@
import requests
import json
import os
import html
from bs4 import BeautifulSoup
from openpyxl import Workbook

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_FILE = os.path.join(BASE_DIR, "links.txt")
OUTPUT_FILE = os.path.join(BASE_DIR, "result.xlsx")

CSS_SELECTOR = ".pip-product__subgrid.product-pip.js-product-pip"
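# The selector above is assumed to match the product-page container whose
# data-hydration-props attribute holds the HTML-escaped JSON payload that
# extract_data() decodes below.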
BLOCKS = [
    "buyModule",
    "productSummary",
    "pipPricePackage",
    "productInformationSection",
    "keyFacts",
    "stockcheckSection",
    "availabilityGroup",
    "productGallery"
]
# ── which columns to keep ──────────────────────────────────────────
KEEP_COLUMNS = [
    "availabilityGroup.serverOnlineSellable",
    "availabilityGroup.storeHeader",
    "buyModule.onlineSellable",
    "buyModule.productName",
    "buyModule.productPrice",
    "buyModule.productType",
    "keyFacts.ariaLabels",
    "keyFacts.gaLabel",
    "keyFacts.keyFacts",
    "pipPricePackage.measurementText",
    "pipPricePackage.productDescription",
    "productGallery.urls",
    "productInformationSection.dimensionProps",
    "productInformationSection.productDetailsProps",
    "productSummary.description",
    "productSummary.visibleItemNo",
    "stockcheckSection.packagingProps",
    "stockcheckSection.typeName",
    "url",
    "categoryBreadcrumb",
]
def flatten_block(block_name, data):
    if not isinstance(data, dict):
        return {}
    flat = {}
    for k, v in data.items():
        '''
        # === 1. dimensionProps.images (disabled) ===
        if block_name == "productInformationSection" and k == "dimensionProps":
            if isinstance(v, dict):
                urls = []
                for img in v.get("images", []):
                    if isinstance(img, dict):
                        url = img.get("url")
                        if url:
                            urls.append(url)
                flat[f"{block_name}.{k}.images_urls"] = "\n".join(urls)
            continue
        '''
        # === 2. mediaList.content.url → productGallery.urls ===
        if block_name == "productGallery" and k == "mediaList":
            if isinstance(v, list):
                urls = []
                for item in v:
                    content = item.get("content", {}) if isinstance(item, dict) else {}
                    if isinstance(content, dict) and "url" in content:
                        urls.append(content["url"])
                flat["productGallery.urls"] = "\n".join(urls)
                return flat  # ⬅ return only the urls; ignore the remaining fields
            continue
        # === All other fields: kept as-is ===
        key = f"{block_name}.{k}"
        flat[key] = v
    return flat
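# For illustration, with an assumed input shape, flatten_block prefixes each
# key with its block name:
#   flatten_block("buyModule", {"productName": "INDIRA", "onlineSellable": True})
#   → {"buyModule.productName": "INDIRA", "buyModule.onlineSellable": True}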
def extract_data(url):
    """
    Returns a dict with the required fields for an IKEA product.
    + NEW: adds a 'categoryBreadcrumb' key of the form
      'Produkty/Tekstylia/Tekstylia do sypialni/Narzuty na łóżko'
      (taken from the JSON-LD BreadcrumbList).
    """
    try:
        response = requests.get(url, timeout=10,
                                headers={"User-Agent": "Mozilla/5.0"})
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # ── main JSON from data-hydration-props ────────────────────
        target = soup.select_one(CSS_SELECTOR)
        if not target:
            return {"url": url, "error": "CSS selector not found"}
        raw = target.get("data-hydration-props")
        if not raw:
            return {"url": url, "error": "data-hydration-props not found"}
        decoded = html.unescape(raw)
        full_json = json.loads(decoded)

        result = {"url": url}
        # pull out the blocks we need
        for block in BLOCKS:
            result.update(flatten_block(block, full_json.get(block, {})))

        # ── NEW: extract BreadcrumbList → categoryBreadcrumb ──────
        breadcrumb = None
        for tag in soup.find_all("script",
                                 attrs={"type": lambda t: t and "ld+json" in t}):
            try:
                data = json.loads(tag.string)
            except Exception:
                continue
            # if this is a JSON-LD array, look for the BreadcrumbList object in it
            if isinstance(data, list):
                data = next((d for d in data
                             if isinstance(d, dict)
                             and d.get("@type") == "BreadcrumbList"), None)
            if isinstance(data, dict) and data.get("@type") == "BreadcrumbList":
                items = data.get("itemListElement", [])
                names = [it.get("name", "") for it in items if isinstance(it, dict)]
                breadcrumb = "/".join(names)
                break  # found the block we need; exit the loop
        if breadcrumb:
            result["categoryBreadcrumb"] = breadcrumb
        return result
    except Exception as e:
        return {"url": url, "error": str(e)}
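# For reference, the JSON-LD shape the breadcrumb step expects (field names
# per schema.org; the values below are illustrative, not from a real page):
#   {"@type": "BreadcrumbList",
#    "itemListElement": [{"name": "Produkty"}, {"name": "Tekstylia"}]}
# would yield result["categoryBreadcrumb"] == "Produkty/Tekstylia".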
def main():
    # ── read the links ───────────────────────────────────────────
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        links = [line.strip() for line in f if line.strip()]
    rows = []

    # ---- COLUMN MODE -------------------------------------------
    # NEW: fixed list of columns (see KEEP_COLUMNS at the top)
    all_columns = KEEP_COLUMNS
    # OLD (restore if needed):
    # all_columns = set()  # ← accumulated every field
    # ------------------------------------------------------------

    print("🔍 Extracting data...")
    for idx, link in enumerate(links, 1):
        print(f"[{idx}/{len(links)}] {link}")
        row = extract_data(link)
        # NEW: keep only the whitelisted fields from KEEP_COLUMNS
        row = {k: v for k, v in row.items() if k in KEEP_COLUMNS}
        # OLD (restore if needed):
        # all_columns.update(row.keys())  # ← collected every key
        rows.append(row)

    # OLD (restore if needed):
    # if isinstance(all_columns, set):
    #     all_columns = sorted(all_columns)  # sorted everything

    def safe(val):
        """Converts dict / list to a JSON string; None → ''."""
        if isinstance(val, (dict, list)):
            return json.dumps(val, ensure_ascii=False)
        return "" if val is None else val

    print("📤 Saving the Excel file...")
    wb = Workbook()
    ws = wb.active
    ws.title = "IKEA Products"
    ws.append(all_columns)
    for row in rows:
        ws.append([safe(row.get(col, "")) for col in all_columns])
    wb.save(OUTPUT_FILE)
    print(f"\n✅ Done: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
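For orientation, a minimal standalone sketch of the payload-decoding step the parser depends on; the data-hydration-props value below is invented for illustration and is not real IKEA markup:

import html
import json

# Toy HTML-escaped JSON, standing in for a data-hydration-props attribute value.
raw = "{&quot;buyModule&quot;: {&quot;productName&quot;: &quot;INDIRA&quot;}}"
payload = json.loads(html.unescape(raw))
assert payload["buyModule"]["productName"] == "INDIRA"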

Binary file not shown.

Binary file not shown.