IKEA_сбор данных по ссылке на товар в текстовом файле.
This commit is contained in:
parent
20bd54dd3c
commit
790abe8b95
1
Парсер_IKEA/links.txt
Normal file
1
Парсер_IKEA/links.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
https://www.ikea.com/pl/pl/p/indira-narzuta-zoltobezowy-20582629/#content
|
||||||
200
Парсер_IKEA/main.py
Normal file
200
Парсер_IKEA/main.py
Normal file
@ -0,0 +1,200 @@
|
|||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import html
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from openpyxl import Workbook
|
||||||
|
|
||||||
|
# Resolve all paths relative to this script so it runs from any CWD.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_FILE = os.path.join(BASE_DIR, "links.txt")    # one product URL per line
OUTPUT_FILE = os.path.join(BASE_DIR, "result.xlsx")  # Excel report written by main()

# CSS selector of the product container carrying the hydration JSON payload.
CSS_SELECTOR = ".pip-product__subgrid.product-pip.js-product-pip"
# Top-level keys of the hydration JSON that get flattened into columns.
BLOCKS = [
    "buyModule",
    "productSummary",
    "pipPricePackage",
    "productInformationSection",
    "keyFacts",
    "stockcheckSection",
    "availabilityGroup",
    "productGallery"
]
# ── which columns are kept in the final spreadsheet ─────────────────
KEEP_COLUMNS = [
    "availabilityGroup.serverOnlineSellable",
    "availabilityGroup.storeHeader",
    "buyModule.onlineSellable",
    "buyModule.productName",
    "buyModule.productPrice",
    "buyModule.productType",
    "keyFacts.ariaLabels",
    "keyFacts.gaLabel",
    "keyFacts.keyFacts",
    "pipPricePackage.measurementText",
    "pipPricePackage.productDescription",
    "productGallery.urls",
    "productInformationSection.dimensionProps",
    "productInformationSection.productDetailsProps",
    "productSummary.description",
    "productSummary.visibleItemNo",
    "stockcheckSection.packagingProps",
    "stockcheckSection.typeName",
    "url",
    "categoryBreadcrumb",
]
|
||||||
|
|
||||||
|
def flatten_block(block_name, data):
    """Flatten one hydration-JSON block into ``{"<block>.<key>": value}`` pairs.

    Args:
        block_name: Name of the top-level block (e.g. ``"buyModule"``).
        data: The block's payload; anything other than a dict yields ``{}``.

    Returns:
        A flat dict keyed ``"<block_name>.<key>"``.  Special case: for the
        ``"productGallery"`` block, ``mediaList[*].content.url`` is collapsed
        into a single ``"productGallery.urls"`` key (newline-joined URLs) and
        every other gallery field is discarded.
    """
    if not isinstance(data, dict):
        return {}

    flat = {}
    for key, value in data.items():
        # === mediaList.content.url → productGallery.urls ===
        if block_name == "productGallery" and key == "mediaList":
            if isinstance(value, list):
                urls = []
                for item in value:
                    if not isinstance(item, dict):  # skip malformed entries
                        continue
                    content = item.get("content", {})
                    if isinstance(content, dict) and "url" in content:
                        urls.append(content["url"])
                # Only the URLs matter for the gallery; drop everything else.
                return {"productGallery.urls": "\n".join(urls)}
            continue

        # === all remaining fields — default handling ===
        flat[f"{block_name}.{key}"] = value

    return flat
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def extract_data(url):
    """Fetch an IKEA product page and return a dict of selected fields.

    The product data is read from the ``data-hydration-props`` attribute of
    the main product container (located via ``CSS_SELECTOR``); the blocks
    listed in ``BLOCKS`` are flattened through ``flatten_block()``.
    Additionally the page's JSON-LD ``BreadcrumbList`` is joined into a
    ``'categoryBreadcrumb'`` key of the form
    ``'Produkty/Tekstylia/Tekstylia do sypialni/Narzuty na łóżko'``.

    Never raises: on any failure a dict ``{"url": url, "error": <msg>}``
    is returned so one bad link cannot abort the whole run.
    """
    try:
        response = requests.get(url, timeout=10,
                                headers={"User-Agent": "Mozilla/5.0"})
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # ── main JSON from data-hydration-props ────────────────────
        target = soup.select_one(CSS_SELECTOR)
        if not target:
            return {"url": url, "error": "CSS selector not found"}

        raw = target.get("data-hydration-props")
        if not raw:
            return {"url": url, "error": "data-hydration-props not found"}

        # Attribute value is HTML-escaped JSON.
        full_json = json.loads(html.unescape(raw))
        result = {"url": url}

        # Pull out the blocks we care about.
        for block in BLOCKS:
            result.update(flatten_block(block, full_json.get(block, {})))

        # ── JSON-LD BreadcrumbList → categoryBreadcrumb ────────────
        breadcrumb = None
        for tag in soup.find_all("script",
                                 attrs={"type": lambda t: t and "ld+json" in t}):
            if not tag.string:  # empty <script> — nothing to parse
                continue
            try:
                data = json.loads(tag.string)
            except Exception:
                continue  # malformed JSON-LD: skip, don't abort

            # JSON-LD may be an array; pick the BreadcrumbList object.
            if isinstance(data, list):
                data = next((d for d in data if isinstance(d, dict)
                             and d.get("@type") == "BreadcrumbList"), None)

            if isinstance(data, dict) and data.get("@type") == "BreadcrumbList":
                items = data.get("itemListElement", [])
                names = [it.get("name", "") for it in items
                         if isinstance(it, dict)]
                breadcrumb = "/".join(names)
                break  # found the block we need — stop scanning

        if breadcrumb:
            result["categoryBreadcrumb"] = breadcrumb

        return result

    except Exception as e:
        # Network / HTTP / parsing errors are reported per-URL.
        return {"url": url, "error": str(e)}
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Read product URLs from INPUT_FILE, scrape each one, write OUTPUT_FILE.

    Columns are fixed by KEEP_COLUMNS (see top of file); any field a page
    yields outside that whitelist is dropped before writing the workbook.
    """
    # ── read the links ───────────────────────────────────────────
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        links = [line.strip() for line in f if line.strip()]

    rows = []

    # Fixed column layout (see KEEP_COLUMNS at the top of the file).
    all_columns = KEEP_COLUMNS
    keep = set(KEEP_COLUMNS)  # O(1) membership test inside the loop

    print("🔍 Извлечение данных...")
    for idx, link in enumerate(links, 1):
        print(f"[{idx}/{len(links)}] {link}")
        row = extract_data(link)

        # Keep only the whitelisted fields.
        rows.append({k: v for k, v in row.items() if k in keep})

    def safe(val):
        """Serialize dict/list cells to JSON text, None → '' (Excel cells
        cannot hold Python containers)."""
        if isinstance(val, (dict, list)):
            return json.dumps(val, ensure_ascii=False)
        return "" if val is None else val

    print("📤 Сохраняем Excel...")
    wb = Workbook()
    ws = wb.active
    ws.title = "IKEA Products"
    ws.append(all_columns)  # header row

    for row in rows:
        ws.append([safe(row.get(col, "")) for col in all_columns])

    wb.save(OUTPUT_FILE)
    print(f"\n✅ Готово: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
|
||||||
BIN
Парсер_IKEA/result.xlsx
Normal file
BIN
Парсер_IKEA/result.xlsx
Normal file
Binary file not shown.
BIN
Парсер_IKEA/~$result.xlsx
Normal file
BIN
Парсер_IKEA/~$result.xlsx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user