"""Scrape IKEA product pages into an Excel workbook.

The script reads product URLs from ``links.txt`` (one URL per line, located
next to this file), downloads every page, extracts selected blocks from the
JSON stored in the page's ``data-hydration-props`` attribute together with the
category breadcrumb from JSON-LD, and writes one row per product to
``result.xlsx``.

Example ``links.txt`` line::

    https://www.ikea.com/pl/pl/p/indira-narzuta-zoltobezowy-20582629/#content
"""

import html
import json
import os

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_FILE = os.path.join(BASE_DIR, "links.txt")
OUTPUT_FILE = os.path.join(BASE_DIR, "result.xlsx")

# Element whose ``data-hydration-props`` attribute carries the product JSON.
CSS_SELECTOR = ".pip-product__subgrid.product-pip.js-product-pip"

# Top-level keys of the hydration JSON that are flattened into the result.
BLOCKS = [
    "buyModule",
    "productSummary",
    "pipPricePackage",
    "productInformationSection",
    "keyFacts",
    "stockcheckSection",
    "availabilityGroup",
    "productGallery",
]

# Columns written to the workbook, in this order.
KEEP_COLUMNS = [
    "availabilityGroup.serverOnlineSellable",
    "availabilityGroup.storeHeader",
    "buyModule.onlineSellable",
    "buyModule.productName",
    "buyModule.productPrice",
    "buyModule.productType",
    "keyFacts.ariaLabels",
    "keyFacts.gaLabel",
    "keyFacts.keyFacts",
    "pipPricePackage.measurementText",
    "pipPricePackage.productDescription",
    "productGallery.urls",
    "productInformationSection.dimensionProps",
    "productInformationSection.productDetailsProps",
    "productSummary.description",
    "productSummary.visibleItemNo",
    "stockcheckSection.packagingProps",
    "stockcheckSection.typeName",
    "url",
    "categoryBreadcrumb",
]


def _gallery_urls(media_list):
    """Return the ``content.url`` strings of well-formed ``mediaList`` items."""
    urls = []
    for item in media_list:
        content = item.get("content") if isinstance(item, dict) else None
        url = content.get("url") if isinstance(content, dict) else None
        # Only non-empty strings can be joined into the cell text later on.
        if isinstance(url, str) and url:
            urls.append(url)
    return urls


def flatten_block(block_name, data):
    """Flatten one hydration block into ``"<block_name>.<key>"`` entries.

    Args:
        block_name: Name of the block; used as the key prefix.
        data: The block's value from the hydration JSON.

    Returns:
        A dict mapping ``"<block_name>.<key>"`` to the untouched value of each
        key in ``data``; an empty dict when ``data`` is not a dict.  For the
        ``productGallery`` block with a list-valued ``mediaList`` the result
        is only ``{"productGallery.urls": "<url>\\n<url>..."}``.  A
        ``mediaList`` that is not a list is dropped.
    """
    if not isinstance(data, dict):
        return {}

    if block_name == "productGallery":
        media_list = data.get("mediaList")
        if isinstance(media_list, list):
            # The gallery contributes its image URLs only, regardless of the
            # order in which the keys appear in the JSON.
            return {"productGallery.urls": "\n".join(_gallery_urls(media_list))}

    return {
        f"{block_name}.{key}": value
        for key, value in data.items()
        if not (block_name == "productGallery" and key == "mediaList")
    }


def _find_breadcrumb_list(data):
    """Return the ``BreadcrumbList`` object inside parsed JSON-LD, or None."""
    # A JSON-LD script may hold either a single object or an array of them.
    candidates = data if isinstance(data, list) else [data]
    for candidate in candidates:
        if isinstance(candidate, dict) and candidate.get("@type") == "BreadcrumbList":
            return candidate
    return None


def _extract_breadcrumb(soup):
    """Return the ``/``-joined breadcrumb names from JSON-LD, or None."""
    scripts = soup.find_all(
        "script", attrs={"type": lambda t: t and "ld+json" in t}
    )
    for tag in scripts:
        # ``tag.string`` is None when the tag has no single text child.
        if not tag.string:
            continue
        try:
            data = json.loads(tag.string)
        except ValueError:
            # Malformed JSON-LD is not fatal; try the next script tag.
            continue

        breadcrumb_list = _find_breadcrumb_list(data)
        if breadcrumb_list is None:
            continue

        items = breadcrumb_list.get("itemListElement", [])
        if not isinstance(items, list):
            items = []
        names = [
            str(item.get("name") or "")
            for item in items
            if isinstance(item, dict)
        ]
        # The first BreadcrumbList found wins, even if it is empty.
        return "/".join(names)
    return None


def extract_data(url):
    """Download one IKEA product page and return its extracted fields.

    Args:
        url: Address of the product page.

    Returns:
        A dict that always contains ``"url"``.  On success it also holds the
        flattened entries of every block named in ``BLOCKS`` and, when a
        JSON-LD ``BreadcrumbList`` yields a non-empty path,
        ``"categoryBreadcrumb"`` (for example
        ``"Produkty/Tekstylia/Tekstylia do sypialni/Narzuty na łóżko"``).
        On failure it holds ``"error"`` with a description instead; the
        function does not raise.
    """
    try:
        response = requests.get(
            url, timeout=10, headers={"User-Agent": "Mozilla/5.0"}
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Main product JSON from the data-hydration-props attribute.
        target = soup.select_one(CSS_SELECTOR)
        if not target:
            return {"url": url, "error": "CSS selector not found"}

        raw = target.get("data-hydration-props")
        if not raw:
            return {"url": url, "error": "data-hydration-props not found"}

        full_json = json.loads(html.unescape(raw))
        if not isinstance(full_json, dict):
            return {
                "url": url,
                "error": "data-hydration-props is not a JSON object",
            }

        result = {"url": url}
        for block in BLOCKS:
            result.update(flatten_block(block, full_json.get(block, {})))

        breadcrumb = _extract_breadcrumb(soup)
        if breadcrumb:
            result["categoryBreadcrumb"] = breadcrumb

        return result

    except Exception as exc:
        # Per-URL boundary: one broken page must not abort the whole batch,
        # so every failure is reported through the "error" key.
        return {"url": url, "error": str(exc)}


def _safe_cell(value):
    """Convert a value to something a worksheet cell can hold.

    Dicts and lists become JSON strings, None becomes an empty string, and
    every other value is returned unchanged.
    """
    if isinstance(value, (dict, list)):
        return json.dumps(value, ensure_ascii=False)
    return "" if value is None else value


def main():
    """Read the links, scrape every product and save the workbook."""
    # "utf-8-sig" also accepts plain UTF-8 and strips a leading byte-order
    # mark that would otherwise end up inside the first URL.
    with open(INPUT_FILE, "r", encoding="utf-8-sig") as f:
        links = [line.strip() for line in f if line.strip()]

    # Copy so the module-level column list is never shared with the sheet.
    all_columns = list(KEEP_COLUMNS)
    rows = []

    print("🔍 Извлечение данных...")
    for idx, link in enumerate(links, 1):
        print(f"[{idx}/{len(links)}] {link}")
        row = extract_data(link)

        # The "error" key is not an output column, so report failures here;
        # otherwise they would vanish without a trace.
        if "error" in row:
            print(f"    ⚠ Ошибка: {row['error']}")

        # Keep only the configured columns.
        rows.append({col: row[col] for col in all_columns if col in row})

    print("📤 Сохраняем Excel...")
    wb = Workbook()
    ws = wb.active
    ws.title = "IKEA Products"
    ws.append(all_columns)

    for row in rows:
        ws.append([_safe_cell(row.get(col, "")) for col in all_columns])

    wb.save(OUTPUT_FILE)
    print(f"\n✅ Готово: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()