#!/usr/bin/env python3
# ikea_collect_product_data.py — one POST per category, static payload
"""Collect flattened IKEA product data for every leaf category.

Reads category URLs from ``leaf_categories.txt``, issues a single POST
per category against the IKEA search API, flattens each product record
into dotted-key/value pairs, and writes:

* ``ikea_products_flat.xlsx`` — one row per product, union of all keys;
* ``product_links.txt``       — absolute product-page (PIP) URLs;
* ``log_all_CatProd.txt``     — request/response log (truncated on start).
"""
import json, re, sys, pathlib, datetime, requests, pandas as pd
from urllib.parse import urljoin
from openpyxl import Workbook

# ────────────────────────── files ────────────────────────────
BASE_DIR = pathlib.Path(__file__).resolve().parent
CAT_FILE = BASE_DIR / "leaf_categories.txt"
LOG_FILE = BASE_DIR / "log_all_CatProd.txt"
OUT_EXCEL = BASE_DIR / "ikea_products_flat.xlsx"
OUT_LINKS = BASE_DIR / "product_links.txt"

# ─────────────────────── API constants ───────────────────────
BASE_DOMAIN = "https://www.ikea.com"
SEARCH_URL = "https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507"
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Content-Type": "application/json",
}
# ─────────────────────────────────────────────────────────────


def log(txt: str) -> None:
    """Append *txt* to LOG_FILE with a ``[YYYY-mm-dd HH:MM:SS]`` prefix."""
    ts = datetime.datetime.now().strftime("[%Y-%m-%d %H:%M:%S] ")
    with LOG_FILE.open("a", encoding="utf-8") as f:
        f.write(ts + txt + "\n")


def flatten(node, prefix: str = "", out: dict | None = None) -> dict:
    """Flatten nested dicts/lists into a single dict with dotted keys.

    ``{"a": {"b": 1}, "c": [2]}`` → ``{"a.b": 1, "c.0": 2}``.
    Scalars are stored under the accumulated prefix (trailing dot removed).
    """
    if out is None:  # fresh accumulator per top-level call (no mutable default)
        out = {}
    if isinstance(node, dict):
        for k, v in node.items():
            flatten(v, f"{prefix}{k}.", out)
    elif isinstance(node, list):
        for i, v in enumerate(node):
            flatten(v, f"{prefix}{i}.", out)
    else:
        out[prefix[:-1]] = node  # strip the trailing "."
    return out


def fetch_category_json(category_id: str) -> dict:
    """POST the static search payload for *category_id* and return the JSON.

    Logs the payload, HTTP status and full response body to LOG_FILE.
    Raises ``requests.HTTPError`` on a non-2xx status and
    ``requests.exceptions.RequestException`` on network failure/timeout.
    """
    payload = {
        "searchParameters": {"input": category_id, "type": "CATEGORY"},
        "zip": "05-090",
        "store": "188",
        "isUserLoggedIn": False,
        "optimizely": {
            "listing_3547_filter_hnf_sticky": None,
            "listing_3332_collapsed_filter_bar": None,
            "discount_percentage": None,
            "listing_3790_simplify_rating_stars": None,
        },
        "optimizelyAttributes": {
            "market": "pl",
            "device": "desktop",
            "deviceVendor": "Apple",
            "deviceType": "desktop",
            "isLoggedIn": False,
            "environment": "prod",
            "browser": "Chrome",
            "os": "Mac OS",
            "language": "pl",
            "feedMarket": "pl-PL",
            "locale": "pl-PL",
            "customerType": "guest",
            "isEntranceVisit": False,
            "pip_to_pip_src": "",
        },
        "components": [{
            "component": "PRIMARY_AREA",
            "columns": 4,
            "types": {
                "main": "PRODUCT",
                "breakouts": ["PLANNER", "LOGIN_REMINDER", "MATTRESS_WARRANTY"],
            },
            "filterConfig": {"max-num-filters": 6},
            # window.size=1000 grabs the whole category in one request
            "window": {"size": 1000, "offset": 0},
            "forceFilterCalculation": True,
        }],
    }
    log("POST payload: " + json.dumps(payload, ensure_ascii=False))
    # json= serializes the payload and matches the Content-Type header
    r = requests.post(SEARCH_URL, headers=HEADERS, json=payload, timeout=20)
    log(f"→ status {r.status_code}")
    log("response: " + r.text)
    r.raise_for_status()
    return r.json()


def rows_from_category(cat_url: str) -> list[dict]:
    """Return one flattened row per product found in *cat_url*.

    The numeric category ID is taken from the trailing ``-<digits>`` of
    the URL; if absent, a warning is printed and ``[]`` is returned.
    Each row gets an absolute ``pipUrl`` when the product provides one.
    """
    m = re.search(r'-([0-9]+)/?$', cat_url.rstrip("/"))
    if not m:
        print(" ⚠️ нет ID категории")
        return []
    data = fetch_category_json(m.group(1))

    rows: list[dict] = []
    for res in data.get("results", []):
        for itm in res.get("items", []):
            prod = itm.get("product")
            if not prod:
                continue
            row = flatten(prod)
            # Only set pipUrl when the product actually has one; otherwise
            # urljoin would yield the bare base domain and a bogus entry
            # would end up in product_links.txt.
            pip = prod.get("pipUrl")
            if pip:
                row["pipUrl"] = urljoin(BASE_DOMAIN, pip)
            rows.append(row)
    print(f" товаров: {len(rows)}")
    return rows


# ─────────────────────────── main ────────────────────────────
def main() -> None:
    """Process every category in CAT_FILE, then write the Excel + links files."""
    LOG_FILE.write_text("", encoding="utf-8")  # truncate log from previous run
    if not CAT_FILE.exists():
        sys.exit("✖ leaf_categories.txt не найден")

    cats = [u.strip()
            for u in CAT_FILE.read_text(encoding="utf-8").splitlines()
            if u.strip()]

    all_rows: list[dict] = []
    all_cols: set[str] = set()  # union of keys across all products
    for idx, url in enumerate(cats, 1):
        print(f"[{idx}/{len(cats)}] {url}")
        try:
            rows = rows_from_category(url)
            all_rows.extend(rows)
            for r in rows:
                all_cols.update(r.keys())
        # Per-category boundary: log and continue with the next category
        except Exception as e:
            print(" ⚠️", e)
            log(f"ERROR on {url}: {e}")

    if not all_rows:
        print("┐(´‿`)┌ пусто")
        return

    cols = sorted(all_cols)  # deterministic column order
    wb = Workbook()
    ws = wb.active
    ws.title = "IKEA raw"
    ws.append(cols)
    for r in all_rows:
        ws.append([r.get(c, "") for c in cols])
    wb.save(OUT_EXCEL)
    print("📊", OUT_EXCEL.name, "создан")

    OUT_LINKS.write_text(
        "\n".join(r["pipUrl"] for r in all_rows if r.get("pipUrl")),
        encoding="utf-8"
    )
    print("🔗", OUT_LINKS.name, "создан")


if __name__ == "__main__":
    main()