MacOS_Parsers/Парсер_IKEA/ikea_collect_product_linksANDinfo.py

158 lines
5.3 KiB
Python

#!/usr/bin/env python3
# ikea_collect_product_data.py — один POST, статичный payload
import json, re, sys, pathlib, datetime, requests, pandas as pd
from urllib.parse import urljoin
from openpyxl import Workbook
# ───────────────────── input/output file locations ──────────────────────
# All files live next to this script.
BASE_DIR = pathlib.Path(__file__).resolve().parent
CAT_FILE = BASE_DIR / "leaf_categories.txt"       # input: one category URL per line
LOG_FILE = BASE_DIR / "log_all_CatProd.txt"       # request/response log, truncated each run
OUT_EXCEL = BASE_DIR / "ikea_products_flat.xlsx"  # output: flattened product table
OUT_LINKS = BASE_DIR / "product_links.txt"        # output: one product page URL per line
# ─────────────────────── IKEA search API constants ───────────────────────
BASE_DOMAIN = "https://www.ikea.com"
# Search endpoint for the Polish market (pl/pl); v= presumably an API version date — TODO confirm.
SEARCH_URL = "https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507"
HEADERS = {
"User-Agent": "Mozilla/5.0",
"Content-Type": "application/json",
}
# ─────────────────────────────────────────────────────────────
def log(txt: str):
    """Append *txt* to the shared log file, prefixed with a timestamp."""
    stamp = datetime.datetime.now().strftime("[%Y-%m-%d %H:%M:%S] ")
    with LOG_FILE.open("a", encoding="utf-8") as fh:
        fh.write(f"{stamp}{txt}\n")
def flatten(node, prefix="", out=None):
    """Flatten nested dicts/lists into one dict with dot-joined keys.

    Dict keys and list indices are concatenated with ``.``; leaf values are
    stored under the accumulated path. Returns the accumulator dict.
    """
    acc = {} if out is None else out
    if isinstance(node, dict):
        children = ((key, child) for key, child in node.items())
    elif isinstance(node, list):
        children = ((idx, child) for idx, child in enumerate(node))
    else:
        # Leaf: drop the trailing "." from the accumulated prefix.
        acc[prefix[:-1]] = node
        return acc
    for label, child in children:
        flatten(child, f"{prefix}{label}.", acc)
    return acc
def fetch_category_json(category_id: str) -> dict:
    """POST one category search request to the IKEA API and return the JSON body.

    Args:
        category_id: numeric IKEA category id (digits extracted from the URL).

    Returns:
        The decoded JSON response as a dict.

    Raises:
        requests.HTTPError: on a non-2xx response (logged before raising).
        requests.RequestException: on network/timeout failure.
    """
    payload = {
        "searchParameters": {"input": category_id, "type": "CATEGORY"},
        # Store/delivery context the endpoint expects (Polish market).
        "zip": "05-090",
        "store": "188",
        "isUserLoggedIn": False,
        # A/B-test switches mirrored from the web client; None = not enrolled.
        "optimizely": {
            "listing_3547_filter_hnf_sticky": None,
            "listing_3332_collapsed_filter_bar": None,
            "discount_percentage": None,
            "listing_3790_simplify_rating_stars": None,
        },
        "optimizelyAttributes": {
            "market": "pl",
            "device": "desktop",
            "deviceVendor": "Apple",
            "deviceType": "desktop",
            "isLoggedIn": False,
            "environment": "prod",
            "browser": "Chrome",
            "os": "Mac OS",
            "language": "pl",
            "feedMarket": "pl-PL",
            "locale": "pl-PL",
            "customerType": "guest",
            "isEntranceVisit": False,
            "pip_to_pip_src": "",
        },
        "components": [{
            "component": "PRIMARY_AREA",
            "columns": 4,
            "types": {
                "main": "PRODUCT",
                "breakouts": ["PLANNER", "LOGIN_REMINDER", "MATTRESS_WARRANTY"],
            },
            "filterConfig": {"max-num-filters": 6},
            # Large window so one request covers the whole category (no paging).
            "window": {"size": 1000, "offset": 0},
            "forceFilterCalculation": True,
        }],
    }
    log("POST payload: " + json.dumps(payload, ensure_ascii=False))
    # Idiom fix: `json=` lets requests serialize the body and set the JSON
    # Content-Type itself, replacing the manual data=json.dumps(...) call.
    # The payload is pure ASCII, so the bytes on the wire are unchanged.
    r = requests.post(SEARCH_URL, headers=HEADERS, json=payload, timeout=20)
    log(f"→ status {r.status_code}")
    log("response: " + r.text)
    r.raise_for_status()
    return r.json()
def rows_from_category(cat_url: str) -> list[dict]:
    """Fetch one category listing and return its products as flat row dicts.

    Extracts the numeric category id from *cat_url*; returns [] (with a
    console warning) when the URL carries no id. Each product dict is
    flattened and gets an absolute ``pipUrl``.
    """
    found = re.search(r'-([0-9]+)/?$', cat_url.rstrip("/"))
    if found is None:
        print(" ⚠️ нет ID категории")
        return []
    data = fetch_category_json(found.group(1))
    # Walk every item of every result component; some items carry no product.
    candidates = (
        item.get("product")
        for result in data.get("results", [])
        for item in result.get("items", [])
    )
    rows = []
    for product in candidates:
        if not product:
            continue
        flat = flatten(product)
        flat["pipUrl"] = urljoin(BASE_DOMAIN, product.get("pipUrl", ""))
        rows.append(flat)
    print(f" товаров: {len(rows)}")
    return rows
# ─────────────────────────── main ────────────────────────────
def main():
    """Collect products for every category URL and write Excel + links files."""
    LOG_FILE.write_text("", encoding="utf-8")  # start each run with a fresh log
    if not CAT_FILE.exists():
        sys.exit("✖ leaf_categories.txt не найден")
    urls = [line.strip()
            for line in CAT_FILE.read_text(encoding="utf-8").splitlines()
            if line.strip()]
    collected: list[dict] = []
    columns: set = set()
    for number, url in enumerate(urls, 1):
        print(f"[{number}/{len(urls)}] {url}")
        try:
            batch = rows_from_category(url)
            collected.extend(batch)
            for row in batch:
                columns.update(row.keys())
        except Exception as exc:
            # Best effort: keep going on per-category failures, but record them.
            print(" ⚠️", exc)
            log(f"ERROR on {url}: {exc}")
    if not collected:
        print("┐(´‿`)┌ пусто")
        return
    header = sorted(columns)
    wb = Workbook()
    sheet = wb.active
    sheet.title = "IKEA raw"
    sheet.append(header)
    for row in collected:
        sheet.append([row.get(col, "") for col in header])
    wb.save(OUT_EXCEL)
    print("📊", OUT_EXCEL.name, "создан")
    OUT_LINKS.write_text(
        "\n".join(row["pipUrl"] for row in collected if row.get("pipUrl")),
        encoding="utf-8",
    )
    print("🔗", OUT_LINKS.name, "создан")


if __name__ == "__main__":
    main()