#!/usr/bin/env python3
# ikea_collect_product_data.py — single POST per category, static payload
|
|
|
|
import json, re, sys, pathlib, datetime, requests, pandas as pd
|
|
from urllib.parse import urljoin
|
|
from openpyxl import Workbook
|
|
|
|
# ────────────────────────── файлы ────────────────────────────
|
|
# Every input/output file lives next to this script.
BASE_DIR = pathlib.Path(__file__).resolve().parent
CAT_FILE = BASE_DIR.joinpath("leaf_categories.txt")  # input: category URLs, one per line

LOG_FILE = BASE_DIR.joinpath("log_all_CatProd.txt")  # request/response log
OUT_EXCEL = BASE_DIR.joinpath("ikea_products_flat.xlsx")  # flattened product table
OUT_LINKS = BASE_DIR.joinpath("product_links.txt")  # product page URLs, one per line

# ─────────────────────── API constants ───────────────────────
BASE_DOMAIN = "https://www.ikea.com"
SEARCH_URL = "https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507"

# Headers sent with every search request; the body is posted as JSON text.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Content-Type": "application/json",
}
|
|
# ─────────────────────────────────────────────────────────────
|
|
|
|
def log(txt: str):
    """Append *txt* to ``LOG_FILE`` as one line prefixed with a local timestamp.

    The prefix has the form ``[YYYY-MM-DD HH:MM:SS] ``. The file is opened in
    append mode for every call and closed again before returning.
    """
    stamp = datetime.datetime.now().strftime("[%Y-%m-%d %H:%M:%S] ")
    line = stamp + txt + "\n"
    with LOG_FILE.open("a", encoding="utf-8") as handle:
        handle.write(line)
|
|
|
|
def flatten(node, prefix="", out=None):
    """Flatten nested dicts/lists into a single dict keyed by dotted paths.

    Dict keys and list indices are joined with ``.`` (for example
    ``{"a": [{"b": 1}]}`` becomes ``{"a.0.b": 1}``). Any value that is neither
    a dict nor a list is stored as a leaf. Empty dicts and lists contribute no
    keys; a non-container passed at the top level is stored under ``""``.

    Args:
        node: The value to flatten.
        prefix: Path accumulated so far, including its trailing dot.
        out: Dict to write into; a fresh one is created when ``None``.

    Returns:
        The dict that received the flattened entries (``out`` when given).
    """
    result = {} if out is None else out

    if isinstance(node, dict):
        children = node.items()
    elif isinstance(node, list):
        children = enumerate(node)
    else:
        # Leaf: drop the trailing dot that the parent appended to the path.
        result[prefix[:-1]] = node
        return result

    for key, child in children:
        flatten(child, f"{prefix}{key}.", result)
    return result
|
|
|
|
def fetch_category_json(category_id: str) -> dict:
    """POST the static listing query for *category_id* and return the parsed JSON.

    Only ``searchParameters.input`` varies between calls; every other field of
    the payload is fixed. The payload, the HTTP status and the full response
    text are written to the log before the status is checked, so a failing
    response is still recorded.

    Args:
        category_id: Category identifier sent as the search input.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` for an error status.
    """
    optimizely_flags = {
        "listing_3547_filter_hnf_sticky": None,
        "listing_3332_collapsed_filter_bar": None,
        "discount_percentage": None,
        "listing_3790_simplify_rating_stars": None,
    }
    optimizely_attributes = {
        "market": "pl",
        "device": "desktop",
        "deviceVendor": "Apple",
        "deviceType": "desktop",
        "isLoggedIn": False,
        "environment": "prod",
        "browser": "Chrome",
        "os": "Mac OS",
        "language": "pl",
        "feedMarket": "pl-PL",
        "locale": "pl-PL",
        "customerType": "guest",
        "isEntranceVisit": False,
        "pip_to_pip_src": "",
    }
    primary_area = {
        "component": "PRIMARY_AREA",
        "columns": 4,
        "types": {
            "main": "PRODUCT",
            "breakouts": ["PLANNER", "LOGIN_REMINDER", "MATTRESS_WARRANTY"],
        },
        "filterConfig": {"max-num-filters": 6},
        # One window of size 1000 starting at offset 0; no further pages are
        # requested (presumably this caps a category at 1000 items — confirm).
        "window": {"size": 1000, "offset": 0},
        "forceFilterCalculation": True,
    }
    # Key order is kept as in the original literal because the serialized
    # text is both logged and sent.
    payload = {
        "searchParameters": {"input": category_id, "type": "CATEGORY"},
        "zip": "05-090",
        "store": "188",
        "isUserLoggedIn": False,
        "optimizely": optimizely_flags,
        "optimizelyAttributes": optimizely_attributes,
        "components": [primary_area],
    }

    log("POST payload: " + json.dumps(payload, ensure_ascii=False))
    body = json.dumps(payload)
    response = requests.post(SEARCH_URL, headers=HEADERS, data=body, timeout=20)
    log(f"→ status {response.status_code}")
    log("response: " + response.text)
    response.raise_for_status()
    return response.json()
|
|
|
|
def rows_from_category(cat_url: str) -> list[dict]:
    """Return one flattened row per product listed under the category *cat_url*.

    The category ID is the run of digits after the last ``-`` at the end of
    the URL (a trailing slash is ignored). When the URL has no such ID a
    warning is printed and an empty list is returned without any request.

    Each row is ``flatten(product)``. A non-empty ``pipUrl`` is made absolute
    against ``BASE_DOMAIN``. A missing or empty ``pipUrl`` is left untouched:
    joining it would yield the bare domain, which would then be exported as if
    it were a product link.

    Args:
        cat_url: Category page URL ending in ``-<digits>``.

    Returns:
        A list of flat dicts, one per item that carries a ``product`` entry.

    Raises:
        Exception: Whatever ``fetch_category_json`` raises (HTTP or JSON
            errors) propagates to the caller.
    """
    m = re.search(r'-([0-9]+)/?$', cat_url.rstrip("/"))
    if m is None:
        print(" ⚠️ нет ID категории")
        return []

    data = fetch_category_json(m.group(1))
    rows = []

    # ``or []`` also covers keys that are present but explicitly null.
    for res in data.get("results") or []:
        for itm in res.get("items") or []:
            prod = itm.get("product")
            if not prod:
                continue
            row = flatten(prod)
            pip_url = prod.get("pipUrl")
            if pip_url:
                row["pipUrl"] = urljoin(BASE_DOMAIN, pip_url)
            rows.append(row)

    print(f" товаров: {len(rows)}")
    return rows
|
|
|
|
# ─────────────────────────── main ────────────────────────────
|
|
def main():
    """Collect products for every listed category and write the output files.

    Steps:
      1. Truncate the log file.
      2. Read the non-blank lines of ``CAT_FILE`` as category URLs; exit with
         a message if the file does not exist.
      3. Gather rows per category; a failure in one category is printed and
         logged, and processing continues with the next one.
      4. Write all rows to ``OUT_EXCEL`` (columns are the sorted union of all
         row keys, missing cells are empty strings) and the non-empty
         ``pipUrl`` values to ``OUT_LINKS``.

    Nothing is written when no rows were collected.
    """
    LOG_FILE.write_text("", encoding="utf-8")  # start every run with an empty log

    if not CAT_FILE.exists():
        sys.exit("✖ leaf_categories.txt не найден")

    raw_lines = CAT_FILE.read_text(encoding="utf-8").splitlines()
    cats = [line.strip() for line in raw_lines if line.strip()]
    total = len(cats)

    all_rows = []
    for idx, url in enumerate(cats, 1):
        print(f"[{idx}/{total}] {url}")
        try:
            all_rows.extend(rows_from_category(url))
        except Exception as e:
            # Best effort: report the failing category and keep going.
            print(" ⚠️", e)
            log(f"ERROR on {url}: {e}")

    if not all_rows:
        print("┐(´‿`)┌ пусто")
        return

    # Rows may have different keys, so the header is the union of all of them.
    header = sorted({key for row in all_rows for key in row})

    wb = Workbook()
    ws = wb.active
    ws.title = "IKEA raw"
    ws.append(header)
    for row in all_rows:
        ws.append([row.get(col, "") for col in header])
    wb.save(OUT_EXCEL)
    print("📊", OUT_EXCEL.name, "создан")

    links = [row["pipUrl"] for row in all_rows if row.get("pipUrl")]
    OUT_LINKS.write_text("\n".join(links), encoding="utf-8")
    print("🔗", OUT_LINKS.name, "создан")
|
|
|
|
# Run the collector only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|