MacOS_Parsers/Парсер_IKEA/ikea_collect_product_linksAND-mininfo.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import requests
import datetime
import pathlib
import re
from openpyxl import Workbook

# ──────────────── PATHS ────────────────
BASE_DIR = pathlib.Path(__file__).resolve().parent
CAT_FILE = BASE_DIR / "leaf_categories.txt"  # list of IKEA category URLs, one per line
OUT_DIR = BASE_DIR / "json_raw"
OUT_DIR.mkdir(exist_ok=True)
LOG_FILE = BASE_DIR / "fetch_log.txt"
OUT_JSON = OUT_DIR / "flattened_products.json"
OUT_XLSX = OUT_DIR / "flattened_products.xlsx"

# ──────────────── API ────────────────
SEARCH_URL = "https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507"
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Content-Type": "application/json",
}

# ──────────────── HELPERS ────────────────
def log(msg: str):
    """Print a timestamped message and append it to the log file."""
    ts = datetime.datetime.now().strftime("[%Y-%m-%d %H:%M:%S] ")
    print(ts + msg)
    with LOG_FILE.open("a", encoding="utf-8") as f:
        f.write(ts + msg + "\n")

def fetch_category_json(category_id: str) -> dict:
    """Make a POST request to the IKEA search API and return the parsed JSON."""
    payload = {
        "searchParameters": {"input": category_id, "type": "CATEGORY"},
        "zip": "05-090",
        "store": "188",
        "isUserLoggedIn": False,
        "optimizely": {
            "listing_3547_filter_hnf_sticky": None,
            "listing_3332_collapsed_filter_bar": None,
            "discount_percentage": None,
            "listing_3790_simplify_rating_stars": None
        },
        "optimizelyAttributes": {
            "market": "pl",
            "device": "desktop",
            "deviceVendor": "Apple",
            "deviceType": "desktop",
            "isLoggedIn": False,
            "environment": "prod",
            "browser": "Chrome",
            "os": "Mac OS",
            "language": "pl",
            "feedMarket": "pl-PL",
            "locale": "pl-PL",
            "customerType": "guest",
            "isEntranceVisit": False,
            "pip_to_pip_src": ""
        },
        "components": [{
            "component": "PRIMARY_AREA",
            "columns": 4,
            "types": {
                "main": "PRODUCT",
                "breakouts": ["PLANNER", "LOGIN_REMINDER", "MATTRESS_WARRANTY"]
            },
            "filterConfig": {"max-num-filters": 6},
            # Fetch up to 1000 products per category in a single window
            "window": {"size": 1000, "offset": 0},
            "forceFilterCalculation": True
        }]
    }
    log(f"POST {SEARCH_URL} category_id={category_id}")
    r = requests.post(SEARCH_URL, headers=HEADERS, json=payload, timeout=30)
    log(f"→ Status: {r.status_code}")
    r.raise_for_status()
    return r.json()
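
# A minimal retry wrapper (a sketch, not part of the original script):
# transient network errors or 5xx responses are common during bulk
# scraping, so a few retries with exponential backoff make long runs
# sturdier. The attempt count and delays are illustrative assumptions.
def fetch_category_json_with_retries(category_id: str, attempts: int = 3) -> dict:
    import time  # kept local so the sketch stays self-contained
    for attempt in range(1, attempts + 1):
        try:
            return fetch_category_json(category_id)
        except requests.RequestException as e:
            log(f"⚠️ Attempt {attempt}/{attempts} for category {category_id} failed: {e}")
            if attempt == attempts:
                raise
            time.sleep(2 ** attempt)  # back off: 2 s, 4 s, ...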

def extract_products(data: dict) -> list[dict]:
    """Extract products and their variants from the IKEA API response."""
    products = []
    for result in data.get("results", []):
        for item in result.get("items", []):
            product = item.get("product")
            if not product:
                continue
            # Build the human-readable category path
            category_path = " / ".join(
                c.get("name", "") for c in product.get("categoryPath", [])
            )

            def extract_one(prod):
                # Up to two availability entries; guard against shorter lists
                av = prod.get("availability", [])
                av0_status = av[0].get("status") if len(av) > 0 else ""
                av1_status = av[1].get("status") if len(av) > 1 else ""
                av1_store = av[1].get("store") if len(av) > 1 else ""
                price = (
                    prod.get("salesPrice", {})
                    .get("current", {})
                    .get("wholeNumber", "")
                )
                return {
                    "id": prod.get("id") or prod.get("itemNoGlobal"),
                    "pipUrl": prod.get("pipUrl", ""),
                    "availability_0_status": av0_status,
                    "availability_1_status": av1_status,
                    "availability_1_store": av1_store,
                    "price": price,
                    "category_path": category_path,
                }

            # The main product itself
            products.append(extract_one(product))
            # Its variants, if any
            variants = product.get("gprDescription", {}).get("variants", [])
            for v in variants:
                products.append(extract_one(v))
    return products
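
# For reference, the response shape that extract_products() assumes,
# reconstructed from the lookups above (field values are illustrative):
#
# {
#   "results": [{
#     "items": [{
#       "product": {
#         "id": "...", "itemNoGlobal": "...", "pipUrl": "...",
#         "availability": [{"status": "..."}, {"status": "...", "store": "..."}],
#         "salesPrice": {"current": {"wholeNumber": "..."}},
#         "categoryPath": [{"name": "..."}],
#         "gprDescription": {"variants": [{ ...same fields as product... }]}
#       }
#     }]
#   }]
# }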

# ──────────────── MAIN ────────────────
def main():
    if not CAT_FILE.exists():
        print("✖ File leaf_categories.txt not found.")
        return
    categories = [
        line.strip()
        for line in CAT_FILE.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]
    if not categories:
        print("✖ No categories to process.")
        return

    all_products = []
    for idx, url in enumerate(categories, 1):
        log(f"[{idx}/{len(categories)}] {url}")
        # The category ID is the trailing run of digits in the URL
        m = re.search(r"-([0-9]+)/?$", url.rstrip("/"))
        if not m:
            log("⚠️ No category ID found in URL")
            continue
        cat_id = m.group(1)
        try:
            data = fetch_category_json(cat_id)
            items = extract_products(data)
            all_products.extend(items)
            log(f"{len(items)} products added from category {cat_id}")
        except Exception as e:
            log(f"❌ Error in category {cat_id}: {e}")

    if not all_products:
        log("⚠️ No products to save.")
        return

    # Save JSON
    with OUT_JSON.open("w", encoding="utf-8") as f:
        json.dump(all_products, f, ensure_ascii=False, indent=2)
    log(f"💾 JSON saved → {OUT_JSON.name} ({len(all_products)} records)")

    # Save Excel
    wb = Workbook()
    ws = wb.active
    ws.title = "IKEA_flat"
    headers = list(all_products[0].keys())
    ws.append(headers)
    for row in all_products:
        ws.append([row.get(h, "") for h in headers])
    wb.save(OUT_XLSX)
    log(f"📊 Excel saved → {OUT_XLSX.name}")
    log("🎯 Done.")


if __name__ == "__main__":
    main()
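
# Usage notes (derived from the code above):
#   1. Put IKEA category URLs into leaf_categories.txt, one per line;
#      the trailing digits of each URL are used as the category ID.
#   2. Run the script:  python3 ikea_collect_product_linksAND-mininfo.py
#   3. Output: json_raw/flattened_products.json and
#      json_raw/flattened_products.xlsx; progress is logged to fetch_log.txt.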