#!/usr/bin/env python3
# ikea_collect_product_links.py — POST variant with full request/response logging
import datetime
import json
import pathlib
import re
import sys
import time
from urllib.parse import urljoin

import pandas as pd
import requests

# ── settings ──────────────────────────────────────────────────────
BASE_DIR = pathlib.Path(__file__).resolve().parent
CAT_FILE = BASE_DIR / "leaf_categories.txt"
OUT_TXT = BASE_DIR / "product_links.txt"
OUT_XLSX = BASE_DIR / "product_links.xlsx"
LOG_FILE = BASE_DIR / "log_all_CatProd.txt"
LOCALE_PREFIX = "/pl/pl/"  # change if targeting another country
BASE_DOMAIN = "https://www.ikea.com"
SEARCH_URL = "https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
    "Content-Type": "application/json",
    "session-id": "99c25720-5826-4c57-87b2-d6e29397b584",  # captured from a browser session; may need refreshing
    "Accept": "*/*",
}
SIZE = 24    # same page size the storefront uses
PAUSE = 0.6  # delay between offset requests, in seconds
# ──────────────────────────────────────────────────────────────────
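# Input format: leaf_categories.txt holds one category URL per line; only the
# trailing numeric id is used (extracted by product_links_from_category below).
# A purely illustrative line — slug and id are made up:
#   https://www.ikea.com/pl/pl/cat/some-slug-12345/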
def log(text: str):
    ts = datetime.datetime.now().strftime("[%Y-%m-%d %H:%M:%S] ")
    with LOG_FILE.open("a", encoding="utf-8") as f:
        f.write(ts + text + "\n")

LOG_FILE.write_text("", encoding="utf-8")  # truncate the log file at startup (no append across runs)

def fetch_chunk(category_id: str, offset: int) -> dict:
    payload = {
        "searchParameters": {"input": category_id, "type": "CATEGORY"},
        "zip": "05-090",
        "store": "188",
        "isUserLoggedIn": False,
        "optimizely": {
            "listing_3547_filter_hnf_sticky": None,
            "listing_3332_collapsed_filter_bar": None,
            "discount_percentage": None,
            "listing_3790_simplify_rating_stars": None
        },
        "optimizelyAttributes": {
            "market": "pl",
            "device": "desktop",
            "deviceVendor": "Apple",
            "deviceType": "desktop",
            "isLoggedIn": False,
            "environment": "prod",
            "browser": "Chrome",
            "os": "Mac OS",
            "language": "pl",
            "feedMarket": "pl-PL",
            "locale": "pl-PL",
            "customerType": "guest",
            "isEntranceVisit": False,
            "pip_to_pip_src": ""
        },
        "components": [
            {
                "component": "PRIMARY_AREA",
                "columns": 4,
                "types": {
                    "main": "PRODUCT",
                    "breakouts": ["PLANNER", "LOGIN_REMINDER", "MATTRESS_WARRANTY"]
                },
                "filterConfig": {"max-num-filters": 6},
                # request SIZE items starting at the current offset
                "window": {"size": SIZE, "offset": offset},
                "forceFilterCalculation": True
            }
        ]
    }
    log(f"POST offset={offset} payload: {json.dumps(payload, ensure_ascii=False)}")
    r = requests.post(SEARCH_URL, headers=HEADERS,
                      data=json.dumps(payload), timeout=15)
    log(f"→ status {r.status_code}")
    try:
        data = r.json()
        log("response: " + json.dumps(data, ensure_ascii=False))
    except ValueError:
        log("response: <non-JSON>")
        data = {}
    r.raise_for_status()  # raise after logging, so failed responses are still captured
    return data

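# Quick REPL check for the endpoint (sketch; "12345" is a made-up category id —
# substitute a real one from leaf_categories.txt):
#   >>> chunk = fetch_chunk("12345", 0)
#   >>> isinstance(chunk, dict)
#   True
# Every request/response pair is also appended verbatim to log_all_CatProd.txt.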
def collect_pipurls(node, out: set):
    """Recursively collect every value stored under the 'pipUrl' key."""
    if isinstance(node, dict):
        for k, v in node.items():
            if k == "pipUrl" and isinstance(v, str):
                out.add(urljoin(BASE_DOMAIN, v))
            else:
                collect_pipurls(v, out)
    elif isinstance(node, list):
        for item in node:
            collect_pipurls(item, out)

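# Worked example on a made-up response fragment (the real schema may differ —
# the walk is deliberately schema-agnostic, so it survives layout changes):
#   >>> found = set()
#   >>> collect_pipurls({"results": [{"items": [{"product": {"pipUrl": "/pl/pl/p/x-123/"}}]}]}, found)
#   >>> found
#   {'https://www.ikea.com/pl/pl/p/x-123/'}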
def product_links_from_category(cat_url: str) -> list[str]:
    m = re.search(r'-([0-9]+)/?$', cat_url.rstrip("/"))
    if not m:
        print("  ⚠️ categoryId not found")
        return []
    cid = m.group(1)
    offset = 0
    links = []
    while True:
        data = fetch_chunk(cid, offset)
        pip_set = set()
        collect_pipurls(data, pip_set)  # gather every pipUrl anywhere in the response
        links.extend(pip_set)           # duplicates across pages are kept deliberately
        print(f"  +{len(pip_set)} pipUrl")
        # fewer links than a full window means this was the last page
        if len(pip_set) < SIZE:
            break
        offset += SIZE
        time.sleep(PAUSE)  # throttle between offset requests
    return links

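# The id regex in action (illustrative URL):
#   >>> re.search(r'-([0-9]+)/?$', "https://www.ikea.com/pl/pl/cat/x-12345/".rstrip("/")).group(1)
#   '12345'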
def main():
    LOG_FILE.write_text("", encoding="utf-8")  # clear the log
    if not CAT_FILE.exists():
        sys.exit("leaf_categories.txt not found")
    categories = [l.strip() for l in CAT_FILE.read_text(encoding="utf-8").splitlines() if l.strip()]
    all_links = []
    for idx, cat in enumerate(categories, 1):
        print(f"[{idx}/{len(categories)}] {cat}")
        try:
            links = product_links_from_category(cat)
            print(f"  Σ {len(links)} links")
            all_links.extend(links)  # duplicates are kept
        except Exception as e:
            print("  ⚠️", e)
            log(f"ERROR on {cat}: {e}")
    OUT_TXT.write_text("\n".join(all_links), encoding="utf-8")
    print("🔗 product_links.txt created")
    try:
        pd.DataFrame(all_links, columns=["url"]).to_excel(OUT_XLSX, index=False)
        print("📊 product_links.xlsx created")
    except Exception as e:
        print("⚠️ Excel not written:", e)

if __name__ == "__main__":
    main()
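
# Usage: run from anywhere; paths are resolved relative to this script's directory:
#   python3 ikea_collect_product_links.py
# Outputs product_links.txt, product_links.xlsx and the verbose log_all_CatProd.txt
# next to the script.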