# (removed pasted viewer metadata residue: "165 lines / 6.1 KiB / Python")
#!/usr/bin/env python3
|
||
# ikea_collect_product_links.py – POST-вариант, полный лог
|
||
|
||
import json, re, sys, time, pathlib, requests, pandas as pd, datetime
|
||
from urllib.parse import urljoin
|
||
|
||
# ── Settings ──────────────────────────────────────────────────────
BASE_DIR = pathlib.Path(__file__).resolve().parent
CAT_FILE = BASE_DIR / "leaf_categories.txt"   # input: one category URL per line

OUT_TXT = BASE_DIR / "product_links.txt"      # output: plain-text list of product links
OUT_XLSX = BASE_DIR / "product_links.xlsx"    # output: same links as a spreadsheet
LOG_FILE = BASE_DIR / "log_all_CatProd.txt"   # full request/response log

LOCALE_PREFIX = "/pl/pl/"  # change if targeting another country; NOTE(review): not referenced elsewhere in this file
BASE_DOMAIN = "https://www.ikea.com"
SEARCH_URL = "https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507"

# NOTE(review): the session-id below is a hardcoded value captured from a
# browser session — it may expire; confirm it is still accepted by the API.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
    "Content-Type": "application/json",
    "session-id":"99c25720-5826-4c57-87b2-d6e29397b584",
    "Accept":"*/*",
}

SIZE = 24    # page size — same value the IKEA front-end uses
PAUSE = 0.6  # delay between offset requests, seconds
# ──────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def log(text: str) -> None:
    """Append *text* to LOG_FILE, prefixed with a "[YYYY-mm-dd HH:MM:SS] " timestamp."""
    stamp = datetime.datetime.now().strftime("[%Y-%m-%d %H:%M:%S] ")
    with LOG_FILE.open("a", encoding="utf-8") as handle:
        handle.write(f"{stamp}{text}\n")
|
# Truncate the log at import time (no append).
# NOTE(review): appears to be a module-level statement in the mangled paste —
# confirm; it is redundant with the identical truncation at the top of main().
LOG_FILE.write_text("", encoding="utf-8")
||
|
||
|
||
def fetch_chunk(category_id: str, offset: int) -> dict:
    """POST one listing page for *category_id* and return the decoded JSON.

    Parameters:
        category_id: numeric IKEA category id, as a string.
        offset: zero-based index of the first product to request.

    Returns the parsed JSON response ({} when the body is not JSON).
    Raises requests.HTTPError on a non-2xx status — deliberately *after*
    logging, so the log always captures the failing response.
    """
    payload = {
        "searchParameters": {"input": category_id, "type": "CATEGORY"},
        "zip": "05-090",
        "store": "188",
        "isUserLoggedIn": False,
        "optimizely": {
            "listing_3547_filter_hnf_sticky": None,
            "listing_3332_collapsed_filter_bar": None,
            "discount_percentage": None,
            "listing_3790_simplify_rating_stars": None
        },
        "optimizelyAttributes": {
            "market": "pl",
            "device": "desktop",
            "deviceVendor": "Apple",
            "deviceType": "desktop",
            "isLoggedIn": False,
            "environment": "prod",
            "browser": "Chrome",
            "os": "Mac OS",
            "language": "pl",
            "feedMarket": "pl-PL",
            "locale": "pl-PL",
            "customerType": "guest",
            "isEntranceVisit": False,
            "pip_to_pip_src": ""
        },
        "components": [
            {
                "component": "PRIMARY_AREA",
                "columns": 4,
                "types": {
                    "main": "PRODUCT",
                    "breakouts": ["PLANNER", "LOGIN_REMINDER", "MATTRESS_WARRANTY"]
                },
                "filterConfig": {"max-num-filters": 6},
                # BUG FIX: the window was hard-coded to {"size": 1000, "offset": 0},
                # ignoring this function's `offset` parameter — every "page" returned
                # the same products, so the caller's pagination loop (which advances
                # by SIZE and stops on a page shorter than SIZE) could never finish.
                # Use the caller-supplied offset and the shared SIZE page size.
                "window": {"size": SIZE, "offset": offset},
                "forceFilterCalculation": True
            }
        ]
    }

    log(f"POST offset={offset} payload: {json.dumps(payload, ensure_ascii=False)}")
    r = requests.post(SEARCH_URL, headers=HEADERS,
                      data=json.dumps(payload), timeout=15)
    log(f"→ status {r.status_code}")
    try:
        data = r.json()
        log("response: " + json.dumps(data, ensure_ascii=False))
    except ValueError:
        # Non-JSON body (HTML error page, empty response, …) — log and carry on.
        log("response: <non-JSON>")
        data = {}

    r.raise_for_status()
    return data
|
||
|
||
def collect_pipurls(node, out:set):
    """Walk *node* (arbitrarily nested dicts/lists) and add every string value
    stored under a 'pipUrl' key to *out*, resolved against BASE_DOMAIN.

    Iterative (explicit stack) traversal; visit order does not matter since
    *out* is a set.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if isinstance(current, dict):
            for key, value in current.items():
                if key == "pipUrl" and isinstance(value, str):
                    out.add(urljoin(BASE_DOMAIN, value))
                else:
                    pending.append(value)
        elif isinstance(current, list):
            pending.extend(current)
|
||
|
||
def product_links_from_category(cat_url: str) -> list[str]:
    """Collect all product ('pip') URLs for one category listing URL.

    The category id is the trailing digit group of the URL
    (e.g. ".../kuchnie-12345/" -> "12345"). Returns [] when no id is found.
    Paginates via fetch_chunk until a page shorter than SIZE comes back.
    """
    m = re.search(r'-([0-9]+)/?$', cat_url.rstrip("/"))
    if not m:
        print(" ⚠️ categoryId not found")
        return []

    cid = m.group(1)
    offset = 0
    links: list[str] = []

    while True:
        data = fetch_chunk(cid, offset)
        pip_set = set()
        collect_pipurls(data, pip_set)   # gather every pipUrl in the response
        # FIX: extend with sorted(pip_set) instead of the raw set so the output
        # order is deterministic across runs (set iteration order varies with
        # hash randomization). Cross-chunk duplicates are kept on purpose.
        links.extend(sorted(pip_set))
        print(f" +{len(pip_set)} pipUrl")

        # A page shorter than the standard window means the listing is exhausted.
        # NOTE(review): assumes the response carries a top-level "products" list —
        # confirm against the actual API schema.
        if len(data.get("products", [])) < SIZE:
            break
        offset += SIZE
        # time.sleep(PAUSE)  # optional throttle (FIX: constant is PAUSE, not PAUSE_SEC)

    return links
|
||
|
||
|
||
|
||
def main():
    """Read category URLs from CAT_FILE, collect product links for each,
    then write them to OUT_TXT and (best-effort) OUT_XLSX."""
    LOG_FILE.write_text("", encoding="utf-8")  # start every run with a fresh log
    if not CAT_FILE.exists():
        sys.exit("leaf_categories.txt не найден")

    raw = CAT_FILE.read_text(encoding="utf-8")
    categories = [line.strip() for line in raw.splitlines() if line.strip()]
    all_links = []
    total = len(categories)

    for idx, cat in enumerate(categories, 1):
        print(f"[{idx}/{total}] {cat}")
        try:
            found = product_links_from_category(cat)
        except Exception as exc:
            # Keep going on a per-category failure; record it in the log.
            print(" ⚠️", exc)
            log(f"ERROR on {cat}: {exc}")
        else:
            print(f" Σ {len(found)} ссылок")
            all_links.extend(found)  # duplicates are intentionally kept

    OUT_TXT.write_text("\n".join(all_links), encoding="utf-8")
    print("🔗 product_links.txt создан")

    # The Excel export is best-effort: a failure here must not lose the TXT output.
    try:
        pd.DataFrame(all_links, columns=["url"]).to_excel(OUT_XLSX, index=False)
        print("📊 product_links.xlsx создан")
    except Exception as exc:
        print("⚠️ Excel не записан:", exc)
|
||
|
||
|
||
# Script entry point: run the collector only when executed directly.
if __name__ == "__main__":
    main()
|