205 lines
8.3 KiB
Python
205 lines
8.3 KiB
Python
#!/usr/bin/env python3
|
||
# Собирает ссылки на товары из списка категорий IKEA (leaf_categories.txt)
|
||
|
||
import re, sys, time, pathlib, requests, pandas as pd, html
|
||
from urllib.parse import urljoin
|
||
from bs4 import BeautifulSoup # нужен только для поиска "Pokaż więcej"
|
||
from math import ceil
|
||
import json, re, time, requests
|
||
|
||
# ── константы ───────────────────────────────────────────────────────
|
||
PRODUCT_RE = re.compile(
|
||
r'https?://www\.ikea\.com/pl/pl/p/[a-z0-9-]+-\d+/', re.I)
|
||
|
||
BASE_DIR = pathlib.Path(__file__).resolve().parent
|
||
BASE_DOMAIN = "https://www.ikea.com"
|
||
LOCALE_PREFIX = "/pl/pl/" # ← смените при другой локали
|
||
HEADERS = {"User-Agent": "Mozilla/5.0"}
|
||
|
||
CAT_FILE = BASE_DIR / "leaf_categories.txt"
|
||
OUT_TXT = BASE_DIR / "product_links.txt"
|
||
OUT_XLSX = BASE_DIR / "product_links.xlsx"
|
||
|
||
LOAD_PAUSE = 10 # задержка между страницами категории
|
||
WAIT_BEFORE_PARSE = 5
|
||
|
||
SEARCH_URL = "https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507"
|
||
# ────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def fetch_html(url, as_json=False):
|
||
r = requests.get(url, headers=HEADERS, timeout=10)
|
||
r.raise_for_status()
|
||
return r.json() if as_json else r.text
|
||
|
||
|
||
def extract_product_links(html_text: str) -> list[str]:
|
||
"""
|
||
Возвращает ВСЕ (может с повторами) абсолютные ссылки,
|
||
начинающиеся с https://www.ikea.com/pl/pl/p/
|
||
"""
|
||
return re.findall(r'https://www\.ikea\.com/pl/pl/p/[a-z0-9-]+-\d+/', html_text, flags=re.I)
|
||
|
||
|
||
# NEW ────────────────────────────────────────────────────────────────
|
||
# — замените старую функцию этим вариантом —
|
||
def product_links_from_category(cat_url: str) -> list[str]:
|
||
"""
|
||
Через POST-Search API вытаскивает все pipUrl категории.
|
||
"""
|
||
# --- достаём id категории из URL ----------------------------
|
||
m = re.search(r'-([0-9]+)/?$', cat_url.rstrip("/"))
|
||
if not m:
|
||
print(" ⚠️ не могу получить categoryId из URL")
|
||
return []
|
||
cat_id = m.group(1)
|
||
print (cat_id)
|
||
all_links = []
|
||
offset = 0
|
||
size = 24 # сайт именно так запрашивает
|
||
|
||
headers = {
|
||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
|
||
"Content-Type": "application/json",
|
||
"session-id":"99c25720-5826-4c57-87b2-d6e29397b584",
|
||
"Accept":"*/*",
|
||
}
|
||
|
||
while True:
|
||
# --- формируем payload точно как фронт ------------------
|
||
payload = {
|
||
"searchParameters": {"input": cat_id, "type": "CATEGORY"},
|
||
"zip": "05-090",
|
||
"store": "188",
|
||
"isUserLoggedIn": False,
|
||
"optimizely": {
|
||
"listing_3547_filter_hnf_sticky": None,
|
||
"listing_3332_collapsed_filter_bar": None,
|
||
"discount_percentage": None,
|
||
"listing_3790_simplify_rating_stars": None
|
||
},
|
||
"optimizelyAttributes": {
|
||
"market": "pl",
|
||
"device": "desktop",
|
||
"deviceVendor": "Apple",
|
||
"deviceType": "desktop",
|
||
"isLoggedIn": False,
|
||
"environment": "prod",
|
||
"browser": "Chrome",
|
||
"os": "Mac OS",
|
||
"language": "pl",
|
||
"feedMarket": "pl-PL",
|
||
"locale": "pl-PL",
|
||
"customerType": "guest",
|
||
"isEntranceVisit": False,
|
||
"pip_to_pip_src": ""
|
||
},
|
||
"components": [
|
||
{
|
||
"component": "PRIMARY_AREA",
|
||
"columns": 4,
|
||
"types": {
|
||
"main": "PRODUCT",
|
||
"breakouts": ["PLANNER", "LOGIN_REMINDER", "MATTRESS_WARRANTY"]
|
||
},
|
||
"filterConfig": {"max-num-filters": 6},
|
||
"window": {"size": size, "offset": offset},
|
||
"forceFilterCalculation": True
|
||
}
|
||
]
|
||
}
|
||
print(payload)
|
||
print(f" POST offset={offset}")
|
||
|
||
r = requests.post(SEARCH_URL,
|
||
headers=headers,
|
||
data=json.dumps(payload),
|
||
timeout=15)
|
||
|
||
print(f" status: {r.status_code}")
|
||
# --- выводим статус ---
|
||
print(f" status: {r.status_code}")
|
||
|
||
# --- печатаем полученный JSON ---
|
||
try:
|
||
data = r.json()
|
||
print(" JSON:", json.dumps(data, ensure_ascii=False, indent=2))
|
||
except ValueError:
|
||
print(" ⚠️ ответ не JSON")
|
||
break
|
||
|
||
if r.status_code != 200:
|
||
break
|
||
|
||
# --- вытаскиваем все pipUrl из блока products ------------
|
||
products = data.get("product", [])
|
||
for prod in products:
|
||
purl = prod.get("pipUrl")
|
||
if purl:
|
||
all_links.append(urljoin(BASE_DOMAIN, purl))
|
||
|
||
print(f" +{len(products)} товаров")
|
||
|
||
# если меньше, чем size → это была последняя порция
|
||
if len(products) < size:
|
||
break
|
||
|
||
offset += size
|
||
#time.sleep(PAUSE_SEC) # пауза между API-вызовами
|
||
|
||
return all_links
|
||
|
||
# ---------- остальные страницы ----------
|
||
for page in range(2, pages_total + 1):
|
||
url = f"{cat_url}?page={page}"
|
||
print(f" fetch: {url}")
|
||
html = fetch_html(url)
|
||
time.sleep(WAIT_BEFORE_PARSE) # ★ пауза 5 с
|
||
page_links = extract_product_links(html)
|
||
print(f" page {page}: +{len(page_links)}")
|
||
all_links.extend(page_links)
|
||
#time.sleep(PAUSE_SEC) # старая пауза между запросами
|
||
|
||
return all_links
|
||
# ────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def main():
|
||
if not CAT_FILE.exists():
|
||
sys.exit("✖ leaf_categories.txt не найден — запустите сборщик категорий")
|
||
|
||
categories = [l.strip() for l in CAT_FILE.read_text(encoding="utf-8").splitlines() if l.strip()]
|
||
all_links = set()
|
||
|
||
for idx, cat in enumerate(categories, 1):
|
||
print(f"[{idx}/{len(categories)}] {cat}")
|
||
try:
|
||
# OLD: html = fetch_html(cat)
|
||
# OLD: links = extract_product_links(html)
|
||
links = product_links_from_category(cat) # NEW
|
||
print(f" Σ {len(links)} товаров") # NEW
|
||
# all_links.update(links) # ← удаляет дубликаты
|
||
# стало, чтобы сохранять повторы:
|
||
all_links = []
|
||
all_links.extend(links)
|
||
except Exception as e:
|
||
print(" ⚠️", e)
|
||
time.sleep(10) # пауза между категориями (оставляем как было)
|
||
|
||
print("\nВсего уникальных товаров:", len(all_links))
|
||
|
||
# ---------- сохраняем ----------
|
||
OUT_TXT.write_text("\n".join(sorted(all_links)), encoding="utf-8")
|
||
print("🔗 product_links.txt создан")
|
||
|
||
try:
|
||
pd.DataFrame(sorted(all_links), columns=["url"]).to_excel(
|
||
OUT_XLSX, index=False)
|
||
print("📊 product_links.xlsx создан")
|
||
except Exception as e:
|
||
print("⚠️ Excel не записан:", e)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|