import asyncio
import logging
import os
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any, Optional

from playwright.async_api import async_playwright
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type


class FetchError(Exception):
    """Raised when a page could not be brought into a usable state."""


# ---- Price parsing helpers ----
# NOTE(review): the original pattern text and the helper's ``def`` line were
# lost (everything between the lookbehind's "<" and the "->" of the return
# annotation is missing). The pattern, its flags and the function name below
# are a reconstruction based on how group(1) is consumed further down --
# confirm them against the real file before relying on this.
_PLN_PRICE_RE = re.compile(
    r'(?<![\d.,])'
    r'(\d{1,3}(?: \d{3})+(?:[.,]\d{1,2})?|\d+(?:[.,]\d{1,2})?)'
    r'\s*(?:zł|PLN)',
    re.IGNORECASE,
)


def parse_pln_price(price_text: str) -> float | None:
    """Extract a PLN amount from display text such as ``"1 299,00 zł"``.

    Non-breaking and thin spaces are normalised to plain spaces before
    matching; thousands separators are then dropped and a decimal comma is
    turned into a dot.

    Returns:
        The amount as ``float``, or ``None`` when the text is empty, no price
        is found, or the captured number cannot be converted.
    """
    if not price_text:
        return None
    t = (
        price_text.replace("\u00a0", " ")
        .replace("\u2009", " ")
        .strip()
    )
    m = _PLN_PRICE_RE.search(t)
    if not m:
        return None
    num = m.group(1)
    num = num.replace(" ", "").replace("\u00a0", "").replace("\u2009", "")
    num = num.replace(",", ".")
    try:
        return float(num)
    except ValueError:
        return None


# In-page predicate: true once window.ssrClientSettings.productSummary exposes
# a non-empty itemNumbers array. Shared by the state logger and the loader.
_JS_HAS_PRODUCT_SUMMARY = """() => {
    const ps = globalThis?.ssrClientSettings?.productSummary;
    return !!(ps && Array.isArray(ps.itemNumbers) && ps.itemNumbers.length > 0);
}"""

# Marker for the per-product summary <script> tags on a listing page.
_SUMMARY_SCRIPT_SELECTOR = 'script[id^="next-product-summary-script-"]'


class Fetcher:
    """
    Playwright layer plus tooling:
    - Lightweight rendering (image/font/media requests are blocked, CSS is kept).
    - PLP: scroll until the full item count is loaded, collect SSR + DOM data.
    - PDP: enrichment with color/description.
    - HTML/PNG dumps into out/raw_html for debugging.

    Use as an async context manager: ``async with Fetcher(cfg) as f: ...``.
    """

    def __init__(self, cfg: Dict[str, Any]):
        """Store the configuration and pre-compile the XHR URL patterns.

        Args:
            cfg: Settings mapping. Keys read in this class include
                ``base_url``, ``xhr_patterns``, ``headless``, ``locale``,
                ``timezoneId``, ``nav_timeout_ms``, ``wait_timeout_ms`` and
                the nested ``scroll`` mapping.
        """
        self.cfg = cfg
        self.base_url = cfg.get("base_url")
        self.xhr_patterns = [re.compile(p) for p in cfg.get("xhr_patterns", [])]
        # Entries are {"url": str, "response": <playwright response>}.
        self.collected_xhr: List[Dict[str, Any]] = []

    async def __aenter__(self):
        """Start Playwright, launch Chromium and open a single page.

        If any step after the driver start fails, everything that was already
        started is shut down before the error propagates (``__aexit__`` is not
        invoked when ``__aenter__`` raises).
        """
        self.playwright = await async_playwright().start()
        self.browser = None
        self.context = None
        try:
            headless = self.cfg.get("headless", True)
            self.browser = await self.playwright.chromium.launch(
                headless=headless,
                args=["--disable-dev-shm-usage", "--no-sandbox"],
                devtools=not headless,
            )
            self.context = await self.browser.new_context(
                locale=self.cfg.get("locale", "en-GB"),
                timezone_id=self.cfg.get("timezoneId", "Europe/Warsaw"),
                user_agent=(
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/124.0.0.0 Safari/537.36"
                ),
                viewport={"width": 1366, "height": 900},
            )
            self.page = await self.context.new_page()
            await self.context.route("**/*", self._route)
            self.page.on("response", self._on_response)
            self.page.on(
                "console",
                lambda msg: logging.debug(f"[page.console] {msg.type} {msg.text}"),
            )
        except BaseException:
            await self._shutdown()
            raise
        return self

    async def __aexit__(self, exc_type, exc, tb):
        """Close the context, the browser and the Playwright driver."""
        await self._shutdown()

    async def _shutdown(self):
        """Tear down context, browser and driver; every stage is attempted.

        Errors still propagate, but a failure in an earlier stage no longer
        prevents the later stages from running.
        """
        context = getattr(self, "context", None)
        browser = getattr(self, "browser", None)
        playwright = getattr(self, "playwright", None)
        try:
            if context is not None:
                await context.close()
        finally:
            try:
                if browser is not None:
                    await browser.close()
            finally:
                if playwright is not None:
                    await playwright.stop()

    async def _route(self, route, request):
        """Abort font/media/image requests; let everything else through."""
        if request.resource_type in ("font", "media", "image"):
            return await route.abort()
        return await route.continue_()

    def _on_response(self, response):
        """Remember JSON responses whose URL matches a configured pattern.

        Only the response handle is stored; the body is read later by
        ``extract_xhr_json``.
        """
        try:
            url = response.url
            if not any(p.search(url) for p in self.xhr_patterns):
                return
            if "application/json" in (response.headers.get("content-type", "")):
                self.collected_xhr.append({"url": url, "response": response})
        except Exception as e:
            # Best effort: a broken response must never break page loading.
            logging.debug(f"[on_response] skipped: {e}")

    async def _dump_debug(self, tag: str):
        """Write the current page HTML and a full-page PNG to out/raw_html.

        Files are named ``<UTC timestamp>_<tag>.html`` / ``.png``. Every
        failure is logged and swallowed so debugging output can never abort a
        scrape.
        """
        try:
            raw_dir = Path("out/raw_html").resolve()
            raw_dir.mkdir(parents=True, exist_ok=True)
            ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S_%f")
            html_path = raw_dir / f"{ts}_{tag}.html"
            png_path = raw_dir / f"{ts}_{tag}.png"
            logging.info(f"[dump_debug] CWD={Path(os.getcwd()).resolve()} → html={html_path} png={png_path}")

            saved = []
            try:
                html = await self.page.content()
                html_path.write_text(html, encoding="utf-8")
                saved.append(html_path.name)
            except Exception as e:
                logging.warning(f"[dump_debug] writing HTML failed: {e}")

            try:
                await self.page.screenshot(path=str(png_path), full_page=True)
                saved.append(png_path.name)
            except Exception as e:
                logging.warning(f"[dump_debug] screenshot failed: {e}")

            # Report only what was really written.
            if saved:
                logging.info(f"[dump_debug] saved OK: {', '.join(saved)}")
        except Exception as e:
            logging.warning(f"[dump_debug] general fail: {e}")

    async def _accept_cookies_if_any(self):
        """Click the first cookie-consent button that can be found, if any."""
        selectors = [
            "#onetrust-accept-btn-handler",
            "button#onetrust-accept-btn-handler",
            'button:has-text("Accept all")',
            'button:has-text("Accept All")',
            'button[aria-label*="Accept"]',
        ]
        for sel in selectors:
            try:
                el = self.page.locator(sel)
                if await el.count() > 0:
                    await el.first.click(timeout=2000)
                    logging.info("Cookie banner accepted.")
                    break
            except Exception as e:
                # A selector that fails to click is not fatal; try the next.
                logging.debug(f"[cookies] {sel} failed: {e}")

    async def _log_plp_state(self, stage: str):
        """Log how many summary scripts exist and whether window data is ready.

        A script count of -1 means the count itself could not be obtained.
        """
        try:
            scripts_count = await self.page.locator(_SUMMARY_SCRIPT_SELECTOR).count()
        except Exception:
            scripts_count = -1
        try:
            has_window = await self.page.evaluate(_JS_HAS_PRODUCT_SUMMARY)
        except Exception:
            has_window = False
        logging.info(f"[{stage}] scripts: {scripts_count}, window.ps: {has_window}")

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=8), retry=retry_if_exception_type(FetchError))
    async def load_category(self, url: str):
        """Open a category page and wait until product summaries are present.

        Steps: navigate, accept cookies, three warm-up wheel scrolls, wait for
        the summary ``<script>`` tags, then wait briefly (non-fatal) for the
        window-level data. A debug dump is written after each stage.

        Returns:
            ``True`` on success.

        Raises:
            FetchError: for any failure inside one attempt (the original
                exception is chained). The decorator retries up to three
                attempts; as ``reraise`` is not set, tenacity's default is to
                raise ``RetryError`` once the attempts are exhausted.
        """
        try:
            await self.page.goto(
                url,
                timeout=self.cfg.get("nav_timeout_ms", 60000),
                wait_until="domcontentloaded",
            )
            await self._dump_debug("after_goto")

            await self._accept_cookies_if_any()
            await self._dump_debug("after_cookies")
            await self._log_plp_state("after_accept")

            for _ in range(3):
                await self.page.mouse.wheel(0, 1600)
                await self.page.wait_for_timeout(300)
            await self._dump_debug("after_warmup")
            await self._log_plp_state("after_warmup")

            await self.page.wait_for_selector(
                _SUMMARY_SCRIPT_SELECTOR,
                state="attached",
                timeout=self.cfg.get("wait_timeout_ms", 30000),
            )
            await self._dump_debug("after_scripts_present")

            try:
                await self.page.wait_for_function(_JS_HAS_PRODUCT_SUMMARY, timeout=5000)
            except Exception:
                logging.info("window.ssrClientSettings not ready (non-fatal).")
            await self._dump_debug("after_window_check")
            return True
        except Exception as e:
            logging.error(f"load_category failed: {e}")
            await self._dump_debug("fail_load_category")
            raise FetchError(str(e)) from e

    async def read_total_from_header(self) -> Optional[int]:
        """Read the item total shown in the listing header.

        All digits of the first matching element's text are concatenated, so
        grouped numbers such as ``"1 234"`` yield ``1234``.

        Returns:
            The total, or ``None`` when no selector yields any digits.
        """
        sels = ["#plp-seo-heading .esi-count", ".esi-count"]
        for sel in sels:
            try:
                el = self.page.locator(sel)
                if await el.count() > 0:
                    txt = await el.first.inner_text(timeout=1500)
                    digits = "".join(ch for ch in txt if ch.isdigit())
                    if digits:
                        total = int(digits)
                        logging.info(f"Total from header: {total}")
                        return total
            except Exception:
                continue
        logging.info("Total from header: not found")
        return None

    async def auto_scroll_until_total(self, hard_max_scrolls: Optional[int] = None):
        """Scroll the listing until all items are loaded or growth stops.

        The loop ends when the tile count reaches the header total, when the
        count has not grown for ``scroll.stop_if_no_new_items_after``
        consecutive iterations, or after the hard cap of iterations.

        Args:
            hard_max_scrolls: Iteration cap. ``None`` or ``0`` falls back to
                ``cfg["scroll"]["hard_max_scrolls"]`` (default 2000).
        """
        scroll_cfg = self.cfg.get("scroll", {})
        hard_cap = hard_max_scrolls or scroll_cfg.get("hard_max_scrolls", 2000)
        netidle_ms = scroll_cfg.get("wait_networkidle_timeout_ms", 8000)
        same_limit = scroll_cfg.get("stop_if_no_new_items_after", 8)
        sel_tiles = (
            '[data-testid="plp-product-grid-item"], '
            '[data-testid="product-tile"], '
            '.ProductCard, '
            '[data-qa="plp-product"]'
        )

        target = await self.read_total_from_header()
        last = 0
        same_ticks = 0

        for i in range(hard_cap):
            # Every page interaction below is best effort: one failed step
            # should not end the whole scroll session.
            try:
                await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            except Exception:
                pass
            try:
                await self.page.wait_for_load_state("networkidle", timeout=netidle_ms)
            except Exception:
                await asyncio.sleep(0.25)
            try:
                # Small up/down nudge to trigger lazy loaders tied to scroll events.
                await self.page.mouse.wheel(0, -200)
                await asyncio.sleep(0.1)
                await self.page.mouse.wheel(0, 1200)
            except Exception:
                pass

            try:
                seen = await self.page.locator(sel_tiles).count()
            except Exception:
                seen = last

            if target and seen >= target:
                # Record the count before leaving so the summary line is accurate.
                last = seen
                logging.info(f"Reached target: seen {seen}/{target} (i={i})")
                break

            if seen <= last:
                same_ticks += 1
                if same_ticks >= same_limit:
                    logging.info(f"No growth for a while: seen={seen}, i={i}")
                    break
            else:
                same_ticks = 0
                last = seen

        logging.info(f"Final seen items: {last} (target={target}, cap={hard_cap})")

    async def current_html(self) -> str:
        """Return the current page's serialized HTML."""
        return await self.page.content()

    async def extract_xhr_json(self) -> List[Dict[str, Any]]:
        """Decode the JSON bodies of all collected XHR responses.

        Responses whose body cannot be read or parsed are skipped.

        Returns:
            A list of ``{"url": ..., "json": ...}`` dicts.
        """
        results = []
        for entry in self.collected_xhr:
            try:
                body = await entry["response"].json()
            except Exception as e:
                logging.debug(f"[xhr] could not decode {entry['url']}: {e}")
                continue
            results.append({"url": entry["url"], "json": body})
        return results

    # ---------- PLP: SSR + DOM ----------
    async def read_ssr_product_summaries(self) -> List[Dict[str, Any]]:
        """Collect product summaries rendered into the page.

        The first source is ``window.ssrClientSettings.productSummary``: for
        each id in ``itemNumbers`` the summary data and its first colourway
        are flattened into a plain dict.
        """
        # 1) window.*
        js_window = """
        () => {
          const out = [];
          const ps = globalThis?.ssrClientSettings?.productSummary;
          if (!ps) return out;
          const ids = Array.isArray(ps.itemNumbers) ? ps.itemNumbers : [];
          for (const id of ids) {
            const obj = ps[id];
            if (!obj) continue;
            const sd = obj?._STATE_?.productSummary?.summaryData;
            if (!sd) continue;
            const cw = Array.isArray(sd.colourways) && sd.colourways.length ? sd.colourways[0] : null;
            out.push({
              id: sd.id || null,
              title: sd.title || null,
              baseUrl: sd.baseUrl || null,
              brand: sd.brand || null,
              category: sd.category || null,
              currencyCode: sd.currencyCode || null,
              colourway: cw ? {
                id: cw.id ?? null,
                url: cw.url ?? null,
                color: cw.c ?? null,
                title: cw.t ?? null,
                price: cw.p ?? null,
                priceMarket: cw.mp ?? null,
                selected: !!cw.s
              } : null,
              imageCdnUrl: sd.imageCdnUrl || null,
              productImageUrlPart: sd.productImageUrlPart || null,
              lgImagePath: sd.lgImagePath || null
            });
          }
          return out;
        }
        """
        try:
            w = await self.page.evaluate(js_window)
            if isinstance(w, list) and w:
                logging.info(f"SSR(window) summaries: {len(w)}")
                return w
        except Exception:
            pass
        # 2) inline