# MacOS_Parsers/Parser_NEXT/fetcher.py

import asyncio
import logging
import re
import json
import os
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional

from playwright.async_api import async_playwright
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type


# ---- Price parsing helpers ----
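# Reading of the pattern below, for orientation: `(?<!\d)` keeps the match from
# starting inside a longer digit run; the capture group takes 1-3 leading digits
# plus any number of space/NBSP-separated thousands triples and an optional
# ",dd"/".dd" decimal tail; a trailing "zł" or "PLN" is required so bare numbers
# never match.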
_PLN_PRICE_RE = re.compile(
    r'(?<!\d)(\d{1,3}(?:[ \u00A0]?\d{3})*(?:[.,]\d{2})?)(?:\s*(?:zł|PLN))',
    re.IGNORECASE,
)


def parse_pln_price_to_float(price_text: str | None) -> float | None:
    """
    '1 299,00 zł' / '1299 zł' / '1 299 zł' -> 1299.00
    Returns None if the text cannot be parsed.
    """
    if not price_text:
        return None
    t = (
        price_text
        .replace("\u00a0", " ")  # NBSP
        .replace("\u2009", " ")  # thin space
        .strip()
    )
    m = _PLN_PRICE_RE.search(t)
    if not m:
        return None
    num = m.group(1)
    num = num.replace(" ", "").replace("\u00a0", "").replace("\u2009", "")
    num = num.replace(",", ".")
    try:
        return float(num)
    except ValueError:
        return None

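# Illustrative spot-checks (values follow from the regex and cleanup above):
#   parse_pln_price_to_float("1 299,00 zł")     -> 1299.0
#   parse_pln_price_to_float("1299 zł")         -> 1299.0
#   parse_pln_price_to_float("1\u00a0299 PLN")  -> 1299.0
#   parse_pln_price_to_float("Sale!")           -> None
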
class FetchError(Exception):
    pass

class Fetcher:
    """
    Browser layer: Playwright Chromium with anti-bot hygiene and robust debug dumps.
    - Blocks heavy resources (fonts/media/images), keeps stylesheets.
    - Waits for either SSR summary scripts or window.ssrClientSettings.
    - Two ways to read product summaries:
      1) window.ssrClientSettings.productSummary
      2) inline <script id="next-product-summary-script-..."> content (fallback)
    - Captures XHR JSON responses by patterns.
    - Dumps HTML/PNG with timestamps at key checkpoints and on failure.
    """

    def __init__(self, cfg: Dict[str, Any]):
        self.cfg = cfg
        self.base_url = cfg.get("base_url")
        self.xhr_patterns = [re.compile(p) for p in cfg.get("xhr_patterns", [])]
        self.collected_xhr: List[Dict[str, Any]] = []

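    # Expected cfg shape, for orientation (only the keys this class actually
    # reads; the values shown are the in-code fallbacks, the real config may
    # differ):
    #   {"base_url": ..., "headless": True, "locale": "en-GB",
    #    "timezoneId": "Europe/Warsaw", "nav_timeout_ms": 60000,
    #    "wait_timeout_ms": 30000, "xhr_patterns": [...regex strings...],
    #    "scroll": {...see auto_scroll_until_total...}}
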
    async def __aenter__(self):
        self.playwright = await async_playwright().start()
        args = ["--disable-dev-shm-usage", "--no-sandbox"]
        self.browser = await self.playwright.chromium.launch(
            headless=self.cfg.get("headless", True),
            args=args,
            devtools=not self.cfg.get("headless", True),
        )
        self.context = await self.browser.new_context(
            locale=self.cfg.get("locale", "en-GB"),
            timezone_id=self.cfg.get("timezoneId", "Europe/Warsaw"),
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1366, "height": 900},
        )
        self.page = await self.context.new_page()
        # Block heavy resources; keep stylesheets.
        await self.context.route("**/*", self._route)
        # Listen to JSON XHRs for optional parsing.
        self.page.on("response", self._on_response)
        self.page.on("console", lambda msg: logging.debug(f"[page.console] {msg.type} {msg.text}"))
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.context.close()
        await self.browser.close()
        await self.playwright.stop()

    async def _route(self, route, request):
        """
        Block a subset of heavy resources.
        To debug with images visible, remove 'image' from the list.
        """
        if request.resource_type in ["font", "media", "image"]:
            return await route.abort()
        return await route.continue_()

    def _on_response(self, response):
        try:
            url = response.url
            if any(p.search(url) for p in self.xhr_patterns):
                if "application/json" in (response.headers.get("content-type", "")):
                    self.collected_xhr.append({"url": url, "response": response})
        except Exception:
            pass

    async def _dump_debug(self, tag: str):
        """Save HTML and screenshot with timestamp; log absolute paths and CWD."""
        try:
            raw_dir = Path("out/raw_html").resolve()
            raw_dir.mkdir(parents=True, exist_ok=True)
            ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")
            html_path = raw_dir / f"{ts}_{tag}.html"
            png_path = raw_dir / f"{ts}_{tag}.png"
            cwd = Path(os.getcwd()).resolve()
            logging.info(f"[dump_debug] CWD={cwd} → html={html_path} png={png_path}")
            try:
                html = await self.page.content()
                html_path.write_text(html, encoding="utf-8")
            except Exception as e:
                logging.warning(f"[dump_debug] writing HTML failed: {e}")
            try:
                await self.page.screenshot(path=str(png_path), full_page=True)
            except Exception as e:
                logging.warning(f"[dump_debug] screenshot failed: {e}")
            logging.info(f"[dump_debug] saved OK: {html_path.name}, {png_path.name}")
        except Exception as e:
            logging.warning(f"[dump_debug] general fail: {e}")

    async def _accept_cookies_if_any(self):
        selectors = [
            "#onetrust-accept-btn-handler",
            "button#onetrust-accept-btn-handler",
            'button:has-text("Accept all")',
            'button:has-text("Accept All")',
            'button[aria-label*="Accept"]',
        ]
        for sel in selectors:
            try:
                el = self.page.locator(sel)
                if await el.count() > 0:
                    await el.first.click(timeout=2000)
                    logging.info("Cookie banner accepted.")
                    break
            except Exception:
                pass

    async def _log_plp_state(self, stage: str):
        """Log counts of SSR scripts and presence of window.ssrClientSettings."""
        try:
            scripts_count = await self.page.locator('script[id^="next-product-summary-script-"]').count()
        except Exception:
            scripts_count = -1
        try:
            has_window = await self.page.evaluate("""() => {
                const ps = globalThis?.ssrClientSettings?.productSummary;
                return !!(ps && Array.isArray(ps.itemNumbers) && ps.itemNumbers.length > 0);
            }""")
        except Exception:
            has_window = False
        logging.info(f"[{stage}] scripts: {scripts_count}, window.ps: {has_window}")

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=8),
        retry=retry_if_exception_type(FetchError),
    )
    async def load_category(self, url: str):
        """
        Navigation + robust readiness:
        1) domcontentloaded
        2) accept cookies
        3) warm-up scroll
        4) wait for <script id^="next-product-summary-script-"> (attached)
        5) attempt window.ssrClientSettings (non-fatal)
        Dumps at key checkpoints and on failure.
        """
        try:
            await self.page.goto(
                url,
                timeout=self.cfg.get("nav_timeout_ms", 60000),
                wait_until="domcontentloaded",
            )
            await self._dump_debug("after_goto")
            await self._accept_cookies_if_any()
            await self._dump_debug("after_cookies")
            await self._log_plp_state("after_accept")
            # warm-up scroll to trigger scripts/lazy
            for _ in range(3):
                await self.page.mouse.wheel(0, 1600)
                await self.page.wait_for_timeout(300)
            await self._dump_debug("after_warmup")
            await self._log_plp_state("after_warmup")
            # wait for SSR script tags
            await self.page.wait_for_selector(
                'script[id^="next-product-summary-script-"]',
                state="attached",
                timeout=self.cfg.get("wait_timeout_ms", 30000),
            )
            await self._dump_debug("after_scripts_present")
            # optional window readiness
            try:
                await self.page.wait_for_function(
                    """
                    () => {
                        const ps = globalThis?.ssrClientSettings?.productSummary;
                        return !!(ps && Array.isArray(ps.itemNumbers) && ps.itemNumbers.length > 0);
                    }
                    """,
                    timeout=5000,
                )
            except Exception:
                logging.info("window.ssrClientSettings not ready (non-fatal).")
            await self._dump_debug("after_window_check")
            return True
        except Exception as e:
            logging.error(f"load_category failed: {e}")
            await self._dump_debug("fail_load_category")
            raise FetchError(str(e))

    # ---------- NEW: read total count and scroll until target ----------
    async def read_total_from_header(self) -> Optional[int]:
        """
        Tries to read the category total from the header count, e.g. '(434)'.
        Looks in '#plp-seo-heading .esi-count', falling back to any '.esi-count'.
        """
        sels = ["#plp-seo-heading .esi-count", ".esi-count"]
        for sel in sels:
            try:
                el = self.page.locator(sel)
                if await el.count() > 0:
                    txt = await el.first.inner_text(timeout=1500)
                    digits = "".join(ch for ch in txt if ch.isdigit())
                    if digits:
                        total = int(digits)
                        logging.info(f"Total from header: {total}")
                        return total
            except Exception:
                continue
        logging.info("Total from header: not found")
        return None

    async def auto_scroll_until_total(self, hard_max_scrolls: Optional[int] = None):
        """
        Scrolls until we reach the target total (from the header), with a hard cap.
        Uses networkidle + a small jiggle to retrigger lazy loading.
        """
        hard_cap = hard_max_scrolls or self.cfg.get("scroll", {}).get("hard_max_scrolls", 2000)
        netidle_ms = self.cfg.get("scroll", {}).get("wait_networkidle_timeout_ms", 8000)
        # Combined product tile selector
        sel_tiles = '[data-testid="plp-product-grid-item"], [data-testid="product-tile"], .ProductCard, [data-qa="plp-product"]'
        target = await self.read_total_from_header()
        last = 0
        same_ticks = 0
        same_limit = self.cfg.get("scroll", {}).get("stop_if_no_new_items_after", 8)
        for i in range(hard_cap):
            # Scroll to bottom
            try:
                await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            except Exception:
                pass
            # Wait for network idle
            try:
                await self.page.wait_for_load_state("networkidle", timeout=netidle_ms)
            except Exception:
                # not fatal
                await asyncio.sleep(0.25)
            # Jiggle to retrigger observers
            try:
                await self.page.mouse.wheel(0, -200)
                await asyncio.sleep(0.1)
                await self.page.mouse.wheel(0, 1200)
            except Exception:
                pass
            try:
                seen = await self.page.locator(sel_tiles).count()
            except Exception:
                seen = last
            if target and seen >= target:
                logging.info(f"Reached target: seen {seen}/{target} (i={i})")
                break
            if seen <= last:
                same_ticks += 1
                if same_ticks >= same_limit:
                    logging.info(f"No growth for a while: seen={seen}, i={i}")
                    break
            else:
                same_ticks = 0
                last = seen
        logging.info(f"Final seen items: {last} (target={target}, cap={hard_cap})")

    # ---------- existing helpers ----------
    async def current_html(self) -> str:
        return await self.page.content()

    async def extract_xhr_json(self) -> List[Dict[str, Any]]:
        results = []
        for entry in self.collected_xhr:
            try:
                body = await entry["response"].json()
                results.append({"url": entry["url"], "json": body})
            except Exception:
                pass
        return results

    async def read_ssr_product_summaries(self) -> List[Dict[str, Any]]:
        """
        Returns simplified product summaries.
        Path 1: window.ssrClientSettings.productSummary
        Path 2: parse inline <script id="next-product-summary-script-..."> blocks
        """
        # Path 1 — from window
        js_window = """
        () => {
            const out = [];
            const ps = globalThis?.ssrClientSettings?.productSummary;
            if (!ps) return out;
            const ids = Array.isArray(ps.itemNumbers) ? ps.itemNumbers : [];
            for (const id of ids) {
                const obj = ps[id];
                if (!obj) continue;
                const sd = obj?._STATE_?.productSummary?.summaryData;
                if (!sd) continue;
                const cw = Array.isArray(sd.colourways) && sd.colourways.length ? sd.colourways[0] : null;
                out.push({
                    id: sd.id || null,
                    title: sd.title || null,
                    baseUrl: sd.baseUrl || null,
                    brand: sd.brand || null,
                    category: sd.category || null,
                    currencyCode: sd.currencyCode || null,
                    colourway: cw ? {
                        id: cw.id ?? null,
                        url: cw.url ?? null,
                        color: cw.c ?? null,
                        title: cw.t ?? null,
                        price: cw.p ?? null,
                        priceMarket: cw.mp ?? null,
                        selected: !!cw.s
                    } : null,
                    imageCdnUrl: sd.imageCdnUrl || null,
                    productImageUrlPart: sd.productImageUrlPart || null,
                    lgImagePath: sd.lgImagePath || null
                });
            }
            return out;
        }
        """
        try:
            w = await self.page.evaluate(js_window)
            if isinstance(w, list) and w:
                logging.info(f"SSR(window) summaries: {len(w)}")
                return w
        except Exception:
            pass
        # Path 2 — parse inline scripts
        js_scripts = """
        () => {
            const list = Array.from(document.querySelectorAll('script[id^="next-product-summary-script-"]'));
            return list.map(s => s.textContent || "");
        }
        """
        try:
            texts = await self.page.evaluate(js_scripts)
        except Exception:
            return []
        out: List[Dict[str, Any]] = []
        # productSummary["ID"] = { ... } OR productSummary['ID'] = { ... }
        assign_re = re.compile(r'productSummary\s*\[\s*([\'"])(.*?)\1\s*\]\s*=\s*\{')
        for t in texts or []:
            for m in assign_re.finditer(t):
                # Balanced-brace scan to slice out the object literal. This
                # assumes no stray '{'/'}' inside JSON string values; if that
                # ever happens, json.JSONDecoder().raw_decode would be the
                # more robust choice.
                start = m.end() - 1  # at '{'
                depth = 0
                end = None
                for i in range(start, len(t)):
                    ch = t[i]
                    if ch == "{":
                        depth += 1
                    elif ch == "}":
                        depth -= 1
                        if depth == 0:
                            end = i + 1
                            break
                if end is None:
                    continue
                block = t[start:end]
                try:
                    data = json.loads(block)
                    sd = (
                        data.get("_STATE_", {})
                        .get("productSummary", {})
                        .get("summaryData", {})
                    )
                    cws = sd.get("colourways") or []
                    cw = cws[0] if cws else None
                    out.append(
                        {
                            "id": sd.get("id"),
                            "title": sd.get("title"),
                            "baseUrl": sd.get("baseUrl"),
                            "brand": sd.get("brand"),
                            "category": sd.get("category"),
                            "currencyCode": sd.get("currencyCode"),
                            "colourway": {
                                "id": cw.get("id"),
                                "url": cw.get("url"),
                                "color": cw.get("c"),
                                "title": cw.get("t"),
                                "price": cw.get("p"),
                                "priceMarket": cw.get("mp"),
                                "selected": bool(cw.get("s")),
                            } if cw else None,
                            "imageCdnUrl": sd.get("imageCdnUrl"),
                            "productImageUrlPart": sd.get("productImageUrlPart"),
                            "lgImagePath": sd.get("lgImagePath"),
                        }
                    )
                except Exception:
                    continue
        return out

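    # Path 2 input, for orientation (hypothetical excerpt of an inline script):
    #   ...productSummary["T43162"] = {"_STATE_": {"productSummary": {"summaryData": {...}}}};
    # The balanced-brace scan slices out the {...} literal so json.loads can
    # parse it without executing any script.
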
    async def read_dom_products(self) -> List[Dict[str, Any]]:
        """
        Parses product cards from the DOM after scrolling.
        Covers several variants of the Next PLP markup.
        """
        js = r"""
        () => {
            const out = [];
            const gridItems = document.querySelectorAll('[data-testid="plp-product-grid-item"], .ProductCard, [data-qa="plp-product"]');
            const getPid = (container) => {
                // Variant 1: data-pid on the entrypoint
                const entry = container.querySelector('[id^="plp-product-summary-entrypoint-"]');
                if (entry && entry.getAttribute('data-pid')) return entry.getAttribute('data-pid');
                // Variant 2: id="plp-product-summary-tile-<ID>"
                const tile = container.closest('[id^="plp-product-summary-tile-"]') || container.querySelector('[id^="plp-product-summary-tile-"]');
                if (tile) {
                    const m = (tile.id || '').match(/plp-product-summary-tile-([A-Za-z0-9]+)/);
                    if (m) return m[1];
                }
                // Variant 3: pull it from an href like .../<ID>#<ID> or .../T43162
                const a = container.querySelector('a[href*="/style/"], a[data-testid^="product_summary_tile_"], a[href*="/p/"]');
                if (a) {
                    const href = a.getAttribute('href') || '';
                    const m2 = href.match(/([A-Z]\d{4,})/i);
                    if (m2) return m2[1].toUpperCase();
                }
                return null;
            };
            const getAbsUrl = (href) => {
                try {
                    if (!href) return null;
                    if (/^https?:\/\//i.test(href)) return href;
                    const a = document.createElement('a');
                    a.href = href;
                    return a.href;
                } catch { return href || null; }
            };
            const getTitle = (container) => {
                const t1 = container.querySelector('[data-testid="product_summary_title"]');
                if (t1) return (t1.getAttribute('data-label') || t1.textContent || '').trim();
                const t2 = container.querySelector('[data-testid="product-name"], .productName, [itemprop="name"]');
                if (t2) return (t2.textContent || '').trim();
                return null;
            };
            const getPriceText = (container) => {
                // cover several markup variants
                const priceRoots = [
                    container.querySelector('[data-testid="price"]'),
                    container.querySelector('[data-testid="ProductCard-Price"]'),
                    container.querySelector('[itemprop="price"]'),
                    container.querySelector('[aria-label*="price" i]'),
                    container
                ].filter(Boolean);
                for (const root of priceRoots) {
                    const spans = root.querySelectorAll('span, div');
                    for (const el of spans) {
                        const t = (el.textContent || '').trim();
                        if (!t) continue;
                        if (/\d/.test(t) && (t.includes('zł') || /PLN/i.test(t))) {
                            return t;
                        }
                    }
                }
                return null;
            };
            gridItems.forEach(container => {
                // Primary link
                const link = container.querySelector('a[href*="/style/"], a[data-testid^="product_summary_tile_"], a[href*="/p/"]');
                const href = link ? link.getAttribute('href') : null;
                const rec = {
                    id: getPid(container),
                    title: getTitle(container),
                    url: getAbsUrl(href),
                    price_text: getPriceText(container),
                    currency: null
                };
                if (rec.price_text) {
                    if (rec.price_text.includes('zł') || /PLN/i.test(rec.price_text)) rec.currency = 'PLN';
                }
                // skip empty cards with neither link nor title
                if (rec.url || rec.title) out.push(rec);
            });
            // De-duplicate by id|url
            const seen = new Set();
            const uniq = [];
            for (const d of out) {
                const key = `${d.id || ''}|${d.url || ''}`;
                if (seen.has(key)) continue;
                seen.add(key);
                uniq.push(d);
            }
            return uniq;
        }
        """
        try:
            data = await self.page.evaluate(js)
            logging.info(f"DOM cards parsed: {len(data)}")
            return data
        except Exception as e:
            logging.warning(f"read_dom_products failed: {e}")
            return []

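    # A DOM record, as assembled above (shape from the JS; values hypothetical):
    #   {"id": "T43162", "title": "...", "url": "https://.../style/.../t43162",
    #    "price_text": "1 299,00 zł", "currency": "PLN"}
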
    async def collect_products(self) -> List[Dict[str, Any]]:
        """
        Unified collection: SSR (when available) + DOM.
        Normalised to: id, title, url, price (float|None), currency ('PLN'|...).
        """
        ssr = await self.read_ssr_product_summaries() or []
        dom = await self.read_dom_products() or []
        bykey: Dict[str, Dict[str, Any]] = {}

        def key(d: Dict[str, Any]) -> str:
            return f"{(d.get('id') or '')}|{(d.get('url') or '')}"

        # 1) Skeleton from the DOM
        for d in dom:
            bykey[key(d)] = {
                "id": d.get("id"),
                "title": d.get("title"),
                "url": d.get("url"),
                "price_text": d.get("price_text"),
                "currency": d.get("currency"),
            }
        # 2) Enrich from SSR (when available)
        for s in ssr:
            cw = (s.get("colourway") or {})
            # build an absolute URL
            url = None
            try:
                base = (s.get("baseUrl") or "").rstrip("/")
                rel = (cw.get("url") or "").lstrip("/")
                url = f"{base}/{rel}" if (base and rel) else None
            except Exception:
                pass
            cand = {"id": s.get("id"), "url": url}
            k = key(cand)
            rec = bykey.get(k)
            if rec is None:
                bykey[k] = {
                    "id": s.get("id"),
                    "title": s.get("title"),
                    "url": url,
                    "price_text": cw.get("price"),
                    "currency": s.get("currencyCode"),
                }
            else:
                if not rec.get("title") and s.get("title"):
                    rec["title"] = s["title"]
                if not rec.get("price_text") and cw.get("price"):
                    rec["price_text"] = cw["price"]
                if not rec.get("currency") and s.get("currencyCode"):
                    rec["currency"] = s["currencyCode"]
        # 3) Final price normalisation
        out: List[Dict[str, Any]] = []
        for v in bykey.values():
            price_text = v.get("price_text")
            if isinstance(price_text, (int, float)):
                # SSR colourway price may already be numeric
                price_val = float(price_text)
            else:
                price_val = parse_pln_price_to_float(price_text)
            currency = v.get("currency")
            if not currency and isinstance(price_text, str) and "zł" in price_text.lower():
                currency = "PLN"
            out.append({
                "id": v.get("id"),
                "title": v.get("title"),
                "url": v.get("url"),
                "price": price_val,  # float or None
                "currency": currency or "PLN",
            })
        logging.info(f"Total collected (SSR+DOM): {len(out)}")
        return out
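
# Minimal usage sketch (illustrative only: the config keys mirror what this
# class reads, but the URL and pattern below are placeholders, not the
# project's real settings).
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    async def _demo():
        cfg = {
            "base_url": "https://www.example.com",  # placeholder
            "headless": True,
            "xhr_patterns": [r"/api/.*product"],    # placeholder pattern
        }
        async with Fetcher(cfg) as fetcher:
            await fetcher.load_category("https://www.example.com/some-category")  # placeholder URL
            await fetcher.auto_scroll_until_total()
            products = await fetcher.collect_products()
            logging.info(f"collected {len(products)} products")

    asyncio.run(_demo())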