# MacOS_Parsers/Parser_NEXT/fetcher.py

import asyncio
import logging
import re
import json
import os
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional

from playwright.async_api import async_playwright
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type


# ---- Price parsing helpers ----
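# Reading of the pattern below, for orientation: `(?<!\d)` keeps the match from
# starting inside a longer digit run; the capture group takes 1-3 leading digits
# plus any number of space/NBSP-separated thousands triples and an optional
# ",dd"/".dd" decimal tail; a trailing "zł" or "PLN" is required so bare numbers
# never match.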
_PLN_PRICE_RE = re.compile(
    r'(?<!\d)(\d{1,3}(?:[ \u00A0]?\d{3})*(?:[.,]\d{2})?)(?:\s*(?:zł|PLN))',
    re.IGNORECASE,
)


def parse_pln_price_to_float(price_text: str | None) -> float | None:
    """
    '1 299,00 zł' / '1299 zł' / '1 299 zł' -> 1299.00
    Returns None if the text cannot be parsed.
    """
    if not price_text:
        return None
    t = (
        price_text
        .replace("\u00a0", " ")  # NBSP
        .replace("\u2009", " ")  # thin space
        .strip()
    )
    m = _PLN_PRICE_RE.search(t)
    if not m:
        return None
    num = m.group(1)
    num = num.replace(" ", "").replace("\u00a0", "").replace("\u2009", "")
    num = num.replace(",", ".")
    try:
        return float(num)
    except ValueError:
        return None

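# Illustrative spot-checks (values follow from the regex and cleanup above):
#   parse_pln_price_to_float("1 299,00 zł")     -> 1299.0
#   parse_pln_price_to_float("1299 zł")         -> 1299.0
#   parse_pln_price_to_float("1\u00a0299 PLN")  -> 1299.0
#   parse_pln_price_to_float("Sale!")           -> None
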
class FetchError(Exception):
    pass

class Fetcher:
    """
    Browser layer: Playwright Chromium with anti-bot hygiene and robust debug dumps.
    - Blocks heavy resources (fonts/media/images), keeps stylesheets.
    - Waits for either SSR summary scripts or window.ssrClientSettings.
    - Two ways to read product summaries:
      1) window.ssrClientSettings.productSummary
      2) inline <script id="next-product-summary-script-..."> content (fallback)
    - Captures XHR JSON responses by patterns.
    - Dumps HTML/PNG with timestamps at key checkpoints and on failure.
    """

    def __init__(self, cfg: Dict[str, Any]):
        self.cfg = cfg
        self.base_url = cfg.get("base_url")
        self.xhr_patterns = [re.compile(p) for p in cfg.get("xhr_patterns", [])]
        self.collected_xhr: List[Dict[str, Any]] = []

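    # Expected cfg shape, for orientation (only the keys this class actually
    # reads; the values shown are the in-code fallbacks, the real config may
    # differ):
    #   {"base_url": ..., "headless": True, "locale": "en-GB",
    #    "timezoneId": "Europe/Warsaw", "nav_timeout_ms": 60000,
    #    "wait_timeout_ms": 30000, "xhr_patterns": [...regex strings...],
    #    "scroll": {...see auto_scroll_until_total...}}
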
    async def __aenter__(self):
        self.playwright = await async_playwright().start()
        args = ["--disable-dev-shm-usage", "--no-sandbox"]
        self.browser = await self.playwright.chromium.launch(
            headless=self.cfg.get("headless", True),
            args=args,
            devtools=not self.cfg.get("headless", True),
        )
        self.context = await self.browser.new_context(
            locale=self.cfg.get("locale", "en-GB"),
            timezone_id=self.cfg.get("timezoneId", "Europe/Warsaw"),
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1366, "height": 900},
        )
        self.page = await self.context.new_page()
        # Block heavy resources; keep stylesheets.
        await self.context.route("**/*", self._route)
        # Listen to JSON XHRs for optional parsing.
        self.page.on("response", self._on_response)
        self.page.on("console", lambda msg: logging.debug(f"[page.console] {msg.type} {msg.text}"))
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.context.close()
        await self.browser.close()
        await self.playwright.stop()

    async def _route(self, route, request):
        """
        Block a subset of heavy resources.
        To debug with images visible, remove 'image' from the list.
        """
        if request.resource_type in ["font", "media", "image"]:
            return await route.abort()
        return await route.continue_()

    def _on_response(self, response):
        try:
            url = response.url
            if any(p.search(url) for p in self.xhr_patterns):
                if "application/json" in (response.headers.get("content-type", "")):
                    self.collected_xhr.append({"url": url, "response": response})
        except Exception:
            pass

    async def _dump_debug(self, tag: str):
        """Save HTML and screenshot with timestamp; log absolute paths and CWD."""
        try:
            raw_dir = Path("out/raw_html").resolve()
            raw_dir.mkdir(parents=True, exist_ok=True)
            ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")
            html_path = raw_dir / f"{ts}_{tag}.html"
            png_path = raw_dir / f"{ts}_{tag}.png"
            cwd = Path(os.getcwd()).resolve()
            logging.info(f"[dump_debug] CWD={cwd} → html={html_path} png={png_path}")
            try:
                html = await self.page.content()
                html_path.write_text(html, encoding="utf-8")
            except Exception as e:
                logging.warning(f"[dump_debug] writing HTML failed: {e}")
            try:
                await self.page.screenshot(path=str(png_path), full_page=True)
            except Exception as e:
                logging.warning(f"[dump_debug] screenshot failed: {e}")
            logging.info(f"[dump_debug] saved OK: {html_path.name}, {png_path.name}")
        except Exception as e:
            logging.warning(f"[dump_debug] general fail: {e}")

    async def _accept_cookies_if_any(self):
        selectors = [
            "#onetrust-accept-btn-handler",
            "button#onetrust-accept-btn-handler",
            'button:has-text("Accept all")',
            'button:has-text("Accept All")',
            'button[aria-label*="Accept"]',
        ]
        for sel in selectors:
            try:
                el = self.page.locator(sel)
                if await el.count() > 0:
                    await el.first.click(timeout=2000)
                    logging.info("Cookie banner accepted.")
                    break
            except Exception:
                pass

    async def _log_plp_state(self, stage: str):
        """Log counts of SSR scripts and presence of window.ssrClientSettings."""
        try:
            scripts_count = await self.page.locator('script[id^="next-product-summary-script-"]').count()
        except Exception:
            scripts_count = -1
        try:
            has_window = await self.page.evaluate("""() => {
                const ps = globalThis?.ssrClientSettings?.productSummary;
                return !!(ps && Array.isArray(ps.itemNumbers) && ps.itemNumbers.length > 0);
            }""")
        except Exception:
            has_window = False
        logging.info(f"[{stage}] scripts: {scripts_count}, window.ps: {has_window}")

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=8),
        retry=retry_if_exception_type(FetchError),
    )
    async def load_category(self, url: str):
        """
        Navigation + robust readiness:
        1) domcontentloaded
        2) accept cookies
        3) warm-up scroll
        4) wait for <script id^="next-product-summary-script-"> (attached)
        5) attempt window.ssrClientSettings (non-fatal)
        Dumps at key checkpoints and on failure.
        """
        try:
            await self.page.goto(
                url,
                timeout=self.cfg.get("nav_timeout_ms", 60000),
                wait_until="domcontentloaded",
            )
            await self._dump_debug("after_goto")
            await self._accept_cookies_if_any()
            await self._dump_debug("after_cookies")
            await self._log_plp_state("after_accept")
            # warm-up scroll to trigger scripts/lazy
            for _ in range(3):
                await self.page.mouse.wheel(0, 1600)
                await self.page.wait_for_timeout(300)
            await self._dump_debug("after_warmup")
            await self._log_plp_state("after_warmup")
            # wait for SSR script tags
            await self.page.wait_for_selector(
                'script[id^="next-product-summary-script-"]',
                state="attached",
                timeout=self.cfg.get("wait_timeout_ms", 30000),
            )
            await self._dump_debug("after_scripts_present")
            # optional window readiness
            try:
                await self.page.wait_for_function(
                    """
                    () => {
                        const ps = globalThis?.ssrClientSettings?.productSummary;
                        return !!(ps && Array.isArray(ps.itemNumbers) && ps.itemNumbers.length > 0);
                    }
                    """,
                    timeout=5000,
                )
            except Exception:
                logging.info("window.ssrClientSettings not ready (non-fatal).")
            await self._dump_debug("after_window_check")
            return True
        except Exception as e:
            logging.error(f"load_category failed: {e}")
            await self._dump_debug("fail_load_category")
            raise FetchError(str(e))

    # ---------- NEW: read total count and scroll until target ----------
    async def read_total_from_header(self) -> Optional[int]:
        """
        Tries to read the category total from the header count, e.g. '(434)'.
        Looks in '#plp-seo-heading .esi-count', falling back to any '.esi-count'.
        """
        sels = ["#plp-seo-heading .esi-count", ".esi-count"]
        for sel in sels:
            try:
                el = self.page.locator(sel)
                if await el.count() > 0:
                    txt = await el.first.inner_text(timeout=1500)
                    digits = "".join(ch for ch in txt if ch.isdigit())
                    if digits:
                        total = int(digits)
                        logging.info(f"Total from header: {total}")
                        return total
            except Exception:
                continue
        logging.info("Total from header: not found")
        return None

    async def auto_scroll_until_total(self, hard_max_scrolls: Optional[int] = None):
        """
        Scrolls until we reach the target total (from the header), with a hard cap.
        Uses networkidle + a small jiggle to retrigger lazy loading.
        """
        hard_cap = hard_max_scrolls or self.cfg.get("scroll", {}).get("hard_max_scrolls", 2000)
        netidle_ms = self.cfg.get("scroll", {}).get("wait_networkidle_timeout_ms", 8000)
        # Combined product tile selector
        sel_tiles = '[data-testid="plp-product-grid-item"], [data-testid="product-tile"], .ProductCard, [data-qa="plp-product"]'
        target = await self.read_total_from_header()
        last = 0
        same_ticks = 0
        same_limit = self.cfg.get("scroll", {}).get("stop_if_no_new_items_after", 8)
        for i in range(hard_cap):
            # Scroll to bottom
            try:
                await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            except Exception:
                pass
            # Wait for network idle
            try:
                await self.page.wait_for_load_state("networkidle", timeout=netidle_ms)
            except Exception:
                # not fatal
                await asyncio.sleep(0.25)
            # Jiggle to retrigger observers
            try:
                await self.page.mouse.wheel(0, -200)
                await asyncio.sleep(0.1)
                await self.page.mouse.wheel(0, 1200)
            except Exception:
                pass
            try:
                seen = await self.page.locator(sel_tiles).count()
            except Exception:
                seen = last
            if target and seen >= target:
                logging.info(f"Reached target: seen {seen}/{target} (i={i})")
                break
            if seen <= last:
                same_ticks += 1
                if same_ticks >= same_limit:
                    logging.info(f"No growth for a while: seen={seen}, i={i}")
                    break
            else:
                same_ticks = 0
                last = seen
        logging.info(f"Final seen items: {last} (target={target}, cap={hard_cap})")

    # ---------- existing helpers ----------
    async def current_html(self) -> str:
        return await self.page.content()

    async def extract_xhr_json(self) -> List[Dict[str, Any]]:
        results = []
        for entry in self.collected_xhr:
            try:
                body = await entry["response"].json()
                results.append({"url": entry["url"], "json": body})
            except Exception:
                pass
        return results

    async def read_ssr_product_summaries(self) -> List[Dict[str, Any]]:
        """
        Returns simplified product summaries.
        Path 1: window.ssrClientSettings.productSummary
        Path 2: parse inline <script id="next-product-summary-script-..."> blocks
        """
        # Path 1 — from window
        js_window = """
        () => {
            const out = [];
            const ps = globalThis?.ssrClientSettings?.productSummary;
            if (!ps) return out;
            const ids = Array.isArray(ps.itemNumbers) ? ps.itemNumbers : [];
            for (const id of ids) {
                const obj = ps[id];
                if (!obj) continue;
                const sd = obj?._STATE_?.productSummary?.summaryData;
                if (!sd) continue;
                const cw = Array.isArray(sd.colourways) && sd.colourways.length ? sd.colourways[0] : null;
                out.push({
                    id: sd.id || null,
                    title: sd.title || null,
                    baseUrl: sd.baseUrl || null,
                    brand: sd.brand || null,
                    category: sd.category || null,
                    currencyCode: sd.currencyCode || null,
                    colourway: cw ? {
                        id: cw.id ?? null,
                        url: cw.url ?? null,
                        color: cw.c ?? null,
                        title: cw.t ?? null,
                        price: cw.p ?? null,
                        priceMarket: cw.mp ?? null,
                        selected: !!cw.s
                    } : null,
                    imageCdnUrl: sd.imageCdnUrl || null,
                    productImageUrlPart: sd.productImageUrlPart || null,
                    lgImagePath: sd.lgImagePath || null
                });
            }
            return out;
        }
        """
        try:
            w = await self.page.evaluate(js_window)
            if isinstance(w, list) and w:
                logging.info(f"SSR(window) summaries: {len(w)}")
                return w
        except Exception:
            pass
        # Path 2 — parse inline scripts
        js_scripts = """
        () => {
            const list = Array.from(document.querySelectorAll('script[id^="next-product-summary-script-"]'));
            return list.map(s => s.textContent || "");
        }
        """
        try:
            texts = await self.page.evaluate(js_scripts)
        except Exception:
            return []
        out: List[Dict[str, Any]] = []
        # productSummary["ID"] = { ... } OR productSummary['ID'] = { ... }
        assign_re = re.compile(r'productSummary\s*\[\s*([\'"])(.*?)\1\s*\]\s*=\s*\{')
        for t in texts or []:
            for m in assign_re.finditer(t):
                # Balanced-brace scan to slice out the object literal. This
                # assumes no stray '{'/'}' inside JSON string values; if that
                # ever happens, json.JSONDecoder().raw_decode would be the
                # more robust choice.
                start = m.end() - 1  # at '{'
                depth = 0
                end = None
                for i in range(start, len(t)):
                    ch = t[i]
                    if ch == "{":
                        depth += 1
                    elif ch == "}":
                        depth -= 1
                        if depth == 0:
                            end = i + 1
                            break
                if end is None:
                    continue
                block = t[start:end]
                try:
                    data = json.loads(block)
                    sd = (
                        data.get("_STATE_", {})
                        .get("productSummary", {})
                        .get("summaryData", {})
                    )
                    cws = sd.get("colourways") or []
                    cw = cws[0] if cws else None
                    out.append(
                        {
                            "id": sd.get("id"),
                            "title": sd.get("title"),
                            "baseUrl": sd.get("baseUrl"),
                            "brand": sd.get("brand"),
                            "category": sd.get("category"),
                            "currencyCode": sd.get("currencyCode"),
                            "colourway": {
                                "id": cw.get("id"),
                                "url": cw.get("url"),
                                "color": cw.get("c"),
                                "title": cw.get("t"),
                                "price": cw.get("p"),
                                "priceMarket": cw.get("mp"),
                                "selected": bool(cw.get("s")),
                            } if cw else None,
                            "imageCdnUrl": sd.get("imageCdnUrl"),
                            "productImageUrlPart": sd.get("productImageUrlPart"),
                            "lgImagePath": sd.get("lgImagePath"),
                        }
                    )
                except Exception:
                    continue
        return out

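    # Path 2 input, for orientation (hypothetical excerpt of an inline script):
    #   ...productSummary["T43162"] = {"_STATE_": {"productSummary": {"summaryData": {...}}}};
    # The balanced-brace scan slices out the {...} literal so json.loads can
    # parse it without executing any script.
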
    async def read_dom_products(self) -> List[Dict[str, Any]]:
        """
        Parses product cards from the DOM after scrolling.
        Covers several variants of the Next PLP markup.
        """
        js = r"""
        () => {
            const out = [];
            const gridItems = document.querySelectorAll('[data-testid="plp-product-grid-item"], .ProductCard, [data-qa="plp-product"]');
            const getPid = (container) => {
                // Variant 1: data-pid on the entrypoint
                const entry = container.querySelector('[id^="plp-product-summary-entrypoint-"]');
                if (entry && entry.getAttribute('data-pid')) return entry.getAttribute('data-pid');
                // Variant 2: id="plp-product-summary-tile-<ID>"
                const tile = container.closest('[id^="plp-product-summary-tile-"]') || container.querySelector('[id^="plp-product-summary-tile-"]');
                if (tile) {
                    const m = (tile.id || '').match(/plp-product-summary-tile-([A-Za-z0-9]+)/);
                    if (m) return m[1];
                }
                // Variant 3: pull it from an href like .../<ID>#<ID> or .../T43162
                const a = container.querySelector('a[href*="/style/"], a[data-testid^="product_summary_tile_"], a[href*="/p/"]');
                if (a) {
                    const href = a.getAttribute('href') || '';
                    const m2 = href.match(/([A-Z]\d{4,})/i);
                    if (m2) return m2[1].toUpperCase();
                }
                return null;
            };
            const getAbsUrl = (href) => {
                try {
                    if (!href) return null;
                    if (/^https?:\/\//i.test(href)) return href;
                    const a = document.createElement('a');
                    a.href = href;
                    return a.href;
                } catch { return href || null; }
            };
            const getTitle = (container) => {
                const t1 = container.querySelector('[data-testid="product_summary_title"]');
                if (t1) return (t1.getAttribute('data-label') || t1.textContent || '').trim();
                const t2 = container.querySelector('[data-testid="product-name"], .productName, [itemprop="name"]');
                if (t2) return (t2.textContent || '').trim();
                return null;
            };
            const getPriceText = (container) => {
                // cover several markup variants
                const priceRoots = [
                    container.querySelector('[data-testid="price"]'),
                    container.querySelector('[data-testid="ProductCard-Price"]'),
                    container.querySelector('[itemprop="price"]'),
                    container.querySelector('[aria-label*="price" i]'),
                    container
                ].filter(Boolean);
                for (const root of priceRoots) {
                    const spans = root.querySelectorAll('span, div');
                    for (const el of spans) {
                        const t = (el.textContent || '').trim();
                        if (!t) continue;
                        if (/\d/.test(t) && (t.includes('zł') || /PLN/i.test(t))) {
                            return t;
                        }
                    }
                }
                return null;
            };
            gridItems.forEach(container => {
                // Primary link
                const link = container.querySelector('a[href*="/style/"], a[data-testid^="product_summary_tile_"], a[href*="/p/"]');
                const href = link ? link.getAttribute('href') : null;
                const rec = {
                    id: getPid(container),
                    title: getTitle(container),
                    url: getAbsUrl(href),
                    price_text: getPriceText(container),
                    currency: null
                };
                if (rec.price_text) {
                    if (rec.price_text.includes('zł') || /PLN/i.test(rec.price_text)) rec.currency = 'PLN';
                }
                // skip empty cards with neither link nor title
                if (rec.url || rec.title) out.push(rec);
            });
            // De-duplicate by id|url
            const seen = new Set();
            const uniq = [];
            for (const d of out) {
                const key = `${d.id || ''}|${d.url || ''}`;
                if (seen.has(key)) continue;
                seen.add(key);
                uniq.push(d);
            }
            return uniq;
        }
        """
        try:
            data = await self.page.evaluate(js)
            logging.info(f"DOM cards parsed: {len(data)}")
            return data
        except Exception as e:
            logging.warning(f"read_dom_products failed: {e}")
            return []

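    # A DOM record, as assembled above (shape from the JS; values hypothetical):
    #   {"id": "T43162", "title": "...", "url": "https://.../style/.../t43162",
    #    "price_text": "1 299,00 zł", "currency": "PLN"}
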
    async def collect_products(self) -> List[Dict[str, Any]]:
        """
        Unified collection: SSR (when available) + DOM.
        Normalised to: id, title, url, price (float|None), currency ('PLN'|...).
        """
        ssr = await self.read_ssr_product_summaries() or []
        dom = await self.read_dom_products() or []
        bykey: Dict[str, Dict[str, Any]] = {}

        def key(d: Dict[str, Any]) -> str:
            return f"{(d.get('id') or '')}|{(d.get('url') or '')}"

        # 1) Skeleton from the DOM
        for d in dom:
            bykey[key(d)] = {
                "id": d.get("id"),
                "title": d.get("title"),
                "url": d.get("url"),
                "price_text": d.get("price_text"),
                "currency": d.get("currency"),
            }
        # 2) Enrich from SSR (when available)
        for s in ssr:
            cw = (s.get("colourway") or {})
            # build an absolute URL
            url = None
            try:
                base = (s.get("baseUrl") or "").rstrip("/")
                rel = (cw.get("url") or "").lstrip("/")
                url = f"{base}/{rel}" if (base and rel) else None
            except Exception:
                pass
            cand = {"id": s.get("id"), "url": url}
            k = key(cand)
            rec = bykey.get(k)
            if rec is None:
                bykey[k] = {
                    "id": s.get("id"),
                    "title": s.get("title"),
                    "url": url,
                    "price_text": cw.get("price"),
                    "currency": s.get("currencyCode"),
                }
            else:
                if not rec.get("title") and s.get("title"):
                    rec["title"] = s["title"]
                if not rec.get("price_text") and cw.get("price"):
                    rec["price_text"] = cw["price"]
                if not rec.get("currency") and s.get("currencyCode"):
                    rec["currency"] = s["currencyCode"]
        # 3) Final price normalisation
        out: List[Dict[str, Any]] = []
        for v in bykey.values():
            price_text = v.get("price_text")
            if isinstance(price_text, (int, float)):
                # SSR colourway price may already be numeric
                price_val = float(price_text)
            else:
                price_val = parse_pln_price_to_float(price_text)
            currency = v.get("currency")
            if not currency and isinstance(price_text, str) and "zł" in price_text.lower():
                currency = "PLN"
            out.append({
                "id": v.get("id"),
                "title": v.get("title"),
                "url": v.get("url"),
                "price": price_val,  # float or None
                "currency": currency or "PLN",
            })
        logging.info(f"Total collected (SSR+DOM): {len(out)}")
        return out
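
# Minimal usage sketch (illustrative only: the config keys mirror what this
# class reads, but the URL and pattern below are placeholders, not the
# project's real settings).
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    async def _demo():
        cfg = {
            "base_url": "https://www.example.com",  # placeholder
            "headless": True,
            "xhr_patterns": [r"/api/.*product"],    # placeholder pattern
        }
        async with Fetcher(cfg) as fetcher:
            await fetcher.load_category("https://www.example.com/some-category")  # placeholder URL
            await fetcher.auto_scroll_until_total()
            products = await fetcher.collect_products()
            logging.info(f"collected {len(products)} products")

    asyncio.run(_demo())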