diff --git a/Parser_NEXT/config.yaml b/Parser_NEXT/config.yaml index 507640c..7389816 100644 --- a/Parser_NEXT/config.yaml +++ b/Parser_NEXT/config.yaml @@ -48,6 +48,11 @@ output: csv_also: true jsonl_also: true +pdp: + max_concurrency: 3 # одновременно открытых PDP-страниц + nav_timeout_ms: 45000 + wait_timeout_ms: 15000 + debug: dump_always: false # true — чтобы писать дампы на каждом шаге diff --git a/Parser_NEXT/fetcher.py b/Parser_NEXT/fetcher.py index 4753a2f..5e58387 100644 --- a/Parser_NEXT/fetcher.py +++ b/Parser_NEXT/fetcher.py @@ -1,16 +1,20 @@ import asyncio import logging -import re -import json import os +import json +import re from datetime import datetime from pathlib import Path from typing import List, Dict, Any, Optional -import re + from playwright.async_api import async_playwright from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type +class FetchError(Exception): + pass + + # ---- Price parsing helpers ---- _PLN_PRICE_RE = re.compile( r'(? float | None: - """ - '1 299,00 zł' / '1299 zł' / '1 299 zł' -> 1299.00 - Возвращает None, если распарсить не удалось. - """ if not price_text: return None t = ( - price_text - .replace("\u00a0", " ") # NBSP - .replace("\u2009", " ") # thin space - .strip() + price_text.replace("\u00a0", " ") + .replace("\u2009", " ") + .strip() ) m = _PLN_PRICE_RE.search(t) if not m: @@ -42,20 +41,13 @@ def parse_pln_price_to_float(price_text: str | None) -> float | None: return None -class FetchError(Exception): - pass - - class Fetcher: """ - Browser layer: Playwright Chromium with anti-bot hygiene and robust debug dumps. - - Blocks heavy resources (fonts/media/images), keeps stylesheets. - - Waits for either SSR summary scripts or window.ssrClientSettings. - - Two ways to read product summaries: - 1) window.ssrClientSettings.productSummary - 2) inline