From 30456b2541806d35863dfb17d3791a34cfa1a0da Mon Sep 17 00:00:00 2001 From: va1is Date: Mon, 25 Aug 2025 14:46:19 +0300 Subject: [PATCH] IKEAmain for WIN --- Parser_NEXT/config.yaml | 5 + Parser_NEXT/fetcher.py | 372 +++++++++++---------- Parser_NEXT/main.py | 86 ++--- Parser_NEXT/models.py | 28 +- Parser_NEXT/sink.py | 41 +-- Парсер_IKEA/main_win.py | 714 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 978 insertions(+), 268 deletions(-) create mode 100644 Парсер_IKEA/main_win.py diff --git a/Parser_NEXT/config.yaml b/Parser_NEXT/config.yaml index 507640c..7389816 100644 --- a/Parser_NEXT/config.yaml +++ b/Parser_NEXT/config.yaml @@ -48,6 +48,11 @@ output: csv_also: true jsonl_also: true +pdp: + max_concurrency: 3 # одновременно открытых PDP-страниц + nav_timeout_ms: 45000 + wait_timeout_ms: 15000 + debug: dump_always: false # true — чтобы писать дампы на каждом шаге diff --git a/Parser_NEXT/fetcher.py b/Parser_NEXT/fetcher.py index 4753a2f..5e58387 100644 --- a/Parser_NEXT/fetcher.py +++ b/Parser_NEXT/fetcher.py @@ -1,16 +1,20 @@ import asyncio import logging -import re -import json import os +import json +import re from datetime import datetime from pathlib import Path from typing import List, Dict, Any, Optional -import re + from playwright.async_api import async_playwright from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type +class FetchError(Exception): + pass + + # ---- Price parsing helpers ---- _PLN_PRICE_RE = re.compile( r'(? float | None: - """ - '1 299,00 zł' / '1299 zł' / '1 299 zł' -> 1299.00 - Возвращает None, если распарсить не удалось. - """ if not price_text: return None t = ( - price_text - .replace("\u00a0", " ") # NBSP - .replace("\u2009", " ") # thin space - .strip() + price_text.replace("\u00a0", " ") + .replace("\u2009", " ") + .strip() ) m = _PLN_PRICE_RE.search(t) if not m: @@ -42,20 +41,13 @@ def parse_pln_price_to_float(price_text: str | None) -> float | None: return None -class FetchError(Exception): - pass - - class Fetcher: """ - Browser layer: Playwright Chromium with anti-bot hygiene and robust debug dumps. - - Blocks heavy resources (fonts/media/images), keeps stylesheets. - - Waits for either SSR summary scripts or window.ssrClientSettings. - - Two ways to read product summaries: - 1) window.ssrClientSettings.productSummary - 2) inline