diff --git a/.gitignore b/.gitignore index 3d0e2a0..f9adf74 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ Temporary Items .apdisk __pycache__ -/Parsing ZARAHOME/src/records_folder +records_folder Ignore_Temp -/Processing/Files-todo \ No newline at end of file +/Processing/Files-todo +out \ No newline at end of file diff --git a/Parser_NEXT/.env.example b/Parser_NEXT/.env.example new file mode 100644 index 0000000..7e68a3f --- /dev/null +++ b/Parser_NEXT/.env.example @@ -0,0 +1,2 @@ +# PROXY=http://user:pass@host:port +# RATE_LIMIT=1.0 diff --git a/Parser_NEXT/README.md b/Parser_NEXT/README.md new file mode 100644 index 0000000..3c73b01 --- /dev/null +++ b/Parser_NEXT/README.md @@ -0,0 +1,17 @@ +# NEXT.pl Parser (Playwright, Python 3.12) + +## Quick start +```bash +python -m venv .venv +source .venv/bin/activate # Windows: .venv\Scripts\activate +pip install -r requirements.txt +python -m playwright install chromium +python main.py +``` + +**categories.xlsx** — формат ввода: +- Первая колонка (A) — ссылки на категории (без заголовка). +- Любые другие колонки (B, C, …) игнорируются (можно писать пометки). +- Пустые строки и ячейки не учитываются. + +Outputs land in **records_folder/** as XLSX (+CSV/JSONL). Configure selectors/scroll in **config.yaml**. diff --git a/Parser_NEXT/categories.xlsx b/Parser_NEXT/categories.xlsx new file mode 100644 index 0000000..fbf50da Binary files /dev/null and b/Parser_NEXT/categories.xlsx differ diff --git a/Parser_NEXT/config.yaml b/Parser_NEXT/config.yaml new file mode 100644 index 0000000..507640c --- /dev/null +++ b/Parser_NEXT/config.yaml @@ -0,0 +1,53 @@ +base_url: "https://www.next.pl/en" +locale: "en-GB" +timezoneId: "Europe/Warsaw" + +# На время отладки удобно видеть браузер: +headless: false + +nav_timeout_ms: 60000 +wait_timeout_ms: 30000 +retries: 3 + +# Рейт-лимит можно настраивать при масштабировании +rate_limit_per_host_per_sec: 1.0 + +scroll: + # Старые параметры (используются в резервном auto_scroll и для пауз) + max_scrolls: 80 + pause_ms_between_scrolls_min: 300 + pause_ms_between_scrolls_max: 700 + stop_if_no_new_items_after: 8 + + # Новые параметры для auto_scroll_until_total + hard_max_scrolls: 2500 # предохранитель на максимум скроллов + wait_networkidle_timeout_ms: 8000 # ожидание networkidle после каждого скролла + +selectors: + # карточки товаров + product_tile: '[data-testid="plp-product-grid-item"], [data-testid="product-tile"], .ProductCard, [data-qa="plp-product"]' + product_link: 'a[href*="/style/"], a[href*="/p/"], a[data-testid="productLink"]' + product_name: '[data-testid="product-name"], .productName, [itemprop="name"]' + product_price: '[data-testid="price"], [itemprop="price"], .price' + + # признак готовности + grid_ready: 'script[id^="next-product-summary-script-"], [data-testid="plp-product-grid-item"], [data-testid="product-grid"], .plpGrid, [data-qa="plp-grid"]' + + # счётчик общего количества в шапке (например "(434)") + total_count: '#plp-seo-heading .esi-count, .esi-count' + +xhr_patterns: + - "/search" + - "/api/search" + - "/plp" + - "/productsummary" + +output: + folder: "records_folder" + excel_prefix: "next_dump" + csv_also: true + jsonl_also: true + +debug: + dump_always: false # true — чтобы писать дампы на каждом шаге + diff --git a/Parser_NEXT/fetcher.py b/Parser_NEXT/fetcher.py new file mode 100644 index 0000000..4753a2f --- /dev/null +++ b/Parser_NEXT/fetcher.py @@ -0,0 +1,636 @@ +import asyncio +import logging +import re +import json +import os +from datetime import datetime +from pathlib import Path +from typing import List, Dict, Any, Optional +import re +from playwright.async_api import async_playwright +from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type + + +# ---- Price parsing helpers ---- +_PLN_PRICE_RE = re.compile( + r'(? float | None: + """ + '1 299,00 zł' / '1299 zł' / '1 299 zł' -> 1299.00 + Возвращает None, если распарсить не удалось. + """ + if not price_text: + return None + t = ( + price_text + .replace("\u00a0", " ") # NBSP + .replace("\u2009", " ") # thin space + .strip() + ) + m = _PLN_PRICE_RE.search(t) + if not m: + return None + num = m.group(1) + num = num.replace(" ", "").replace("\u00a0", "").replace("\u2009", "") + num = num.replace(",", ".") + try: + return float(num) + except Exception: + return None + + +class FetchError(Exception): + pass + + +class Fetcher: + """ + Browser layer: Playwright Chromium with anti-bot hygiene and robust debug dumps. + - Blocks heavy resources (fonts/media/images), keeps stylesheets. + - Waits for either SSR summary scripts or window.ssrClientSettings. + - Two ways to read product summaries: + 1) window.ssrClientSettings.productSummary + 2) inline