637 lines
24 KiB
Python
637 lines
24 KiB
Python
import asyncio
import json
import logging
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any, Optional

from playwright.async_api import async_playwright
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
|
||
|
||
|
||
# ---- Price parsing helpers ----
|
||
_PLN_PRICE_RE = re.compile(
|
||
r'(?<!\d)(\d{1,3}(?:[ \u00A0]?\d{3})*(?:[.,]\d{2})?)(?:\s*(?:zł|PLN))',
|
||
re.IGNORECASE,
|
||
)
|
||
|
||
def parse_pln_price_to_float(price_text: str | None) -> float | None:
|
||
"""
|
||
'1 299,00 zł' / '1299 zł' / '1 299 zł' -> 1299.00
|
||
Возвращает None, если распарсить не удалось.
|
||
"""
|
||
if not price_text:
|
||
return None
|
||
t = (
|
||
price_text
|
||
.replace("\u00a0", " ") # NBSP
|
||
.replace("\u2009", " ") # thin space
|
||
.strip()
|
||
)
|
||
m = _PLN_PRICE_RE.search(t)
|
||
if not m:
|
||
return None
|
||
num = m.group(1)
|
||
num = num.replace(" ", "").replace("\u00a0", "").replace("\u2009", "")
|
||
num = num.replace(",", ".")
|
||
try:
|
||
return float(num)
|
||
except Exception:
|
||
return None
|
||
|
||
|
||
class FetchError(Exception):
    """Raised when a category page cannot be loaded; drives tenacity retries."""
|
||
|
||
|
||
class Fetcher:
|
||
"""
|
||
Browser layer: Playwright Chromium with anti-bot hygiene and robust debug dumps.
|
||
- Blocks heavy resources (fonts/media/images), keeps stylesheets.
|
||
- Waits for either SSR summary scripts or window.ssrClientSettings.
|
||
- Two ways to read product summaries:
|
||
1) window.ssrClientSettings.productSummary
|
||
2) inline <script id="next-product-summary-script-..."> content (fallback)
|
||
- Captures XHR JSON responses by patterns.
|
||
- Dumps HTML/PNG with timestamps at key checkpoints and on failure.
|
||
"""
|
||
|
||
def __init__(self, cfg: Dict[str, Any]):
|
||
self.cfg = cfg
|
||
self.base_url = cfg.get("base_url")
|
||
self.xhr_patterns = [re.compile(p) for p in cfg.get("xhr_patterns", [])]
|
||
self.collected_xhr: List[Dict[str, Any]] = []
|
||
|
||
    async def __aenter__(self):
        """Start Playwright, launch Chromium, and prepare a configured page.

        Installs resource blocking, the XHR response listener, and console
        log forwarding. Returns self for use as an async context manager.
        """
        self.playwright = await async_playwright().start()
        args = ["--disable-dev-shm-usage", "--no-sandbox"]
        self.browser = await self.playwright.chromium.launch(
            headless=self.cfg.get("headless", True),
            args=args,
            # Open devtools automatically only when running headful (debugging).
            devtools=not self.cfg.get("headless", True),
        )
        self.context = await self.browser.new_context(
            locale=self.cfg.get("locale", "en-GB"),
            timezone_id=self.cfg.get("timezoneId", "Europe/Warsaw"),
            # Fixed desktop-Chrome UA so the session looks like a normal browser.
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1366, "height": 900},
        )
        self.page = await self.context.new_page()

        # Block heavy resources; keep stylesheets.
        await self.context.route("**/*", self._route)

        # Listen to JSON XHRs for optional parsing.
        self.page.on("response", self._on_response)
        self.page.on("console", lambda msg: logging.debug(f"[page.console] {msg.type} {msg.text}"))
        return self
|
||
|
||
async def __aexit__(self, exc_type, exc, tb):
|
||
await self.context.close()
|
||
await self.browser.close()
|
||
await self.playwright.stop()
|
||
|
||
async def _route(self, route, request):
|
||
"""
|
||
Блокируем часть тяжёлых ресурсов.
|
||
Для отладки с картинками убери 'image' из списка.
|
||
"""
|
||
if request.resource_type in ["font", "media", "image"]:
|
||
return await route.abort()
|
||
return await route.continue_()
|
||
|
||
def _on_response(self, response):
|
||
try:
|
||
url = response.url
|
||
if any(p.search(url) for p in self.xhr_patterns):
|
||
if "application/json" in (response.headers.get("content-type", "")):
|
||
self.collected_xhr.append({"url": url, "response": response})
|
||
except Exception:
|
||
pass
|
||
|
||
async def _dump_debug(self, tag: str):
|
||
"""Save HTML and screenshot with timestamp; log absolute paths and CWD."""
|
||
try:
|
||
raw_dir = Path("out/raw_html").resolve()
|
||
raw_dir.mkdir(parents=True, exist_ok=True)
|
||
ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")
|
||
html_path = raw_dir / f"{ts}_{tag}.html"
|
||
png_path = raw_dir / f"{ts}_{tag}.png"
|
||
|
||
cwd = Path(os.getcwd()).resolve()
|
||
logging.info(f"[dump_debug] CWD={cwd} → html={html_path} png={png_path}")
|
||
|
||
try:
|
||
html = await self.page.content()
|
||
html_path.write_text(html, encoding="utf-8")
|
||
except Exception as e:
|
||
logging.warning(f"[dump_debug] writing HTML failed: {e}")
|
||
|
||
try:
|
||
await self.page.screenshot(path=str(png_path), full_page=True)
|
||
except Exception as e:
|
||
logging.warning(f"[dump_debug] screenshot failed: {e}")
|
||
|
||
logging.info(f"[dump_debug] saved OK: {html_path.name}, {png_path.name}")
|
||
except Exception as e:
|
||
logging.warning(f"[dump_debug] general fail: {e}")
|
||
|
||
async def _accept_cookies_if_any(self):
|
||
selectors = [
|
||
"#onetrust-accept-btn-handler",
|
||
"button#onetrust-accept-btn-handler",
|
||
'button:has-text("Accept all")',
|
||
'button:has-text("Accept All")',
|
||
'button[aria-label*="Accept"]',
|
||
]
|
||
for sel in selectors:
|
||
try:
|
||
el = self.page.locator(sel)
|
||
if await el.count() > 0:
|
||
await el.first.click(timeout=2000)
|
||
logging.info("Cookie banner accepted.")
|
||
break
|
||
except Exception:
|
||
pass
|
||
|
||
async def _log_plp_state(self, stage: str):
|
||
"""Log counts of SSR scripts and presence of window.ssrClientSettings."""
|
||
try:
|
||
scripts_count = await self.page.locator('script[id^="next-product-summary-script-"]').count()
|
||
except Exception:
|
||
scripts_count = -1
|
||
try:
|
||
has_window = await self.page.evaluate("""() => {
|
||
const ps = globalThis?.ssrClientSettings?.productSummary;
|
||
return !!(ps && Array.isArray(ps.itemNumbers) && ps.itemNumbers.length > 0);
|
||
}""")
|
||
except Exception:
|
||
has_window = False
|
||
logging.info(f"[{stage}] scripts: {scripts_count}, window.ps: {has_window}")
|
||
|
||
@retry(
|
||
stop=stop_after_attempt(3),
|
||
wait=wait_exponential(multiplier=1, min=1, max=8),
|
||
retry=retry_if_exception_type(FetchError),
|
||
)
|
||
async def load_category(self, url: str):
|
||
"""
|
||
Navigation + robust readiness:
|
||
1) domcontentloaded
|
||
2) accept cookies
|
||
3) warm-up scroll
|
||
4) wait for <script id^="next-product-summary-script-"> (attached)
|
||
5) attempt window.ssrClientSettings (non-fatal)
|
||
Dumps at key checkpoints and on failure.
|
||
"""
|
||
try:
|
||
await self.page.goto(
|
||
url,
|
||
timeout=self.cfg.get("nav_timeout_ms", 60000),
|
||
wait_until="domcontentloaded",
|
||
)
|
||
await self._dump_debug("after_goto")
|
||
|
||
await self._accept_cookies_if_any()
|
||
await self._dump_debug("after_cookies")
|
||
await self._log_plp_state("after_accept")
|
||
|
||
# warm-up scroll to trigger scripts/lazy
|
||
for _ in range(3):
|
||
await self.page.mouse.wheel(0, 1600)
|
||
await self.page.wait_for_timeout(300)
|
||
await self._dump_debug("after_warmup")
|
||
await self._log_plp_state("after_warmup")
|
||
|
||
# wait for SSR script tags
|
||
await self.page.wait_for_selector(
|
||
'script[id^="next-product-summary-script-"]',
|
||
state="attached",
|
||
timeout=self.cfg.get("wait_timeout_ms", 30000),
|
||
)
|
||
await self._dump_debug("after_scripts_present")
|
||
|
||
# optional window readiness
|
||
try:
|
||
await self.page.wait_for_function(
|
||
"""
|
||
() => {
|
||
const ps = globalThis?.ssrClientSettings?.productSummary;
|
||
return !!(ps && Array.isArray(ps.itemNumbers) && ps.itemNumbers.length > 0);
|
||
}
|
||
""",
|
||
timeout=5000,
|
||
)
|
||
except Exception:
|
||
logging.info("window.ssrClientSettings not ready (non-fatal).")
|
||
|
||
await self._dump_debug("after_window_check")
|
||
return True
|
||
|
||
except Exception as e:
|
||
logging.error(f"load_category failed: {e}")
|
||
await self._dump_debug("fail_load_category")
|
||
raise FetchError(str(e))
|
||
|
||
# ---------- NEW: read total count and scroll until target ----------
|
||
|
||
async def read_total_from_header(self) -> Optional[int]:
|
||
"""
|
||
Tries to read category total from the header count like '(434)'.
|
||
Looks in '#plp-seo-heading .esi-count' or any '.esi-count' fallback.
|
||
"""
|
||
sels = ["#plp-seo-heading .esi-count", ".esi-count"]
|
||
for sel in sels:
|
||
try:
|
||
el = self.page.locator(sel)
|
||
if await el.count() > 0:
|
||
txt = await el.first.inner_text(timeout=1500)
|
||
digits = "".join(ch for ch in txt if ch.isdigit())
|
||
if digits:
|
||
total = int(digits)
|
||
logging.info(f"Total from header: {total}")
|
||
return total
|
||
except Exception:
|
||
continue
|
||
logging.info("Total from header: not found")
|
||
return None
|
||
|
||
    async def auto_scroll_until_total(self, hard_max_scrolls: Optional[int] = None):
        """
        Scrolls until we reach target total (from header), with a hard cap.
        Uses networkidle + a small jiggle to retrigger lazy loading.

        hard_max_scrolls overrides cfg['scroll']['hard_max_scrolls'] (default 2000).
        Also stops early after cfg['scroll']['stop_if_no_new_items_after']
        (default 8) consecutive passes with no growth in the tile count.
        """
        hard_cap = hard_max_scrolls or self.cfg.get("scroll", {}).get("hard_max_scrolls", 2000)
        netidle_ms = self.cfg.get("scroll", {}).get("wait_networkidle_timeout_ms", 8000)
        # Combined product tile selector
        sel_tiles = '[data-testid="plp-product-grid-item"], [data-testid="product-tile"], .ProductCard, [data-qa="plp-product"]'

        # Target may be None when the header counter is missing; then only the
        # no-growth rule and the hard cap stop the loop.
        target = await self.read_total_from_header()
        last = 0        # tile count seen on the previous pass
        same_ticks = 0  # consecutive passes without growth
        same_limit = self.cfg.get("scroll", {}).get("stop_if_no_new_items_after", 8)

        for i in range(hard_cap):
            # Scroll to bottom
            try:
                await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            except Exception:
                pass

            # Wait for network idle
            try:
                await self.page.wait_for_load_state("networkidle", timeout=netidle_ms)
            except Exception:
                # not fatal
                await asyncio.sleep(0.25)

            # Jiggle to retrigger observers
            try:
                await self.page.mouse.wheel(0, -200)
                await asyncio.sleep(0.1)
                await self.page.mouse.wheel(0, 1200)
            except Exception:
                pass

            try:
                seen = await self.page.locator(sel_tiles).count()
            except Exception:
                # Count failed this pass; treat as "no change".
                seen = last

            if target and seen >= target:
                logging.info(f"Reached target: seen {seen}/{target} (i={i})")
                break

            if seen <= last:
                same_ticks += 1
                if same_ticks >= same_limit:
                    logging.info(f"No growth for a while: seen={seen}, i={i}")
                    break
            else:
                same_ticks = 0
            last = seen

        logging.info(f"Final seen items: {last} (target={target}, cap={hard_cap})")
|
||
|
||
# ---------- existing helpers ----------
|
||
|
||
    async def current_html(self) -> str:
        """Return the full serialized HTML of the current page."""
        return await self.page.content()
|
||
|
||
async def extract_xhr_json(self) -> List[Dict[str, Any]]:
|
||
results = []
|
||
for entry in self.collected_xhr:
|
||
try:
|
||
body = await entry["response"].json()
|
||
results.append({"url": entry["url"], "json": body})
|
||
except Exception:
|
||
pass
|
||
return results
|
||
|
||
async def read_ssr_product_summaries(self) -> List[Dict[str, Any]]:
|
||
"""
|
||
Returns simplified product summaries.
|
||
Path 1: window.ssrClientSettings.productSummary
|
||
Path 2: parse inline <script id="next-product-summary-script-..."> blocks
|
||
"""
|
||
# Path 1 — from window
|
||
js_window = """
|
||
() => {
|
||
const out = [];
|
||
const ps = globalThis?.ssrClientSettings?.productSummary;
|
||
if (!ps) return out;
|
||
const ids = Array.isArray(ps.itemNumbers) ? ps.itemNumbers : [];
|
||
for (const id of ids) {
|
||
const obj = ps[id];
|
||
if (!obj) continue;
|
||
const sd = obj?._STATE_?.productSummary?.summaryData;
|
||
if (!sd) continue;
|
||
const cw = Array.isArray(sd.colourways) && sd.colourways.length ? sd.colourways[0] : null;
|
||
out.push({
|
||
id: sd.id || null,
|
||
title: sd.title || null,
|
||
baseUrl: sd.baseUrl || null,
|
||
brand: sd.brand || null,
|
||
category: sd.category || null,
|
||
currencyCode: sd.currencyCode || null,
|
||
colourway: cw ? {
|
||
id: cw.id ?? null,
|
||
url: cw.url ?? null,
|
||
color: cw.c ?? null,
|
||
title: cw.t ?? null,
|
||
price: cw.p ?? null,
|
||
priceMarket: cw.mp ?? null,
|
||
selected: !!cw.s
|
||
} : null,
|
||
imageCdnUrl: sd.imageCdnUrl || null,
|
||
productImageUrlPart: sd.productImageUrlPart || null,
|
||
lgImagePath: sd.lgImagePath || null
|
||
});
|
||
}
|
||
return out;
|
||
}
|
||
"""
|
||
try:
|
||
w = await self.page.evaluate(js_window)
|
||
if isinstance(w, list) and w:
|
||
logging.info(f"SSR(window) summaries: {len(w)}")
|
||
return w
|
||
except Exception:
|
||
pass
|
||
|
||
# Path 2 — parse inline scripts
|
||
js_scripts = """
|
||
() => {
|
||
const list = Array.from(document.querySelectorAll('script[id^="next-product-summary-script-"]'));
|
||
return list.map(s => s.textContent || "");
|
||
}
|
||
"""
|
||
try:
|
||
texts = await self.page.evaluate(js_scripts)
|
||
except Exception:
|
||
return []
|
||
|
||
out: List[Dict[str, Any]] = []
|
||
# productSummary["ID"] = { ... } OR productSummary['ID'] = { ... }
|
||
assign_re = re.compile(r'productSummary\s*\[\s*([\'"])(.*?)\1\s*\]\s*=\s*\{')
|
||
for t in texts or []:
|
||
for m in assign_re.finditer(t):
|
||
start = m.end() - 1 # at '{'
|
||
depth = 0
|
||
end = None
|
||
for i in range(start, len(t)):
|
||
ch = t[i]
|
||
if ch == "{":
|
||
depth += 1
|
||
elif ch == "}":
|
||
depth -= 1
|
||
if depth == 0:
|
||
end = i + 1
|
||
break
|
||
if end is None:
|
||
continue
|
||
block = t[start:end]
|
||
try:
|
||
data = json.loads(block)
|
||
sd = (
|
||
data.get("_STATE_", {})
|
||
.get("productSummary", {})
|
||
.get("summaryData", {})
|
||
)
|
||
cws = sd.get("colourways") or []
|
||
cw = cws[0] if cws else None
|
||
out.append(
|
||
{
|
||
"id": sd.get("id"),
|
||
"title": sd.get("title"),
|
||
"baseUrl": sd.get("baseUrl"),
|
||
"brand": sd.get("brand"),
|
||
"category": sd.get("category"),
|
||
"currencyCode": sd.get("currencyCode"),
|
||
"colourway": {
|
||
"id": cw.get("id"),
|
||
"url": cw.get("url"),
|
||
"color": cw.get("c"),
|
||
"title": cw.get("t"),
|
||
"price": cw.get("p"),
|
||
"priceMarket": cw.get("mp"),
|
||
"selected": bool(cw.get("s")),
|
||
} if cw else None,
|
||
"imageCdnUrl": sd.get("imageCdnUrl"),
|
||
"productImageUrlPart": sd.get("productImageUrlPart"),
|
||
"lgImagePath": sd.get("lgImagePath"),
|
||
}
|
||
)
|
||
except Exception:
|
||
continue
|
||
    async def read_dom_products(self) -> List[Dict[str, Any]]:
        """
        Parse product cards from the DOM after scrolling.

        Covers several variants of the Next PLP markup; each record carries
        id, title, url, price_text and currency (deduplicated by id|url).
        """
        js = r"""
        () => {
            const out = [];
            const gridItems = document.querySelectorAll('[data-testid="plp-product-grid-item"], .ProductCard, [data-qa="plp-product"]');

            const getPid = (container) => {
                // Вариант 1: data-pid на entrypoint
                const entry = container.querySelector('[id^="plp-product-summary-entrypoint-"]');
                if (entry && entry.getAttribute('data-pid')) return entry.getAttribute('data-pid');
                // Вариант 2: id="plp-product-summary-tile-<ID>"
                const tile = container.closest('[id^="plp-product-summary-tile-"]') || container.querySelector('[id^="plp-product-summary-tile-"]');
                if (tile) {
                    const m = (tile.id || '').match(/plp-product-summary-tile-([A-Za-z0-9]+)/);
                    if (m) return m[1];
                }
                // Вариант 3: вытащим из href вида .../<ID>#<ID> или .../T43162
                const a = container.querySelector('a[href*="/style/"], a[data-testid^="product_summary_tile_"], a[href*="/p/"]');
                if (a) {
                    const href = a.getAttribute('href') || '';
                    const m2 = href.match(/([A-Z]\d{4,})/i);
                    if (m2) return m2[1].toUpperCase();
                }
                return null;
            };

            const getAbsUrl = (href) => {
                try {
                    if (!href) return null;
                    if (/^https?:\/\//i.test(href)) return href;
                    const a = document.createElement('a');
                    a.href = href;
                    return a.href;
                } catch { return href || null; }
            };

            const getTitle = (container) => {
                const t1 = container.querySelector('[data-testid="product_summary_title"]');
                if (t1) return (t1.getAttribute('data-label') || t1.textContent || '').trim();
                const t2 = container.querySelector('[data-testid="product-name"], .productName, [itemprop="name"]');
                if (t2) return (t2.textContent || '').trim();
                return null;
            };

            const getPriceText = (container) => {
                // охватим несколько вариантов
                const priceRoots = [
                    container.querySelector('[data-testid="price"]'),
                    container.querySelector('[data-testid="ProductCard-Price"]'),
                    container.querySelector('[itemprop="price"]'),
                    container.querySelector('[aria-label*="price" i]'),
                    container
                ].filter(Boolean);

                for (const root of priceRoots) {
                    const spans = root.querySelectorAll('span, div');
                    for (const el of spans) {
                        const t = (el.textContent || '').trim();
                        if (!t) continue;
                        if (/\d/.test(t) && (t.includes('zł') || /PLN/i.test(t))) {
                            return t;
                        }
                    }
                }
                return null;
            };

            gridItems.forEach(container => {
                // Основная ссылка
                const link = container.querySelector('a[href*="/style/"], a[data-testid^="product_summary_tile_"], a[href*="/p/"]');
                const href = link ? link.getAttribute('href') : null;

                const rec = {
                    id: getPid(container),
                    title: getTitle(container),
                    url: getAbsUrl(href),
                    price_text: getPriceText(container),
                    currency: null
                };

                if (rec.price_text) {
                    if (rec.price_text.includes('zł') || /PLN/i.test(rec.price_text)) rec.currency = 'PLN';
                }

                // фильтруем пустые карточки без ссылки и заголовка
                if (rec.url || rec.title) out.push(rec);
            });

            // Удаляем дубли по id|url
            const seen = new Set();
            const uniq = [];
            for (const d of out) {
                const key = `${d.id || ''}|${d.url || ''}`;
                if (seen.has(key)) continue;
                seen.add(key);
                uniq.push(d);
            }
            return uniq;
        }
        """
        try:
            data = await self.page.evaluate(js)
            logging.info(f"DOM cards parsed: {len(data)}")
            return data
        except Exception as e:
            logging.warning(f"read_dom_products failed: {e}")
            return []
|
||
|
||
async def collect_products(self) -> List[Dict[str, Any]]:
|
||
"""
|
||
Унифицированный сбор: SSR (если есть) + DOM.
|
||
Нормализуем к: id, title, url, price(float|None), currency('PLN'|...).
|
||
"""
|
||
ssr = await self.read_ssr_product_summaries() or []
|
||
dom = await self.read_dom_products() or []
|
||
|
||
bykey: Dict[str, Dict[str, Any]] = {}
|
||
|
||
def key(d: Dict[str, Any]) -> str:
|
||
return f"{(d.get('id') or '')}|{(d.get('url') or '')}"
|
||
|
||
# 1) Скелет из DOM
|
||
for d in dom:
|
||
bykey[key(d)] = {
|
||
"id": d.get("id"),
|
||
"title": d.get("title"),
|
||
"url": d.get("url"),
|
||
"price_text": d.get("price_text"),
|
||
"currency": d.get("currency"),
|
||
}
|
||
|
||
# 2) Обогащаем из SSR (если есть)
|
||
for s in ssr:
|
||
cw = (s.get("colourway") or {})
|
||
# собрать абсолютный URL
|
||
url = None
|
||
try:
|
||
base = (s.get("baseUrl") or "").rstrip("/")
|
||
rel = (cw.get("url") or "").lstrip("/")
|
||
url = f"{base}/{rel}" if (base and rel) else None
|
||
except Exception:
|
||
pass
|
||
|
||
cand = {"id": s.get("id"), "url": url}
|
||
k = key(cand)
|
||
rec = bykey.get(k)
|
||
if rec is None:
|
||
bykey[k] = {
|
||
"id": s.get("id"),
|
||
"title": s.get("title"),
|
||
"url": url,
|
||
"price_text": cw.get("price"),
|
||
"currency": s.get("currencyCode"),
|
||
}
|
||
else:
|
||
if not rec.get("title") and s.get("title"):
|
||
rec["title"] = s["title"]
|
||
if not rec.get("price_text") and cw.get("price"):
|
||
rec["price_text"] = cw["price"]
|
||
if not rec.get("currency") and s.get("currencyCode"):
|
||
rec["currency"] = s["currencyCode"]
|
||
|
||
# 3) Финальная нормализация цены
|
||
out: List[Dict[str, Any]] = []
|
||
for v in bykey.values():
|
||
price_val = parse_pln_price_to_float(v.get("price_text"))
|
||
currency = v.get("currency")
|
||
if not currency and (v.get("price_text") or "").lower().find("zł") != -1:
|
||
currency = "PLN"
|
||
out.append({
|
||
"id": v.get("id"),
|
||
"title": v.get("title"),
|
||
"url": v.get("url"),
|
||
"price": price_val, # float или None
|
||
"currency": currency or "PLN"
|
||
})
|
||
|
||
logging.info(f"Total collected (SSR+DOM): {len(out)}")
|
||
return out
|
||
|