MacOS_Parsers/Parser_NEXT/parser.py
2025-08-24 15:11:06 +03:00

116 lines
4.2 KiB
Python

from urllib.parse import urljoin
from bs4 import BeautifulSoup
from typing import List, Dict, Any
from models import Product
def parse_products_from_ssr(summaries: List[Dict[str, Any]]) -> List[Product]:
    """Build ``Product`` records from server-side-rendered product summaries.

    Each summary is read as a dict with the optional keys ``id``, ``title``,
    ``baseUrl`` and ``currencyCode`` plus a nested ``colourway`` dict from
    which ``url``, ``title``, ``price``, ``priceMarket`` and ``color`` are
    taken.

    Args:
        summaries: Summary dicts. ``None`` or an empty list yields ``[]``.
            Entries that are not dicts are skipped instead of raising.

    Returns:
        One ``Product`` per dict entry, in input order.
    """
    out: List[Product] = []
    for s in summaries or []:
        if not isinstance(s, dict):
            # Nothing can be extracted from a malformed entry.
            continue

        cw = s.get("colourway")
        if not isinstance(cw, dict):
            # Missing or unexpectedly shaped colourway: fall back to the
            # summary-level fields only.
            cw = {}

        base = s.get("baseUrl") or ""
        rel = cw.get("url") or ""
        # The trailing slash makes urljoin treat the base as a directory, so
        # a relative colourway URL is appended rather than replacing the last
        # path segment.
        url = urljoin(base + "/", rel) if rel else (base or None)

        price = cw.get("price") or cw.get("priceMarket") or None

        out.append(Product(
            product_id=s.get("id") or None,
            url=url,
            name=s.get("title") or cw.get("title") or None,
            price=str(price) if price is not None else None,
            currency=s.get("currencyCode") or None,
            image_urls=[],  # images are built later from imageCdnUrl + productImageUrlPart
            color=cw.get("color") or None,
            size_variants=[],  # usually empty for homeware; fashion sizes are added later
        ))
    return out
def parse_products_from_dom(html: str, cfg: Dict[str, Any]) -> List[Product]:
    """Extract ``Product`` records from product tiles in rendered HTML.

    Args:
        html: Page markup, parsed with BeautifulSoup's ``lxml`` parser.
        cfg: Configuration dict. ``cfg["selectors"]`` must provide the CSS
            selectors ``product_tile``, ``product_link``, ``product_name``
            and ``product_price``. The optional ``cfg["base_url"]`` is
            prefixed to site-relative links (those starting with one ``/``).

    Returns:
        One ``Product`` per tile that could be parsed. A tile that raises
        while being processed is skipped (and logged at debug level) so one
        bad tile does not abort the page.

    Raises:
        KeyError: If ``cfg`` has no ``"selectors"`` entry or the selectors
            have no ``"product_tile"`` entry.
    """
    # Local import: keeps this block self-contained without touching the
    # module's import section.
    import logging

    log = logging.getLogger(__name__)

    soup = BeautifulSoup(html, "lxml")
    sel = cfg["selectors"]
    # ``or ""`` also covers a base_url key that is present but set to None.
    base_url = cfg.get("base_url") or ""

    out: List[Product] = []
    for tile in soup.select(sel["product_tile"]):
        try:
            link_el = tile.select_one(sel["product_link"])
            name_el = tile.select_one(sel["product_name"])
            price_el = tile.select_one(sel["product_price"])

            url = link_el.get("href") if link_el else None
            # Only site-relative paths get the base prefix. Protocol-relative
            # links ("//host/path") already name their host and would be
            # corrupted by prefixing.
            if url and url.startswith("/") and not url.startswith("//"):
                url = base_url.rstrip("/") + url

            out.append(Product(
                product_id=(
                    tile.get("data-style-id")
                    or tile.get("data-product-id")
                    or None
                ),
                url=url,
                name=name_el.get_text(strip=True) if name_el else None,
                price=price_el.get_text(strip=True) if price_el else None,
                currency=None,
                image_urls=[],
                color=None,
                size_variants=[],
            ))
        except Exception:
            # Best-effort parsing: keep going, but leave a trace so dropped
            # tiles can be diagnosed.
            log.debug("Skipping product tile that failed to parse", exc_info=True)
            continue
    return out
def _xhr_candidates(payload: Any) -> List[Any]:
    """Return the list of product-like entries found in one JSON payload."""
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict):
        # The first key that holds a list wins, even if that list is empty.
        for key in ("products", "items", "results", "hits"):
            value = payload.get(key)
            if isinstance(value, list):
                return value
    return []


def _xhr_price(p: Dict[str, Any]) -> tuple:
    """Return ``(price, currency)`` from the first usable price field of *p*."""
    currency = None
    for key in ("price", "currentPrice", "sellingPrice"):
        value = p.get(key)
        if isinstance(value, dict):
            currency = value.get("currency") or currency
            amount = value.get("value")
            if amount is None:
                amount = value.get("amount")
            value = amount
        # An empty string or a non-scalar carries no price; try the next key.
        if isinstance(value, (int, float, str)) and value != "":
            return str(value), currency
    return None, currency


def _xhr_images(p: Dict[str, Any]) -> List[str]:
    """Collect distinct image URLs from *p*, preserving first-seen order."""
    images: List[str] = []
    seen = set()
    for key in ("images", "imageList", "media"):
        entries = p.get(key)
        if not isinstance(entries, list):
            continue
        for entry in entries:
            if isinstance(entry, str):
                found = [entry]
            elif isinstance(entry, dict):
                found = [entry.get(k) for k in ("url", "src", "href")]
            else:
                continue
            for link in found:
                # An image dict often repeats the same URL under several
                # keys; keep each URL once.
                if isinstance(link, str) and link and link not in seen:
                    seen.add(link)
                    images.append(link)
    return images


def parse_products_from_xhr(xhrs: List[Dict[str, Any]]) -> List[Product]:
    """Build ``Product`` records from captured XHR responses.

    Each item is read as a dict whose ``"json"`` entry is either a list of
    product dicts or a dict holding such a list under ``products``,
    ``items``, ``results`` or ``hits`` (first match wins).

    For every product dict:

    * the id comes from ``id`` / ``productId`` / ``styleId``;
    * the URL from ``url`` / ``link``;
    * the name from ``name`` / ``productName``;
    * the price from the first usable value among ``price`` /
      ``currentPrice`` / ``sellingPrice`` (a scalar, or a dict with
      ``value`` / ``amount`` and an optional ``currency``);
    * images from ``images`` / ``imageList`` / ``media``;
    * sizes from the string members of ``sizes``.

    Args:
        xhrs: Captured responses. ``None`` or an empty list yields ``[]``.
            Items and candidates that are not dicts are skipped.

    Returns:
        One ``Product`` per product dict found, in encounter order.
    """
    out: List[Product] = []
    for item in xhrs or []:
        if not isinstance(item, dict):
            continue
        for p in _xhr_candidates(item.get("json")):
            if not isinstance(p, dict):
                continue

            pid = str(
                p.get("id") or p.get("productId") or p.get("styleId") or ""
            ) or None
            price, currency = _xhr_price(p)
            sizes = p.get("sizes")

            out.append(Product(
                product_id=pid,
                url=p.get("url") or p.get("link") or None,
                name=p.get("name") or p.get("productName") or None,
                price=price,
                currency=currency,
                image_urls=_xhr_images(p),
                color=p.get("color") or None,
                # A missing, null or non-list "sizes" yields no variants.
                size_variants=(
                    [s for s in sizes if isinstance(s, str)]
                    if isinstance(sizes, list)
                    else []
                ),
            ))
    return out