116 lines
4.2 KiB
Python
116 lines
4.2 KiB
Python
from urllib.parse import urljoin
|
|
from bs4 import BeautifulSoup
|
|
from typing import List, Dict, Any
|
|
from models import Product
|
|
|
|
def parse_products_from_ssr(summaries: List[Dict[str, Any]]) -> List[Product]:
    """Build ``Product`` records from server-side-rendered product summaries.

    Each summary is read for ``id``, ``title``, ``baseUrl`` and
    ``currencyCode``; its nested ``colourway`` mapping supplies ``url``,
    ``title``, ``price`` / ``priceMarket`` and ``color``.

    Args:
        summaries: Summary dicts. ``None`` or an empty list yields ``[]``.
            Entries that are not dicts are skipped instead of raising.

    Returns:
        One ``Product`` per dict summary, in input order. Missing or falsy
        fields become ``None``; ``image_urls`` and ``size_variants`` are
        always empty lists.
    """
    out: List[Product] = []
    for summary in summaries or []:
        if not isinstance(summary, dict):
            # Every field below is read through ``summary.get``; a malformed
            # entry must not abort the whole batch.
            continue

        colourway = summary.get("colourway")
        if not isinstance(colourway, dict):
            # Absent, null, or an unexpected shape: fall back to summary-level
            # fields only.
            colourway = {}

        base = summary.get("baseUrl") or ""
        rel = colourway.get("url") or ""
        # The trailing "/" makes urljoin treat ``base`` as a directory, so a
        # relative ``rel`` is appended rather than replacing the last segment.
        url = urljoin(base + "/", rel) if rel else (base or None)

        name = summary.get("title") or colourway.get("title") or None
        price = colourway.get("price") or colourway.get("priceMarket") or None
        color = colourway.get("color") or None
        currency = summary.get("currencyCode") or None

        out.append(Product(
            product_id=summary.get("id") or None,
            url=url,
            name=name,
            price=str(price) if price is not None else None,
            currency=currency,
            image_urls=[],  # images are built later from imageCdnUrl + productImageUrlPart
            color=color,
            size_variants=[],  # usually empty for homeware; to be added later for fashion
        ))
    return out
|
|
|
|
def parse_products_from_dom(html: str, cfg: Dict[str, Any]) -> List[Product]:
    """Build ``Product`` records from product tiles found in an HTML page.

    Args:
        html: Page markup, parsed with BeautifulSoup using the ``lxml`` parser.
        cfg: Scraper configuration. ``cfg["selectors"]`` must provide the CSS
            selectors ``product_tile``, ``product_link``, ``product_name`` and
            ``product_price``. The optional ``cfg["base_url"]`` is prepended
            to root-relative links.

    Returns:
        One ``Product`` per tile that could be parsed, in document order.
        Only ``product_id``, ``url``, ``name`` and ``price`` are filled;
        ``currency`` and ``color`` are ``None`` and the list fields are empty.

    Raises:
        KeyError: If ``cfg`` lacks ``selectors`` or any required selector.
    """
    # Function-scope import so this block does not depend on the module's
    # import section being changed.
    import logging

    log = logging.getLogger(__name__)

    soup = BeautifulSoup(html, "lxml")
    selectors = cfg["selectors"]
    tiles = soup.select(selectors["product_tile"])

    # Resolve configuration once, outside the per-tile try block, so that a
    # missing selector surfaces as KeyError instead of silently dropping
    # every tile.
    link_sel = selectors["product_link"]
    name_sel = selectors["product_name"]
    price_sel = selectors["product_price"]
    # ``base_url`` may be present but null; treat that the same as absent.
    base_url = str(cfg.get("base_url") or "").rstrip("/")

    out: List[Product] = []
    for tile in tiles:
        try:
            link_el = tile.select_one(link_sel)
            name_el = tile.select_one(name_sel)
            price_el = tile.select_one(price_sel)

            url = link_el.get("href") if link_el is not None else None
            # Only root-relative paths get the base prepended. Protocol-
            # relative links ("//host/path") already name their host.
            if url and url.startswith("/") and not url.startswith("//"):
                url = base_url + url

            name = name_el.get_text(strip=True) if name_el is not None else None
            price = price_el.get_text(strip=True) if price_el is not None else None
            pid = tile.get("data-style-id") or tile.get("data-product-id") or None

            out.append(Product(
                product_id=pid,
                url=url,
                name=name,
                price=price,
                currency=None,
                image_urls=[],
                color=None,
                size_variants=[],
            ))
        except Exception:
            # Deliberate best-effort: one malformed tile must not discard the
            # rest of the page. Leave a trace for debugging.
            log.debug("Skipping product tile that failed to parse", exc_info=True)
            continue
    return out
|
|
|
|
def _xhr_candidates(payload: Any) -> List[Any]:
    """Return the product list carried by one decoded XHR payload, or ``[]``."""
    if isinstance(payload, dict):
        # The first of these keys that holds a list wins.
        for key in ("products", "items", "results", "hits"):
            value = payload.get(key)
            if isinstance(value, list):
                return value
        return []
    if isinstance(payload, list):
        return payload
    return []


def _xhr_price(product: Dict[str, Any]) -> tuple:
    """Return ``(price, currency)`` from the first usable price field."""
    currency = None
    for key in ("price", "currentPrice", "sellingPrice"):
        value = product.get(key)
        if isinstance(value, dict):
            currency = value.get("currency") or currency
            amount = value.get("value")
            if amount is None or amount == "":
                amount = value.get("amount")
            if amount is not None and amount != "":
                # Stop here so a later key cannot overwrite this price.
                return str(amount), currency
        elif (
            isinstance(value, (int, float, str))
            and not isinstance(value, bool)  # bool is an int subclass
            and value != ""
        ):
            return str(value), currency
    return None, currency


def _xhr_images(product: Dict[str, Any]) -> List[Any]:
    """Collect image URLs from the ``images`` / ``imageList`` / ``media`` lists."""
    images = []
    for key in ("images", "imageList", "media"):
        entries = product.get(key)
        if not isinstance(entries, list):
            continue
        for entry in entries:
            if isinstance(entry, str):
                images.append(entry)
            elif isinstance(entry, dict):
                # One URL per image entry: take the first populated field.
                for field in ("url", "src", "href"):
                    if entry.get(field):
                        images.append(entry[field])
                        break
    return images


def parse_products_from_xhr(xhrs: List[Dict[str, Any]]) -> List[Product]:
    """Build ``Product`` records from captured XHR responses.

    Each item is expected to carry its decoded body under ``"json"``. The body
    is either a list of product dicts, or a dict holding such a list under
    ``products``, ``items``, ``results`` or ``hits``.

    Args:
        xhrs: Captured responses. ``None`` or an empty list yields ``[]``.
            Items and product candidates that are not dicts are skipped.

    Returns:
        One ``Product`` per product dict, in encounter order. ``price`` is a
        string or ``None`` (never an empty string). ``currency`` is taken
        from a dict-shaped price field when one is present.
    """
    out: List[Product] = []
    for item in xhrs or []:
        if not isinstance(item, dict):
            continue
        for candidate in _xhr_candidates(item.get("json")):
            if not isinstance(candidate, dict):
                continue

            pid = str(
                candidate.get("id")
                or candidate.get("productId")
                or candidate.get("styleId")
                or ""
            ) or None
            price, currency = _xhr_price(candidate)

            # ``sizes`` may be absent or null; only string entries are kept.
            sizes = candidate.get("sizes")
            if isinstance(sizes, (list, tuple)):
                size_variants = [s for s in sizes if isinstance(s, str)]
            else:
                size_variants = []

            out.append(Product(
                product_id=pid,
                url=candidate.get("url") or candidate.get("link") or None,
                name=candidate.get("name") or candidate.get("productName") or None,
                price=price,
                currency=currency,
                image_urls=_xhr_images(candidate),
                color=candidate.get("color") or None,
                size_variants=size_variants,
            ))
    return out
|