MacOS_Parsers/Parser_NEXT/main.py

import asyncio
import logging
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import List, Tuple

import pandas as pd
import yaml

from fetcher import Fetcher, FetchError
from sink import write_outputs, write_master_excel
from models import Product

# ---------- config / logging ----------
def setup_logging():
    """Log to out/logs/run.log and to the console simultaneously."""
    Path("out/logs").mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        handlers=[
            logging.FileHandler("out/logs/run.log", encoding="utf-8"),
            logging.StreamHandler(),
        ],
    )

def load_config() -> dict:
    with open("config.yaml", "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
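
# A minimal config.yaml sketch covering the keys this script reads below
# (the values shown are illustrative assumptions, not the project's actual
# settings):
#
#   output:
#     folder: out
#     excel_prefix: next_
#     csv_also: true
#     jsonl_also: true
#   pdp:
#     max_concurrency: 3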

# ---------- load categories from the first column (A) ----------
def load_categories() -> List[Tuple[str, str]]:
    """
    Reads categories.xlsx without a header row.
    Takes only the first column (A): links to categories.
    Derives the category name from the last segment of the URL.
    """
    from urllib.parse import urlparse

    xlsx = Path("categories.xlsx")
    if not xlsx.exists():
        # demo fallback when the file has not been created
        return [
            ("bathroom-accessories", "https://www.next.pl/en/shop/home/bathroom/bathroom-accessories"),
        ]
    df = pd.read_excel(xlsx, header=None)
    if df.shape[1] == 0:
        return []
    urls: List[str] = []
    for val in df.iloc[:, 0].tolist():
        if isinstance(val, str):
            u = val.strip()
        elif pd.notna(val):
            u = str(val).strip()
        else:
            continue
        if not u or not u.lower().startswith(("http://", "https://")):
            continue
        urls.append(u)

    def name_from_url(u: str) -> str:
        p = urlparse(u)
        parts = [s for s in p.path.split("/") if s]
        return parts[-1] if parts else p.netloc

    return [(name_from_url(u), u) for u in urls]
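
# Example: for "https://www.next.pl/en/shop/home/bathroom/bathroom-accessories"
# name_from_url() returns "bathroom-accessories", so the pair becomes
# ("bathroom-accessories", <url>), the same shape as the demo fallback above.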

# ---------- adapter: dict -> Product ----------
def normalize_to_models(collected: List[dict]) -> List[Product]:
    """
    Input: items from fetcher.collect_products():
        { id, title, url, price(float|None), currency('PLN'|...), color, description }
    Output: a list of Product (only the minimally required fields)
    """
    out: List[Product] = []
    for d in collected:
        price_val = d.get("price")
        price_str = f"{float(price_val):.2f}" if isinstance(price_val, (int, float)) else None
        out.append(Product(
            product_id=str(d.get("id")) if d.get("id") is not None else None,
            url=str(d.get("url")) if d.get("url") else None,
            name=d.get("title"),
            price=price_str,
            currency=(d.get("currency") or "PLN").upper(),
            color=d.get("color"),
            description=d.get("description"),
            image_urls=[],
            size_variants=[],
        ))
    return out
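
# Illustrative mapping (made-up values): the dict
#   {"id": 123, "title": "Towel", "price": 49.9, "currency": "pln"}
# becomes Product(product_id="123", name="Towel", price="49.90",
# currency="PLN", ...); image_urls and size_variants are left empty
# by this adapter.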

# ---------- main flow ----------
async def run_category(fetcher: Fetcher, cfg: dict, name: str, url: str):
    logging.info(f"Category start: {name} → {url}")
    try:
        await fetcher.load_category(url)
        # scroll until the full item count is reached (parsed from the "(N)" header)
        await fetcher.auto_scroll_until_total()
        # collect products (SSR + DOM) and enrich them with PDP details
        collected = await fetcher.collect_products()
        collected = await fetcher.enrich_with_pdp_details(
            collected,
            max_concurrency=cfg.get("pdp", {}).get("max_concurrency", 3),
        )
        products = normalize_to_models(collected)
        # save to xlsx/csv/jsonl
        path, n, rows = write_outputs(
            category_name=name,
            category_url=url,
            products=products,
            out_folder=cfg["output"]["folder"],
            excel_prefix=cfg["output"]["excel_prefix"],
            csv_also=cfg["output"].get("csv_also", True),
            jsonl_also=cfg["output"].get("jsonl_also", True),
        )
        logging.info(f"{name}: {n} products → {path}")
        return rows
    except FetchError as e:
        logging.error(f"Category failed: {name} → {e}")
        return []
    except Exception as e:
        logging.exception(f"Category crashed: {name} → {e}")
        return []

async def main_async():
    setup_logging()
    cfg = load_config()
    categories = load_categories()
    if not categories:
        logging.warning("categories.xlsx is empty; add links to the first column (no header row).")
        return
    # accumulator for the combined XLSX
    master_rows: List[dict] = []
    # combined file name: all_YYYYMMDD_HHMMSS_UTC+3.xlsx
    ts_utc_plus3 = (datetime.now(timezone.utc) + timedelta(hours=3)).strftime("%Y%m%d_%H%M%S")
    all_filename = f"all_{ts_utc_plus3}_UTC+3.xlsx"
    all_path = str(Path(cfg["output"]["folder"]) / all_filename)
    async with Fetcher(cfg) as fetcher:
        for name, url in categories:
            rows = await run_category(fetcher, cfg, name, url)
            master_rows.extend(rows)
    # once all categories are done, write the combined XLSX
    all_written_path, total = write_master_excel(all_path, master_rows)
    logging.info(f"◎ ALL: {total} products → {all_written_path}")

def main():
    asyncio.run(main_async())


if __name__ == "__main__":
    main()
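
# Typical invocation, assuming config.yaml and categories.xlsx sit next to
# this script:
#
#   python main.py
#
# Progress is logged to out/logs/run.log and mirrored to the console.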