MacOS_Parsers/Parser_NEXT/main.py
import asyncio
import logging
from pathlib import Path
from typing import List, Tuple
from datetime import timedelta
import pandas as pd
import yaml
from fetcher import Fetcher, FetchError
from sink import write_outputs
from models import Product

# ---------- config / logging ----------
def setup_logging():
    Path("out/logs").mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        handlers=[
            logging.FileHandler("out/logs/run.log", encoding="utf-8"),
            logging.StreamHandler(),
        ],
    )


def load_config() -> dict:
    with open("config.yaml", "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
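
# A minimal config.yaml sketch for reference, not the authoritative schema:
# only the "output" keys below are read in this module, and the values are
# placeholders. Anything the Fetcher itself expects from cfg is defined in
# fetcher.py and is not shown here.
#
#   output:
#     folder: out
#     excel_prefix: next_
#     csv_also: true
#     jsonl_also: true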


# ---------- load categories from the first column (A) ----------
def load_categories() -> List[Tuple[str, str]]:
    """
    Reads categories.xlsx without a header row.
    Uses only the first column (A), which holds category URLs.
    The category name is derived from the last segment of the URL path.
    """
    from urllib.parse import urlparse

    xlsx = Path("categories.xlsx")
    if not xlsx.exists():
        # demo fallback if the file has not been created
        return [
            ("bathroom-accessories", "https://www.next.pl/en/shop/home/bathroom/bathroom-accessories"),
        ]
    df = pd.read_excel(xlsx, header=None)
    if df.shape[1] == 0:
        return []
    urls: List[str] = []
    for val in df.iloc[:, 0].tolist():
        if isinstance(val, str):
            u = val.strip()
        elif pd.notna(val):
            u = str(val).strip()
        else:
            continue
        if not u or not u.lower().startswith(("http://", "https://")):
            continue
        urls.append(u)

    def name_from_url(u: str) -> str:
        p = urlparse(u)
        parts = [s for s in p.path.split("/") if s]
        return parts[-1] if parts else p.netloc

    return [(name_from_url(u), u) for u in urls]
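
# Example of the name derivation, using the demo URL above:
#   name_from_url("https://www.next.pl/en/shop/home/bathroom/bathroom-accessories")
#   -> "bathroom-accessories"
# (name_from_url is local to load_categories, so the call is illustrative only.)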


# ---------- adapter: dict -> Product ----------
def normalize_to_models(collected: List[dict]) -> List[Product]:
    out: List[Product] = []
    for d in collected:
        pid = d.get("id")
        url = d.get("url")
        title = d.get("title")
        price_val = d.get("price")  # float | None
        currency = (d.get("currency") or "PLN").upper()
        price_str = None
        if price_val is not None:
            try:
                price_str = f"{float(price_val):.2f}"
            except Exception:
                price_str = None
        out.append(Product(
            product_id=str(pid) if pid is not None else None,
            url=str(url) if url else None,
            name=title,
            price=price_str,
            currency=currency,
            image_urls=[],
            color=None,
            size_variants=[],
        ))
    return out
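
# Sketch of the expected mapping (keys come from the .get() calls above; the
# concrete values are illustrative, not taken from a real page):
#   {"id": 123, "url": "https://...", "title": "Soap dish", "price": 29, "currency": "pln"}
# becomes
#   Product(product_id="123", url="https://...", name="Soap dish", price="29.00",
#           currency="PLN", image_urls=[], color=None, size_variants=[])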


# ---------- main flow ----------
async def run_category(fetcher: Fetcher, cfg: dict, name: str, url: str):
    logging.info(f"Category start: {name} → {url}")
    try:
        await fetcher.load_category(url)
        # scroll until the full item count is reached (read from the "(N)" header)
        await fetcher.auto_scroll_until_total()
        # collect products (SSR + DOM)
        collected = await fetcher.collect_products()
        products = normalize_to_models(collected)
        # save to xlsx/csv/jsonl (write_outputs returns path, count and the row dicts)
        path, n, _rows = write_outputs(
            category_name=name,
            category_url=url,
            products=products,
            out_folder=cfg["output"]["folder"],
            excel_prefix=cfg["output"]["excel_prefix"],
            csv_also=cfg["output"].get("csv_also", True),
            jsonl_also=cfg["output"].get("jsonl_also", True),
        )
        logging.info(f"{name}: {n} products → {path}")
    except FetchError as e:
        logging.error(f"Category failed: {name} → {e}")
    except Exception as e:
        logging.exception(f"Category crashed: {name} → {e}")
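
# NOTE: run_category() mirrors the per-category steps that main_async() inlines
# below; it is kept as a standalone helper and is not called in this file.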


async def main_async():
    setup_logging()
    cfg = load_config()
    categories = load_categories()
    if not categories:
        logging.warning("categories.xlsx is empty: add category URLs to the first column (no header row).")
        return

    # accumulator for the combined XLSX
    master_rows: List[dict] = []

    # combined file name: all_YYYYMMDD_HHMMSS_UTC+3.xlsx
    now_utc = pd.Timestamp.utcnow().to_pydatetime()
    ts_utc_plus3 = (now_utc + timedelta(hours=3)).strftime("%Y%m%d_%H%M%S")
    all_filename = f"all_{ts_utc_plus3}_UTC+3.xlsx"
    all_path = str(Path(cfg["output"]["folder"]) / all_filename)

    async with Fetcher(cfg) as fetcher:
        for name, url in categories:
            # regular pass over one category
            try:
                logging.info(f"Category start: {name} → {url}")
                await fetcher.load_category(url)
                await fetcher.auto_scroll_until_total()
                collected = await fetcher.collect_products()
                products = normalize_to_models(collected)
                # per-category output
                path, n, rows = write_outputs(
                    category_name=name,
                    category_url=url,
                    products=products,
                    out_folder=cfg["output"]["folder"],
                    excel_prefix=cfg["output"]["excel_prefix"],
                    csv_also=cfg["output"].get("csv_also", True),
                    jsonl_also=cfg["output"].get("jsonl_also", True),
                )
                logging.info(f"{name}: {n} products → {path}")
                # accumulate into the combined list
                master_rows.extend(rows)
            except FetchError as e:
                logging.error(f"Category failed: {name} → {e}")
            except Exception as e:
                logging.exception(f"Category crashed: {name} → {e}")

    # after all categories are done, write the combined XLSX
    from sink import write_master_excel
    all_written_path, total = write_master_excel(all_path, master_rows)
    logging.info(f"◎ ALL: {total} products → {all_written_path}")


def main():
    asyncio.run(main_async())


if __name__ == "__main__":
    main()