import asyncio
import logging
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import List, Tuple
from urllib.parse import urlparse

import pandas as pd
import yaml

from fetcher import Fetcher, FetchError
from models import Product
from sink import write_master_excel, write_outputs

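# fetcher, sink and models are project-local modules; the call sites below
# assume these contracts (inferred from how they are used in this file):
#   Fetcher(cfg)                   - async context manager driving the page
#   write_outputs(...)             - returns (path, count, rows)
#   write_master_excel(path, rows) - returns (written_path, total_count)
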
# ---------- config / logging ----------

def setup_logging():
    """Send INFO-level logs both to out/logs/run.log and to the console."""
    Path("out/logs").mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        handlers=[
            logging.FileHandler("out/logs/run.log", encoding="utf-8"),
            logging.StreamHandler(),
        ],
    )


def load_config() -> dict:
    """Load settings from config.yaml in the working directory."""
    with open("config.yaml", "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

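
# Expected config.yaml shape -- a minimal sketch inferred from the keys this
# script reads; values are illustrative, and Fetcher receives the whole dict,
# so it may require extra settings of its own:
#
#   output:
#     folder: out
#     excel_prefix: products_
#     csv_also: true
#     jsonl_also: true
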
# ---------- load categories from column A ----------

def load_categories() -> List[Tuple[str, str]]:
    """
    Read categories.xlsx without a header row.
    Use only the first column (A), which holds category URLs.
    Derive each category name from the last segment of the URL.
    """
    xlsx = Path("categories.xlsx")
    if not xlsx.exists():
        # demo fallback when the file has not been created yet
        return [
            ("bathroom-accessories", "https://www.next.pl/en/shop/home/bathroom/bathroom-accessories"),
        ]

    df = pd.read_excel(xlsx, header=None)
    if df.shape[1] == 0:
        return []

    urls: List[str] = []
    for val in df.iloc[:, 0].tolist():
        if isinstance(val, str):
            u = val.strip()
        elif pd.notna(val):
            u = str(val).strip()
        else:
            continue
        if not u or not u.lower().startswith(("http://", "https://")):
            continue
        urls.append(u)

    def name_from_url(u: str) -> str:
        p = urlparse(u)
        parts = [s for s in p.path.split("/") if s]
        return parts[-1] if parts else p.netloc

    return [(name_from_url(u), u) for u in urls]

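
# Example (illustrative):
#   name_from_url("https://www.next.pl/en/shop/home/bathroom/bathroom-accessories")
#   -> "bathroom-accessories"
#   a URL with an empty path falls back to its host, e.g. "www.next.pl"
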
# ---------- adapter: dict -> Product ----------

def normalize_to_models(collected: List[dict]) -> List[Product]:
    """Convert raw scraped dicts into Product models with normalized fields."""
    out: List[Product] = []
    for d in collected:
        pid = d.get("id")
        url = d.get("url")
        title = d.get("title")
        price_val = d.get("price")  # float | None
        currency = (d.get("currency") or "PLN").upper()

        # Format the price as a fixed two-decimal string; anything that cannot
        # be parsed as a number stays None instead of aborting the run.
        price_str = None
        if price_val is not None:
            try:
                price_str = f"{float(price_val):.2f}"
            except (TypeError, ValueError):
                price_str = None

        out.append(Product(
            product_id=str(pid) if pid is not None else None,
            url=str(url) if url else None,
            name=title,
            price=price_str,
            currency=currency,
            image_urls=[],
            color=None,
            size_variants=[],
        ))
    return out

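
# Example mapping (illustrative values):
#   {"id": 123, "title": "Towel", "price": 59, "currency": "pln"}
#   -> Product(product_id="123", name="Towel", price="59.00", currency="PLN",
#              url=None, image_urls=[], color=None, size_variants=[])
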
# ---------- main flow ----------

async def run_category(fetcher: Fetcher, cfg: dict, name: str, url: str):
    """Scrape one category and write its outputs (standalone helper)."""
    logging.info(f"Category start: {name} — {url}")
    try:
        await fetcher.load_category(url)
        # keep scrolling until the total count from the page header "(N)" is loaded
        await fetcher.auto_scroll_until_total()

        # collect products (SSR payload + DOM)
        collected = await fetcher.collect_products()
        products = normalize_to_models(collected)

        # write xlsx/csv/jsonl; write_outputs returns (path, count, rows),
        # and the rows are unused in this helper
        path, n, _rows = write_outputs(
            category_name=name,
            category_url=url,
            products=products,
            out_folder=cfg["output"]["folder"],
            excel_prefix=cfg["output"]["excel_prefix"],
            csv_also=cfg["output"].get("csv_also", True),
            jsonl_also=cfg["output"].get("jsonl_also", True),
        )
        logging.info(f"✔ {name}: {n} products → {path}")

    except FetchError as e:
        logging.error(f"Category failed: {name} — {e}")
    except Exception as e:
        logging.exception(f"Category crashed: {name} — {e}")

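
# Hypothetical usage of the helper above (main_async does not call it, since
# its inline loop also needs the per-category rows for the combined workbook):
#
#   async with Fetcher(cfg) as fetcher:
#       await run_category(fetcher, cfg, "bathroom-accessories", url)
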
async def main_async():
    setup_logging()
    cfg = load_config()
    categories = load_categories()
    if not categories:
        logging.warning("categories.xlsx is empty: add category links to the first column (no header row).")
        return

    # accumulator for the combined XLSX
    master_rows: List[dict] = []

    # combined file name: all_YYYYMMDD_HHMMSS_UTC+3.xlsx
    # (datetime.now(timezone.utc) replaces the deprecated pd.Timestamp.utcnow())
    now_utc = datetime.now(timezone.utc)
    ts_utc_plus3 = (now_utc + timedelta(hours=3)).strftime("%Y%m%d_%H%M%S")
    all_filename = f"all_{ts_utc_plus3}_UTC+3.xlsx"
    all_path = str(Path(cfg["output"]["folder"]) / all_filename)

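    # Example (illustrative): at 12:00:00 UTC on 2024-01-02 this yields
    # "out/all_20240102_150000_UTC+3.xlsx" (assuming output.folder is "out").
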
    async with Fetcher(cfg) as fetcher:
        for name, url in categories:
            # regular pass over one category
            try:
                logging.info(f"Category start: {name} — {url}")
                await fetcher.load_category(url)
                await fetcher.auto_scroll_until_total()

                collected = await fetcher.collect_products()
                products = normalize_to_models(collected)

                # per-category outputs
                path, n, rows = write_outputs(
                    category_name=name,
                    category_url=url,
                    products=products,
                    out_folder=cfg["output"]["folder"],
                    excel_prefix=cfg["output"]["excel_prefix"],
                    csv_also=cfg["output"].get("csv_also", True),
                    jsonl_also=cfg["output"].get("jsonl_also", True),
                )
                logging.info(f"✔ {name}: {n} products → {path}")

                # accumulate rows for the combined workbook
                master_rows.extend(rows)

            except FetchError as e:
                logging.error(f"Category failed: {name} — {e}")
            except Exception as e:
                logging.exception(f"Category crashed: {name} — {e}")

    # once all categories are done, write the combined XLSX
    all_written_path, total = write_master_excel(all_path, master_rows)
    logging.info(f"◎ ALL: {total} products → {all_written_path}")


def main():
    asyncio.run(main_async())


if __name__ == "__main__":
    main()