# extractor.py · v 2.0 · 2025-07-24
from json import load, loads
from os.path import abspath
from bs4 import BeautifulSoup
from lxml import etree
import logging
import os
import sys

# ────────────────────────── configuration ──────────────────────────
DEL_SAME = "YES"  # "YES" → filter duplicate rows, "NO" → no filtering

_log_level = os.getenv("LOG_LEVEL", "INFO").upper()
logging.basicConfig(
    level=_log_level,
    stream=sys.stdout,
    format="%(asctime)s │ %(levelname)-5s │ %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("extractor")

# ────────────────────── Zara Home helpers ──────────────────────────
def extract_components_zarahome(parts):
    """Flatten a Zara Home compositionDetail "parts" payload into text lines."""
    comp = []
    for part in parts:
        if part.get("areas") and part.get("description"):
            # the part label is only worth printing when there are several parts
            if len(parts) != 1:
                comp.append(part["description"])
            for area in part["areas"]:
                comp.append(f"{area['description']} ({area['percentageArea']})")
                for c in area["components"]:
                    comp.append(f"{c['percentage']} {c['material']}")
        elif part.get("components") and part.get("description"):
            if len(parts) != 1:
                comp.append(part["description"])
            for c in part["components"]:
                comp.append(f"{c['percentage']} {c['material']}")
    return comp
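
# A worked example of the flattening above (hypothetical payload; the shape is
# inferred from the function itself, not from a live API response):
#
#   extract_components_zarahome([
#       {"description": "OUTER SHELL",
#        "areas": [{"description": "MAIN FABRIC", "percentageArea": "80%",
#                   "components": [{"percentage": "100%", "material": "cotton"}]}]},
#       {"description": "LINING",
#        "components": [{"percentage": "100%", "material": "polyester"}]},
#   ])
#
# would return:
#
#   ["OUTER SHELL", "MAIN FABRIC (80%)", "100% cotton",
#    "LINING", "100% polyester"]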

# ───────────────────── on-the-fly duplicate filter ─────────────────
def push_row_dedup(row, idx, seen, out):
    """Append row to out, following the DEL_SAME deduplication rules."""
    art, name, size, price, clr = (row[idx["Артикул"]],
                                   row[idx["Название товара или услуги"]],
                                   row[idx["Свойство: Размер"]],
                                   row[idx["Цена закупки"]],
                                   row[idx["Свойство: Цвет"]])
    base = (art, name, size, price, clr)

    # 1) first occurrence of this key -> keep the row
    if base not in seen:
        seen[base] = row
        out.append(row)
        return

    old = seen[base]
    pn_old, pn_new = old[idx["PartNumber"]], row[idx["PartNumber"]]
    vis_old, vis_new = old[idx["Наличие на сайте"]], row[idx["Наличие на сайте"]]

    # 2) identical PartNumber -> ignore the new row
    if pn_old == pn_new:
        return

    # 3) same visibility?
    if vis_old == vis_new:
        art4 = art[:4]
        pn4_old = pn_old[1:5] if len(pn_old) >= 5 else ""
        pn4_new = pn_new[1:5] if len(pn_new) >= 5 else ""

        # 4) keep only the row whose PartNumber fragment matches the article
        if art4 == pn4_new and art4 != pn4_old:
            # the new row is the better match -> replace the old one
            seen[base] = row
            out[out.index(old)] = row
        # if the old row matches and the new one does not -> ignore the new one
        return

    # 5) different visibility -> prefer SHOW
    if vis_new == "SHOW" and vis_old != "SHOW":
        seen[base] = row
        out[out.index(old)] = row
    # otherwise keep the old row (its SHOW is already stored), or drop the new
    # row when both are non-SHOW.
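
# A minimal illustration of rule 5 (hypothetical rows; only the columns the
# filter actually reads carry meaningful values):
#
#   idx = {"Артикул": 0, "Название товара или услуги": 1,
#          "Свойство: Размер": 2, "Цена закупки": 3, "Свойство: Цвет": 4,
#          "PartNumber": 5, "Наличие на сайте": 6}
#   seen, out = {}, []
#   push_row_dedup(["0123", "Mug", "M", 9.95, "white", "A0123X1", "HIDE"],
#                  idx, seen, out)
#   push_row_dedup(["0123", "Mug", "M", 9.95, "white", "A0123X2", "SHOW"],
#                  idx, seen, out)
#   # out now holds a single row: the SHOW one replaced the HIDE one.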


class Extractor:
    # ----------------------------------------------------------------
    def __init__(self, json_data):

        self.methods = {
            "": (self.default_extract_method, []),

            "zarahome": (self.zarahome_extract_method, [
                "Краткое описание",
                "Артикул",
                "SKU",
                "PartNumber",
                "Название товара или услуги",
                "Полное описание",
                "Образец цвета",
                "Свойство: Цвет",
                "Свойство: Размер",
                "Цена закупки",
                "Свойство: Вес(г)",
                "Наличие на сайте",
                "Изображения",
                "Изображения варианта",
                "Параметр: Состав",
                "Параметр: Уход",
                "Параметр: Происхождение",
                "Размещение на сайте",
                "Свойство: Бренд"
            ]),

            "zara": (self.zara_extract_method, []),
            "eobuwie": (self.eobuwie_extract_method, []),
            "decathlon": (self.decathlon_extract_method, []),
            "chanel": (self.chanel_extract_method, []),
        }

        self.method = json_data["method"]
        self.tags = json_data["tags"]
        self.headers = self.methods[self.method][1].copy()

        for tag in self.tags:
            self.headers.insert(tag["column_number"], tag["column_name"])
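
    # Expected shape of json_data, inferred from the reads above and from
    # tags_extract below; the real parse_settings.json ships separately, so
    # this is an assumption, not a verified sample:
    #
    #   {
    #       "method": "zarahome",
    #       "tags": [
    #           {"column_number": 0,
    #            "column_name": "Параметр: Заголовок",   # hypothetical column
    #            "xpath": "//h1"}                        # hypothetical XPath
    #       ]
    #   }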

    # ─────────────────────────── utilities ─────────────────────────
    def extract(self, parser, recorder, categories):
        self.methods[self.method][0](parser, recorder, categories)

    def default_extract_method(self, *a, **kw):
        log.info("Default extractor → nothing to do.")

    def tags_extract(self, soup, row):
        """Evaluate each configured XPath against the page and splice the
        joined text into row at the tag's column position."""
        dom = etree.HTML(str(soup))
        for tag in self.tags:
            res = dom.xpath(tag["xpath"])
            col = ""
            if res:
                for el in res:
                    col += ''.join(el.itertext()).strip() + "\n"
            row.insert(tag["column_number"], col)

    # ───── stubs ─────
    def zara_extract_method(self, *_, **__): log.info("ZARA extractor disabled.")
    def eobuwie_extract_method(self, *_, **__): log.info("Eobuwie extractor disabled.")
    def decathlon_extract_method(self, *_, **__): log.info("Decathlon extractor disabled.")
    def chanel_extract_method(self, *_, **__): log.info("Chanel extractor disabled.")

    # ─────────────────────── ZARA HOME ─────────────────────────────
    def zarahome_extract_method(self, parser, recorder, categories):

        BASE_API = "https://www.zarahome.com/itxrest/3/catalog/store/85009924/80290000"
        USER_BRAND = "ZARAHOME"

        def fetch_json(url):
            try:
                return parser.parse(url, return_type="json")
            except Exception as err:
                log.warning("Request Error: %s - %s", err, url)
                alt = url.replace("ieec2cihslb3-zarahome.central.inditex.grp",
                                  "www.zarahome.com")
                if alt != url:
                    log.info("→ retry via public host")
                    return parser.parse(alt, return_type="json")
            return None
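
        # fetch_json tries the URL as given and, on failure, retries once with
        # the internal Inditex host rewritten to the public one (a second
        # failure propagates); it returns None when the URL has no internal
        # host to rewrite. A sketch of a call (the product id is hypothetical):
        #
        #   data = fetch_json(f"{BASE_API}/productsArray?languageId=-1"
        #                     f"&productIds=12345678&appId=1")
        #   if data:  # None means both attempts failed
        #       ...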

        for c_idx, category in enumerate(categories, 1):
            log.info("Categories: %s / %s %s", c_idx, len(categories), category)
            # set up the dedup structures
            clean_rows = [self.headers]
            if DEL_SAME == "YES":
                idx_map = {h: i for i, h in enumerate(self.headers)}
                seen = {}

            # ── category HTML
            html = parser.parse(category)
            if html is None:
                log.warning("Extractor Error: empty page")
                continue
            soup = BeautifulSoup(html, "lxml")

            script = soup.select_one("#serverApp-state")
            if not script:
                log.warning("Extractor Error: script not found")
                continue
            state = loads(script.string)

            cat_key = next((k for k in state if "/category?" in k), None)
            if cat_key is None:
                log.warning("Extractor Error: no category key in state")
                continue
            cat_info = state[cat_key]
            ids = [str(p["id"]) for p in cat_info.get("products", [])]

            # Product summaries are gathered by one of three routes:
            #   (A) the category state lists products -> batched productsArray calls;
            #   (B) a "/product?" state entry carries bundles or product ids;
            #   (C) otherwise every subcategory page is fetched and mined.
            summaries = []
            # (A) via productIds
            if ids:
                CHUNK = 60  # batch size per productsArray request
                for p in range(0, len(ids), CHUNK):
                    api = (f"{BASE_API}/productsArray?languageId=-1&"
                           f"productIds={','.join(ids[p:p+CHUNK])}&appId=1")
                    data = fetch_json(api)
                    if data and "products" in data:
                        summaries += data["products"]
            else:
                # (B) via the "/product?" state entry
                prod_key = next((k for k in state if "/product?" in k), None)
                if prod_key and "products" in state[prod_key]:
                    for grp in state[prod_key]["products"]:
                        summaries += grp.get("bundleProductSummaries", [])
                elif prod_key and "productIds" in state[prod_key]:
                    ids = state[prod_key]["productIds"]
                    CHUNK = 60
                    for p in range(0, len(ids), CHUNK):
                        api = (f"{BASE_API}/productsArray?languageId=-1&"
                               f"productIds={','.join(map(str, ids[p:p+CHUNK]))}&appId=1")
                        data = fetch_json(api)
                        if data and "products" in data:
                            summaries += data["products"]
                else:
                    # (C) via the subcategory pages
                    subcats = cat_info.get("subcategories") or []
                    for sub in subcats:
                        sub_url = "https://www.zarahome.com/pl/en/" + sub["url"]
                        sub_html = parser.parse(sub_url)
                        if not sub_html:
                            continue
                        sub_script = (BeautifulSoup(sub_html, "lxml")
                                      .select_one("#serverApp-state"))
                        if not sub_script:
                            continue
                        sub_state = loads(sub_script.string)
                        sub_prod_key = next((k for k in sub_state if "/product?" in k), None)
                        if sub_prod_key and "products" in sub_state[sub_prod_key]:
                            for grp in sub_state[sub_prod_key]["products"]:
                                summaries += grp.get("bundleProductSummaries", [])

            seen_ids = set()
            for prod in summaries:
                prod_id = prod.get("id")
                short_url = prod.get("productUrl") or (
                    f"{prod['seo']['keyword']}-p{prod['seo']['seoProductId']}.html"
                    if prod.get("seo") else "")

                if not short_url or prod_id in seen_ids:
                    continue
                seen_ids.add(prod_id)

                variants = prod.get("bundleProductSummaries") or [prod]
                for vprod in variants:
                    det = vprod["detail"]

                    sec, fam, subfam = (vprod.get("sectionNameEN") or "",
                                        vprod.get("familyName") or "",
                                        vprod.get("subFamilyName") or "")
                    cat_path = "Каталог/ZaraHome/" + "/".join(p for p in (sec, fam, subfam) if p)

                    url_full = f"https://www.zarahome.com/pl/en/{vprod.get('productUrl','')}"
                    name = vprod.get("name", "")
                    article = det["displayReference"]
                    root_price = int(vprod.get("price", 0)) / 100  # price arrives in hundredths
                    root_wt = vprod.get("weight", "")

                    raw_xmedia = det.get("xmedia") or vprod.get("xmedia") or []
                    default_idx = det.get("xmediaDefaultSet")
                    if isinstance(raw_xmedia, list) and raw_xmedia:
                        media_sets = [raw_xmedia[default_idx]] if isinstance(default_idx, int) else raw_xmedia
                    elif isinstance(raw_xmedia, dict):
                        media_sets = [raw_xmedia]
                    else:
                        media_sets = []
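
                    # xmedia normalisation, as applied above: a list with a valid
                    # xmediaDefaultSet index collapses to that single set, a bare
                    # dict is wrapped in a list, and anything else means no media.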
                    all_imgs = [f"https://static.zarahome.net/8/photos4{loc['path']}/{m['idMedia']}2.jpg"
                                for loc in media_sets for m in loc["xmediaItems"][0]["medias"]]
                    all_imgs_s = "\n".join(all_imgs)

                    comp_txt = ""
                    if det.get("compositionDetail") and det["compositionDetail"].get("parts"):
                        comp_txt = "\n".join(
                            extract_components_zarahome(det["compositionDetail"]["parts"])
                        )
                    care = "\n".join(c["description"] for c in det.get("care", []))
                    trace = ""

                    # synthetic fallback colour so the size loop below always runs
                    colors = det.get("colors") or [{
                        "id": 0, "name": "DEFAULT", "image": {"url": ""},
                        "sizes": [{
                            "visibilityValue": "SHOW",
                            "name": "", "description": "",
                            "weight": root_wt, "price": vprod.get("price", 0)
                        }]
                    }]

                    for clr in colors:
                        clr_code = clr.get("id")
                        clr_name = clr.get("name", "")
                        clr_image = ""
                        if clr.get("image") and clr["image"].get("url"):
                            clr_image = f"https://static.zarahome.net/8/photos4{clr['image']['url']}_3_1_5.jpg"

                        clr_sets = [loc for loc in media_sets if loc.get("colorCode") == clr_code] or media_sets
                        clr_imgs = [f"https://static.zarahome.net/8/photos4{loc['path']}/{m['idMedia']}2.jpg"
                                    for loc in clr_sets for m in loc["xmediaItems"][0]["medias"]]
                        clr_imgs_s = "\n".join(clr_imgs)

                        for size in clr["sizes"]:
                            vis = size.get("visibilityValue", "UNKNOWN")
                            price = int(size.get("price") or vprod.get("price", 0)) / 100
                            weight = size.get("weight") or root_wt
                            size_name = size.get("name", "")
                            size_descr = size.get("description", "")
                            size_full = f"{size_descr} ({size_name})" if size_descr else size_name

                            sku_val = size.get("sku", "")
                            partnumber_val = size.get("partnumber", "")

                            country = size.get("country") or ""
                            trace_local = f"Made in {country}" if country else trace

                            # column order must match the "zarahome" headers list
                            row = [
                                url_full,
                                article,
                                sku_val,
                                partnumber_val,
                                name,
                                det.get("longDescription", ""),
                                clr_image,
                                clr_name,
                                size_full,
                                price,
                                weight,
                                vis,
                                all_imgs_s,
                                clr_imgs_s,
                                comp_txt,
                                care,
                                trace_local,
                                cat_path,
                                USER_BRAND
                            ]

                            if DEL_SAME == "YES":
                                push_row_dedup(row, idx_map, seen, clean_rows)
                            else:
                                clean_rows.append(row)

            csv_name = category.split("/")[-1]  # last path segment of the category URL
            recorder.record(csv_name, clean_rows)


# ───────────────────────────────────────────────────────────────────
def get_extractor():
    with open(abspath("parse_settings.json"), "r", encoding="utf-8") as fh:
        return Extractor(load(fh))
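

# A minimal usage sketch, assuming hypothetical parser/recorder objects that
# satisfy the interfaces this module calls: parser.parse(url, return_type=...)
# returning HTML text or a decoded JSON dict, and recorder.record(name, rows)
# persisting a list of rows. Neither class ships with this file.
#
#   if __name__ == "__main__":
#       from my_parser import Parser      # hypothetical module
#       from my_recorder import Recorder  # hypothetical module
#
#       extractor = get_extractor()
#       extractor.extract(Parser(), Recorder(),
#                         ["https://www.zarahome.com/pl/en/bedroom-n123"])  # hypothetical URL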