# extractor.py · v 2.0 · 2025-07-23
from json import load, loads
from os.path import abspath
from bs4 import BeautifulSoup
from lxml import etree
import logging, os, sys

# enable / disable the duplicate filter
DEL_SAME = "YES"   # "YES" → filter active, "NO" → write everything as is

# ────────────────────── logging setup ──────────────────────────────
_log_level = os.getenv("LOG_LEVEL", "INFO").upper()
logging.basicConfig(
    level=_log_level,
    stream=sys.stdout,
    format="%(asctime)s │ %(levelname)-5s │ %(message)s",
    datefmt="%H:%M:%S"
)
log = logging.getLogger("extractor")

# ────────────────────── helper functions ───────────────────────────
def extract_components_zarahome(parts):
    comp = []
    for part in parts:
        if part.get("areas") and part.get("description"):
            if len(parts) != 1:
                comp.append(part["description"])
            for area in part["areas"]:
                comp.append(f"{area['description']} ({area['percentageArea']})")
                for c in area["components"]:
                    comp.append(f"{c['percentage']} {c['material']}")
        elif part.get("components") and part.get("description"):
            if len(parts) != 1:
                comp.append(part["description"])
            for c in part["components"]:
                comp.append(f"{c['percentage']} {c['material']}")
    return comp
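
# Illustrative sketch of the input this helper expects (an assumed shape,
# inferred from how the fields are read above, not from official API docs):
#   parts = [{"description": "LINING",
#             "components": [{"percentage": "100%", "material": "COTTON"}]}]
# would yield ["100% COTTON"]; the part description is only prepended when
# the product has more than one part.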

# ────────────────────────────────────────────────────────────────────
# ────────────────── filter for "identical" products ────────────────
def filter_duplicates(table, headers):
    """Drops rows according to the DEL_SAME rules. table[0] is the header."""
    if DEL_SAME != "YES" or len(table) <= 2:
        return table

    # indices of the columns we need
    idx = {h: i for i, h in enumerate(headers)}
    art_i = idx["Артикул"]
    name_i = idx["Название товара или услуги"]
    size_i = idx["Свойство: Размер"]
    price_i = idx["Цена закупки"]
    clr_i = idx["Свойство: Цвет"]
    pn_i = idx["PartNumber"]
    vis_i = idx["Наличие на сайте"]

    keep_rows = [table[0]]          # keep the header row
    groups = {}

    # ── group by the 5 base fields ───────────────────────────────────
    for row in table[1:]:
        key = (row[art_i], row[name_i], row[size_i], row[price_i], row[clr_i])
        groups.setdefault(key, []).append(row)

    # ── apply the rules to each group ────────────────────────────────
    for rows in groups.values():
        if len(rows) == 1:
            keep_rows.append(rows[0])
            continue

        # 2) identical PartNumbers? → keep the first row
        pn_set = {r[pn_i] for r in rows}
        if len(pn_set) == 1:
            keep_rows.append(rows[0])
            continue

        # 3) is the visibility flag the same everywhere?
        vis_set = {r[vis_i] for r in rows}
        if len(vis_set) == 1:                       # identical
            # 4) compare the 4-character codes
            good = []
            for r in rows:
                art4 = r[art_i][:4]
                pn4 = r[pn_i][1:5] if len(r[pn_i]) >= 5 else ""
                if art4 == pn4:
                    good.append(r)
            # keep only the matching rows; if there are none, keep the first
            keep_rows.extend(good or [rows[0]])
        else:                                       # 5) visibility differs
            show = [r for r in rows if r[vis_i] == "SHOW"]
            keep_rows.extend(show or rows)          # keep SHOW, otherwise keep all

    return keep_rows
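
# Worked example (hypothetical values): two rows sharing article "4532/101",
# name, size, price and colour fall into one group.  If their PartNumbers
# differ but visibility is equal, rule 4 keeps only the rows whose PartNumber
# characters 2–5 (e.g. "4532" in "44532101…") equal the first four characters
# of the article; if visibility differs, rule 5 keeps the "SHOW" rows.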


class Extractor:
    def __init__(self, json_data):

        self.methods = {
            "": (self.default_extract_method, []),

            "zarahome": (self.zarahome_extract_method, [
                "Краткое описание",
                "Артикул",
                "SKU",
                "PartNumber",
                "Название товара или услуги",
                "Полное описание",
                "Образец цвета",
                "Свойство: Цвет",
                "Свойство: Размер",
                "Цена закупки",
                "Свойство: Вес(г)",
                "Наличие на сайте",
                "Изображения",
                "Изображения варианта",
                "Параметр: Состав",
                "Параметр: Уход",
                "Параметр: Происхождение",
                "Размещение на сайте",
                "Свойство: Бренд"
            ]),

            "zara": (self.zara_extract_method, []),
            "eobuwie": (self.eobuwie_extract_method, []),
            "decathlon": (self.decathlon_extract_method, []),
            "chanel": (self.chanel_extract_method, []),
        }

        self.method = json_data["method"]
        self.tags = json_data["tags"]
        self.headers = self.methods[self.method][1].copy()

        for tag in self.tags:
            self.headers.insert(tag["column_number"], tag["column_name"])
# ────────────────────────── общие утилиты ─────────────────────
|
||
def extract(self, parser, recorder, categories):
|
||
self.methods[self.method][0](parser, recorder, categories)
|
||
|
||
def default_extract_method(self, *a, **kw):
|
||
log.info("Default extractor → nothing to do.")
|
||
|
||
def tags_extract(self, soup, row):
|
||
dom = etree.HTML(str(soup))
|
||
for tag in self.tags:
|
||
res = dom.xpath(tag["xpath"])
|
||
col = ""
|
||
if res:
|
||
for el in res:
|
||
col += ''.join(el.itertext()).strip() + "\n"
|
||
row.insert(tag["column_number"], col)
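
    # Assumed shape of one entry in self.tags (read from parse_settings.json;
    # the exact key set and values here are an assumption of this sketch):
    #   {"column_number": 3, "column_name": "Параметр: Материал",
    #    "xpath": "//div[@class='product-material']"}
    # tags_extract() evaluates the XPath against the page and inserts the
    # joined text into `row` at the same position the header was inserted at
    # in __init__.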

    # ─────────── stubs for shops that are not in use ──────────────
    def zara_extract_method(self, *_, **__): log.info("ZARA extractor disabled.")
    def eobuwie_extract_method(self, *_, **__): log.info("Eobuwie extractor disabled.")
    def decathlon_extract_method(self, *_, **__): log.info("Decathlon extractor disabled.")
    def chanel_extract_method(self, *_, **__): log.info("Chanel extractor disabled.")

    # ───────────────────── Z A R A H O M E ───────────────────────
    def zarahome_extract_method(self, parser, recorder, categories):

        BASE_API = "https://www.zarahome.com/itxrest/3/catalog/store/85009924/80290000"
        USER_BRAND = "ZARAHOME"

        def fetch_json(url):
            try:
                return parser.parse(url, return_type="json")
            except Exception as err:
                log.warning("Request Error: %s - %s", err, url)
                alt = url.replace(
                    "ieec2cihslb3-zarahome.central.inditex.grp",
                    "www.zarahome.com"
                )
                if alt != url:
                    log.info("→ retry via public host")
                    return parser.parse(alt, return_type="json")
            return None
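
        # Note on the assumed parser contract (not verified here): parser.parse()
        # is expected to return the decoded JSON body as a dict when called with
        # return_type="json", and to raise on HTTP/network errors, which is what
        # the retry via the public host above relies on.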

        for c_idx, category in enumerate(categories, 1):
            table = [self.headers]
            log.info("Categories: %s / %s %s", c_idx, len(categories), category)

            html = parser.parse(category)
            if html is None:
                log.warning("Extractor Error: empty page")
                continue
            soup = BeautifulSoup(html, "html.parser")

            script = soup.select_one("#serverApp-state")
            if not script:
                log.warning("Extractor Error: script not found for %s", category)
                continue
            state = loads(script.string)

            cat_key = next(k for k in state if "/category?" in k)
            cat_info = state[cat_key]
            ids = [str(p["id"]) for p in cat_info.get("products", [])]

            summaries = []

            # (A) productIds
            if ids:
                log.debug("→ pulling %s productIds via API", len(ids))
                CHUNK = 20
                for p in range(0, len(ids), CHUNK):
                    api = (f"{BASE_API}/productsArray?languageId=-1&"
                           f"productIds={','.join(ids[p:p+CHUNK])}&appId=1")
                    data = fetch_json(api)
                    if not data or "products" not in data:
                        log.debug("Skip chunk (no data)")
                        continue
                    summaries += data["products"]

            # (B) products already in state, or recursive descent into subcategories
            else:
                prod_key = next((k for k in state if "/product?" in k), None)
                if prod_key and "products" in state[prod_key]:
                    log.debug("→ products array found in state")
                    for grp in state[prod_key]["products"]:
                        summaries += grp.get("bundleProductSummaries", [])
                # ★ no products, but productIds are present → use the API
                elif prod_key and "productIds" in state[prod_key]:
                    ids = state[prod_key]["productIds"]
                    log.debug("→ pulling %s productIds via API (from prod_block)", len(ids))
                    CHUNK = 60
                    for p in range(0, len(ids), CHUNK):
                        api = (f"{BASE_API}/productsArray?languageId=-1&"
                               f"productIds={','.join(map(str, ids[p:p+CHUNK]))}&appId=1")
                        data = fetch_json(api)
                        if not data or "products" not in data:
                            log.debug("Skip chunk (no data)")
                            continue
                        summaries += data["products"]
                else:
                    subcats = cat_info.get("subcategories") or []
                    if not subcats:
                        log.info("→ no products in this category")
                        continue
                    log.info("→ diving into %s subcategories", len(subcats))
                    for sub in subcats:
                        sub_url = "https://www.zarahome.com/pl/en/" + sub["url"]
                        sub_html = parser.parse(sub_url)
                        if not sub_html:
                            continue
                        sub_state = loads(BeautifulSoup(sub_html, "html.parser")
                                          .select_one("#serverApp-state").string)
                        sub_prod_key = next((k for k in sub_state if "/product?" in k), None)
                        if sub_prod_key and "products" in sub_state[sub_prod_key]:
                            for grp in sub_state[sub_prod_key]["products"]:
                                summaries += grp.get("bundleProductSummaries", [])

            log.debug("JSON summaries count: %s", len(summaries))

            seen_ids = set()
            for n, prod in enumerate(summaries, 1):
                prod_id = prod.get("id")
                short_url = prod.get("productUrl")

                if not short_url and prod.get("seo"):
                    kw = prod["seo"].get("keyword", "")
                    sid = prod["seo"].get("seoProductId", "")
                    if kw and sid:
                        short_url = f"{kw}-p{sid}.html"
                        prod["productUrl"] = short_url

                if not short_url or prod_id in seen_ids:
                    continue
                seen_ids.add(prod_id)
                log.info("Products: %s / %s %s", n, len(summaries),
                         f"https://www.zarahome.com/pl/{short_url}")

                variants = prod.get("bundleProductSummaries") or [prod]

                for vprod in variants:
                    det = vprod["detail"]

                    sec, fam, sub = (vprod.get("sectionNameEN") or "",
                                     vprod.get("familyName") or "",
                                     vprod.get("subFamilyName") or "")
                    cat_path = "Каталог/ZaraHome/" + "/".join(p for p in (sec, fam, sub) if p)

                    url_full = f"https://www.zarahome.com/pl/en/{vprod.get('productUrl', '')}"
                    name = vprod.get("name", "")
                    article = det["displayReference"]
                    root_price = int(vprod.get("price", 0)) / 100
                    root_wt = vprod.get("weight", "")

                    raw_xmedia = det.get("xmedia") or vprod.get("xmedia") or []
                    default_idx = det.get("xmediaDefaultSet")
                    if isinstance(raw_xmedia, list) and raw_xmedia:
                        media_sets = [raw_xmedia[default_idx]] if isinstance(default_idx, int) else raw_xmedia
                    elif isinstance(raw_xmedia, dict):
                        media_sets = [raw_xmedia]
                    else:
                        media_sets = []
                    all_imgs = [f"https://static.zarahome.net/8/photos4{loc['path']}/{m['idMedia']}2.jpg"
                                for loc in media_sets
                                for m in loc["xmediaItems"][0]["medias"]]
                    all_imgs_s = "\n".join(all_imgs)

                    comp_txt = ""
                    if det.get("compositionDetail") and det["compositionDetail"].get("parts"):
                        comp_txt = "\n".join(
                            extract_components_zarahome(det["compositionDetail"]["parts"])
                        )
                    care = "\n".join(c["description"] for c in det.get("care", []))
                    trace = ""

                    colors = det.get("colors") or [{
                        "id": 0, "name": "DEFAULT", "image": {"url": ""},
                        "sizes": [{
                            "visibilityValue": "SHOW",
                            "name": "", "description": "",
                            "weight": root_wt, "price": vprod.get("price", 0)
                        }]
                    }]

                    #serial = 0
                    for clr in colors:
                        clr_code = clr.get("id")
                        clr_name = clr.get("name", "")
                        clr_image = ""
                        if clr.get("image") and clr["image"].get("url"):
                            clr_image = f"https://static.zarahome.net/8/photos4{clr['image']['url']}_3_1_5.jpg"

                        clr_sets = [loc for loc in media_sets if loc.get("colorCode") == clr_code] or media_sets
                        clr_imgs = [f"https://static.zarahome.net/8/photos4{loc['path']}/{m['idMedia']}2.jpg"
                                    for loc in clr_sets
                                    for m in loc["xmediaItems"][0]["medias"]]
                        clr_imgs_s = "\n".join(clr_imgs)

                        for size in clr["sizes"]:
                            vis = size.get("visibilityValue", "UNKNOWN")
                            price = int(size.get("price") or vprod.get("price", 0)) / 100
                            weight = size.get("weight") or root_wt
                            # ── country of manufacture (if present in size)
                            country = size.get("country") or ""
                            trace_local = f"Made in {country}" if country else trace

                            size_name = size.get("name", "")
                            size_descr = size.get("description", "")
                            size_full = f"{size_descr} ({size_name})" if size_descr else size_name
                            # ── SKU / PartNumber are taken from size ─────────
                            sku_val = size.get("sku", "")
                            partnumber_val = size.get("partnumber", "")
                            table.append([
                                url_full,
                                article,
                                sku_val,
                                partnumber_val,
                                name,
                                det.get("longDescription", ""),
                                clr_image,
                                clr_name,
                                size_full,
                                price,
                                weight,
                                vis,
                                all_imgs_s,
                                clr_imgs_s,
                                comp_txt,
                                care,
                                trace_local,
                                cat_path,
                                USER_BRAND
                            ])

            csv_name = category.split("/")[-1]
            clean_table = filter_duplicates(table, self.headers)
            recorder.record(csv_name, clean_table)


# ────────────────────────────────────────────────────────────────────
def get_extractor():
    with open(abspath("parse_settings.json"), "r", encoding="utf-8") as fh:
        return Extractor(load(fh))
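

# A minimal self-check sketch (hypothetical data, not part of the production
# flow): demonstrates filter_duplicates() on an in-memory table without the
# parser or recorder.  The column names are a subset of the zarahome header
# set defined above.
if __name__ == "__main__":
    demo_headers = [
        "Краткое описание", "Артикул", "SKU", "PartNumber",
        "Название товара или услуги", "Полное описание", "Образец цвета",
        "Свойство: Цвет", "Свойство: Размер", "Цена закупки",
        "Свойство: Вес(г)", "Наличие на сайте",
    ]

    def demo_row(article, partnumber, vis):
        # two rows sharing article/name/size/price/colour, differing PartNumber
        return ["", article, "", partnumber, "Vase", "", "", "White", "M", "9.99", "", vis]

    demo_table = [
        demo_headers,
        demo_row("4532/101", "44532101250-OSCURO", "SHOW"),
        demo_row("4532/101", "49999999250-OSCURO", "SHOW"),
    ]
    # Rule 4 keeps only the first data row: its PartNumber characters 2–5
    # ("4532") match the first four characters of the article.
    for row in filter_duplicates(demo_table, demo_headers):
        print(row)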