# extractor.py · v 2.0 · 2025-07-23
from json import load, loads
from os.path import abspath
from bs4 import BeautifulSoup
from lxml import etree
import logging, os, sys
# enable / disable the duplicate filter
DEL_SAME = "YES" # "YES" → filter is active, "NO" → write everything as is
# ────────────────────── logging setup ─────────────────────
_log_level = os.getenv("LOG_LEVEL", "INFO").upper()
logging.basicConfig(
level=_log_level,
stream=sys.stdout,
format="%(asctime)s%(levelname)-5s%(message)s",
datefmt="%H:%M:%S"
)
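# Example (illustrative): verbosity can be raised via the environment when launching
# whatever script imports this module, e.g.
#   LOG_LEVEL=DEBUG python run_parser.py
# (run_parser.py is a hypothetical entry point named here only for illustration).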
log = logging.getLogger("extractor")
# ────────────────────── helper functions ───────────────────
def extract_components_zarahome(parts):
comp = []
for part in parts:
if part.get("areas") and part.get("description"):
if len(parts) != 1:
comp.append(part["description"])
for area in part["areas"]:
comp.append(f"{area['description']} ({area['percentageArea']})")
for c in area["components"]:
comp.append(f"{c['percentage']} {c['material']}")
elif part.get("components") and part.get("description"):
if len(parts) != 1:
comp.append(part["description"])
for c in part["components"]:
comp.append(f"{c['percentage']} {c['material']}")
return comp
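# Example (illustrative values only; the key names match the ones read above, the
# figures are invented): for
#   parts = [{"description": "Cover",
#             "areas": [{"description": "Front", "percentageArea": "80%",
#                        "components": [{"percentage": "100%", "material": "cotton"}]}]},
#            {"description": "Filling",
#             "components": [{"percentage": "100%", "material": "polyester"}]}]
# the function returns
#   ["Cover", "Front (80%)", "100% cotton", "Filling", "100% polyester"]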
# ────────────────────────────────────────────────────────────────────
# ────────────────── filter for "identical" products ──────────────────
def filter_duplicates(table, headers):
"""Убирает строки по правилам DEL_SAME. table[0] — заголовок."""
if DEL_SAME != "YES" or len(table) <= 2:
return table
# indices of the required columns
idx = {h: i for i, h in enumerate(headers)}
art_i = idx["Артикул"]
name_i = idx["Название товара или услуги"]
size_i = idx["Свойство: Размер"]
price_i = idx["Цена закупки"]
clr_i = idx["Свойство: Цвет"]
pn_i = idx["PartNumber"]
vis_i = idx["Наличие на сайте"]
keep_rows = [table[0]] # keep the header
groups = {}
# ── group rows by the 5 base fields ───────────────────────────────
for row in table[1:]:
key = (row[art_i], row[name_i], row[size_i], row[price_i], row[clr_i])
groups.setdefault(key, []).append(row)
# ── apply the rules to each group ───────────────────────────
for rows in groups.values():
if len(rows) == 1: # 1) the group has a single row → keep it
keep_rows.append(rows[0])
continue
# 2) all rows share the same PartNumber? → keep the first
pn_set = {r[pn_i] for r in rows}
if len(pn_set) == 1:
keep_rows.append(rows[0])
continue
# 3) is vis the same for all rows?
vis_set = {r[vis_i] for r in rows}
if len(vis_set) == 1: # all identical
# 4) compare the 4-character codes
good = []
for r in rows:
art4 = r[art_i][:4]
pn4 = r[pn_i][1:5] if len(r[pn_i]) >= 5 else ""
if art4 == pn4:
good.append(r)
# keep only the matching rows; if none match, keep the first
keep_rows.extend(good or [rows[0]])
else: # 5) vis values differ
show = [r for r in rows if r[vis_i] == "SHOW"]
keep_rows.extend(show or rows) # keep SHOW rows, otherwise everything
return keep_rows
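# Example (illustrative values): two rows that share article "0123", name, size,
# price and colour, have different PartNumbers "10123456789" / "19999456789" and
# the same "Наличие на сайте" value, fall under rule 4: only the first row survives,
# because "0123" == "10123456789"[1:5] while "9999" does not match the article.
# A runnable sketch is given at the bottom of the file.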
class Extractor:
def __init__(self, json_data):
self.methods = {
"": (self.default_extract_method, []),
"zarahome": (self.zarahome_extract_method, [
"Краткое описание",
"Артикул",
"SKU",
"PartNumber",
"Название товара или услуги",
"Полное описание",
"Образец цвета",
"Свойство: Цвет",
"Свойство: Размер",
"Цена закупки",
"Свойство: Вес(г)",
"Наличие на сайте",
"Изображения",
"Изображения варианта",
"Параметр: Состав",
"Параметр: Уход",
"Параметр: Происхождение",
"Размещение на сайте",
"Свойство: Бренд"
]),
"zara": (self.zara_extract_method, []),
"eobuwie": (self.eobuwie_extract_method, []),
"decathlon": (self.decathlon_extract_method, []),
"chanel": (self.chanel_extract_method, []),
}
self.method = json_data["method"]
self.tags = json_data["tags"]
self.headers = self.methods[self.method][1].copy()
for tag in self.tags:
self.headers.insert(tag["column_number"], tag["column_name"])
# ────────────────────────── common utilities ─────────────────────
def extract(self, parser, recorder, categories):
self.methods[self.method][0](parser, recorder, categories)
def default_extract_method(self, *a, **kw):
log.info("Default extractor → nothing to do.")
def tags_extract(self, soup, row):
dom = etree.HTML(str(soup))
for tag in self.tags:
res = dom.xpath(tag["xpath"])
col = ""
if res:
for el in res:
col += ''.join(el.itertext()).strip() + "\n"
row.insert(tag["column_number"], col)
# ─────────── stubs for unused stores ────────────
def zara_extract_method(self, *_, **__): log.info("ZARA extractor disabled.")
def eobuwie_extract_method(self, *_, **__): log.info("Eobuwie extractor disabled.")
def decathlon_extract_method(self, *_, **__): log.info("Decathlon extractor disabled.")
def chanel_extract_method(self, *_, **__): log.info("Chanel extractor disabled.")
# ───────────────────── Z A R A H O M E ───────────────────────
def zarahome_extract_method(self, parser, recorder, categories):
BASE_API = "https://www.zarahome.com/itxrest/3/catalog/store/85009924/80290000"
USER_BRAND = "ZARAHOME"
def fetch_json(url):
try:
return parser.parse(url, return_type="json")
except Exception as err:
log.warning("Request Error: %s - %s", err, url)
alt = url.replace(
"ieec2cihslb3-zarahome.central.inditex.grp",
"www.zarahome.com"
)
if alt != url:
log.info("→ retry via public host")
return parser.parse(alt, return_type="json")
return None
for c_idx, category in enumerate(categories, 1):
table = [self.headers]
log.info("Categories: %s / %s %s", c_idx, len(categories), category)
html = parser.parse(category)
if html is None:
log.warning("Extractor Error: empty page"); continue
soup = BeautifulSoup(html, "html.parser")
script = soup.select_one("#serverApp-state")
if not script:
log.warning("Extractor Error: script not found for %s", category)
continue
state = loads(script.string)
cat_key = next((k for k in state if "/category?" in k), "") # avoid StopIteration if the state has no category block
cat_info = state.get(cat_key, {})
ids = [str(p["id"]) for p in cat_info.get("products", [])]
summaries = []
# (A) productIds
if ids:
log.debug("→ pulling %s productIds via API", len(ids))
CHUNK = 20
for p in range(0, len(ids), CHUNK):
api = (f"{BASE_API}/productsArray?languageId=-1&"
f"productIds={','.join(ids[p:p+CHUNK])}&appId=1")
data = fetch_json(api)
if not data or "products" not in data:
log.debug("Skip chunk (no data)")
continue
summaries += data["products"]
# (B) products in state, or a recursive walk over subcategories
else:
prod_key = next((k for k in state if "/product?" in k), None)
if prod_key and "products" in state[prod_key]:
log.debug("→ products array found in state")
for grp in state[prod_key]["products"]:
summaries += grp.get("bundleProductSummaries", [])
# ★ no products, but productIds present → use the API
elif prod_key and "productIds" in state[prod_key]:
ids = state[prod_key]["productIds"]
log.debug("→ pulling %s productIds via API (from prod_block)", len(ids))
CHUNK = 60
for p in range(0, len(ids), CHUNK):
api = (f"{BASE_API}/productsArray?languageId=-1&"
f"productIds={','.join(map(str, ids[p:p+CHUNK]))}&appId=1")
data = fetch_json(api)
if not data or "products" not in data:
log.debug("Skip chunk (no data)")
continue
summaries += data["products"]
else:
subcats = cat_info.get("subcategories") or []
if not subcats:
log.info("→ no products in this category")
continue
log.info("→ diving into %s subcategories", len(subcats))
for sub in subcats:
sub_url = "https://www.zarahome.com/pl/en/" + sub["url"]
sub_html = parser.parse(sub_url)
if not sub_html:
continue
sub_state = loads(BeautifulSoup(sub_html, "html.parser")
.select_one("#serverApp-state").string)
sub_prod_key = next((k for k in sub_state if "/product?" in k), None)
if sub_prod_key and "products" in sub_state[sub_prod_key]:
for grp in sub_state[sub_prod_key]["products"]:
summaries += grp.get("bundleProductSummaries", [])
log.debug("JSON summaries count: %s", len(summaries))
seen_ids = set()
for n, prod in enumerate(summaries, 1):
prod_id = prod.get("id")
short_url = prod.get("productUrl")
if not short_url and prod.get("seo"):
kw = prod["seo"].get("keyword", "")
sid = prod["seo"].get("seoProductId", "")
if kw and sid:
short_url = f"{kw}-p{sid}.html"
prod["productUrl"] = short_url
if not short_url or prod_id in seen_ids:
continue
seen_ids.add(prod_id)
log.info("Products: %s / %s %s", n, len(summaries),
f"https://www.zarahome.com/pl/{short_url}")
variants = prod.get("bundleProductSummaries") or [prod]
for vprod in variants:
det = vprod["detail"]
sec, fam, sub = (vprod.get("sectionNameEN") or "",
vprod.get("familyName") or "",
vprod.get("subFamilyName") or "")
cat_path = "Каталог/ZaraHome/" + "/".join(p for p in (sec, fam, sub) if p)
url_full = f"https://www.zarahome.com/pl/en/{vprod.get('productUrl','')}"
name = vprod.get("name", "")
article = det["displayReference"]
root_price = int(vprod.get("price", 0)) / 100
root_wt = vprod.get("weight", "")
raw_xmedia = det.get("xmedia") or vprod.get("xmedia") or []
default_idx = det.get("xmediaDefaultSet")
if isinstance(raw_xmedia, list) and raw_xmedia:
media_sets = [raw_xmedia[default_idx]] if isinstance(default_idx, int) else raw_xmedia
elif isinstance(raw_xmedia, dict):
media_sets = [raw_xmedia]
else:
media_sets = []
all_imgs = [f"https://static.zarahome.net/8/photos4{loc['path']}/{m['idMedia']}2.jpg"
for loc in media_sets
for m in loc["xmediaItems"][0]["medias"]]
all_imgs_s = "\n".join(all_imgs)
comp_txt = ""
if det.get("compositionDetail") and det["compositionDetail"].get("parts"):
comp_txt = "\n".join(
extract_components_zarahome(det["compositionDetail"]["parts"])
)
care = "\n".join(c["description"] for c in det.get("care", []))
trace = ""
colors = det.get("colors") or [{
"id": 0, "name": "DEFAULT", "image": {"url": ""},
"sizes": [{
"visibilityValue": "SHOW",
"name": "", "description": "",
"weight": root_wt, "price": vprod.get("price", 0)
}]
}]
for clr in colors:
clr_code = clr.get("id")
clr_name = clr.get("name", "")
clr_image = ""
if clr.get("image") and clr["image"].get("url"):
clr_image = f"https://static.zarahome.net/8/photos4{clr['image']['url']}_3_1_5.jpg"
clr_sets = [loc for loc in media_sets if loc.get("colorCode") == clr_code] or media_sets
clr_imgs = [f"https://static.zarahome.net/8/photos4{loc['path']}/{m['idMedia']}2.jpg"
for loc in clr_sets
for m in loc["xmediaItems"][0]["medias"]]
clr_imgs_s = "\n".join(clr_imgs)
for size in clr["sizes"]:
vis = size.get("visibilityValue", "UNKNOWN")
price = int(size.get("price") or vprod.get("price", 0)) / 100
weight = size.get("weight") or root_wt
# ── country of manufacture (if present in size)
country = size.get("country") or ""
trace_local = f"Made in {country}" if country else trace
size_name = size.get("name", "")
size_descr = size.get("description", "")
size_full = f"{size_descr} ({size_name})" if size_descr else size_name
# ── SKU / PartNumber are taken from size ───────────────
sku_val = size.get("sku", "")
partnumber_val = size.get("partnumber", "")
table.append([
url_full,
article,
sku_val,
partnumber_val,
name,
det.get("longDescription", ""),
clr_image,
clr_name,
size_full,
price,
weight,
vis,
all_imgs_s,
clr_imgs_s,
comp_txt,
care,
trace_local,
cat_path,
USER_BRAND
])
csv_name = category.split("/")[-1]
clean_table = filter_duplicates(table, self.headers)
recorder.record(csv_name, clean_table)
# ────────────────────────────────────────────────────────────────────
def get_extractor():
with open(abspath("parse_settings.json"), "r", encoding="utf-8") as fh:
return Extractor(load(fh))
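# ──────────────────────── illustrative examples ────────────────────────
# The shape of parse_settings.json, as far as it can be inferred from __init__ above
# (the values below are invented; "column_number" is the insert position of the extra
# column and "xpath" is evaluated in tags_extract):
#   {
#       "method": "zarahome",
#       "tags": [
#           {"column_number": 5, "column_name": "Extra", "xpath": "//h1"}
#       ]
#   }
# A minimal, self-contained sketch of the duplicate filter; the sample rows are
# invented and contain only the columns filter_duplicates actually looks up:
if __name__ == "__main__":
    demo_headers = ["Артикул", "Название товара или услуги", "Свойство: Размер",
                    "Цена закупки", "Свойство: Цвет", "PartNumber", "Наличие на сайте"]
    demo_table = [
        demo_headers,
        # same key and the same PartNumber → rule 2 keeps only the first row
        ["1234", "Plaid", "One size", 99.0, "Beige", "11234567890", "SHOW"],
        ["1234", "Plaid", "One size", 99.0, "Beige", "11234567890", "HIDE"],
        # unique key → always kept
        ["5678", "Vase", "M", 49.0, "White", "15678000000", "SHOW"],
    ]
    for kept in filter_duplicates(demo_table, demo_headers):
        log.info("kept: %s", kept)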