# extractor.py · v 2.0 · 2025-07-23 from json import load, loads from os.path import abspath from bs4 import BeautifulSoup from lxml import etree import logging, os, sys # включение / выключение фильтра дубликатов DEL_SAME = "YES" # "YES" → фильтр активен, "NO" → пишем всё как есть # ────────────────────── настройка логирования ───────────────────── _log_level = os.getenv("LOG_LEVEL", "INFO").upper() logging.basicConfig( level=_log_level, stream=sys.stdout, format="%(asctime)s │ %(levelname)-5s │ %(message)s", datefmt="%H:%M:%S" ) log = logging.getLogger("extractor") # ────────────────────── вспомогательные функции ─────────────────── def extract_components_zarahome(parts): comp = [] for part in parts: if part.get("areas") and part.get("description"): if len(parts) != 1: comp.append(part["description"]) for area in part["areas"]: comp.append(f"{area['description']} ({area['percentageArea']})") for c in area["components"]: comp.append(f"{c['percentage']} {c['material']}") elif part.get("components") and part.get("description"): if len(parts) != 1: comp.append(part["description"]) for c in part["components"]: comp.append(f"{c['percentage']} {c['material']}") return comp # ──────────────────────────────────────────────────────────────────── # ────────────────── фильтр «одинаковых» товаров ────────────────── def filter_duplicates(table, headers): """Убирает строки по правилам DEL_SAME. table[0] — заголовок.""" if DEL_SAME != "YES" or len(table) <= 2: return table # индексы нужных колонок idx = {h: i for i, h in enumerate(headers)} art_i = idx["Артикул"] name_i = idx["Название товара или услуги"] size_i = idx["Свойство: Размер"] price_i = idx["Цена закупки"] clr_i = idx["Свойство: Цвет"] pn_i = idx["PartNumber"] vis_i = idx["Наличие на сайте"] keep_rows = [table[0]] # сохраняем заголовок groups = {} # ── группируем по 5 базовым полям ─────────────────────────────── for row in table[1:]: key = (row[art_i], row[name_i], row[size_i], row[price_i], row[clr_i]) groups.setdefault(key, []).append(row) # ── применяем правила к каждой группе ─────────────────────────── for rows in groups.values(): if len(rows) == 1: keep_rows.append(rows[0]) continue # 2) одни и те же PartNumber? → оставляем первую pn_set = {r[pn_i] for r in rows} if len(pn_set) == 1: keep_rows.append(rows[0]) continue # 3) vis одинаковый? vis_set = {r[vis_i] for r in rows} if len(vis_set) == 1: # одинаковые # 4) сравниваем 4-символьные коды good = [] for r in rows: art4 = r[art_i][:4] pn4 = r[pn_i][1:5] if len(r[pn_i]) >= 5 else "" if art4 == pn4: good.append(r) # оставляем только подходящие; если ни одного — первую keep_rows.extend(good or [rows[0]]) else: # 5) vis разные show = [r for r in rows if r[vis_i] == "SHOW"] keep_rows.extend(show or rows) # остаётся SHOW, иначе всё return keep_rows class Extractor: def __init__(self, json_data): self.methods = { "": (self.default_extract_method, []), "zarahome": (self.zarahome_extract_method, [ "Краткое описание", "Артикул", "SKU", "PartNumber", "Название товара или услуги", "Полное описание", "Образец цвета", "Свойство: Цвет", "Свойство: Размер", "Цена закупки", "Свойство: Вес(г)", "Наличие на сайте", "Изображения", "Изображения варианта", "Параметр: Состав", "Параметр: Уход", "Параметр: Происхождение", "Размещение на сайте", "Свойство: Бренд" ]), "zara": (self.zara_extract_method, []), "eobuwie": (self.eobuwie_extract_method, []), "decathlon": (self.decathlon_extract_method, []), "chanel": (self.chanel_extract_method, []), } self.method = json_data["method"] self.tags = json_data["tags"] self.headers = self.methods[self.method][1].copy() for tag in self.tags: self.headers.insert(tag["column_number"], tag["column_name"]) # ────────────────────────── общие утилиты ───────────────────── def extract(self, parser, recorder, categories): self.methods[self.method][0](parser, recorder, categories) def default_extract_method(self, *a, **kw): log.info("Default extractor → nothing to do.") def tags_extract(self, soup, row): dom = etree.HTML(str(soup)) for tag in self.tags: res = dom.xpath(tag["xpath"]) col = "" if res: for el in res: col += ''.join(el.itertext()).strip() + "\n" row.insert(tag["column_number"], col) # ─────────── заглушки для неиспользуемых магазинов ──────────── def zara_extract_method(self, *_, **__): log.info("ZARA extractor disabled.") def eobuwie_extract_method(self, *_, **__): log.info("Eobuwie extractor disabled.") def decathlon_extract_method(self, *_, **__): log.info("Decathlon extractor disabled.") def chanel_extract_method(self, *_, **__): log.info("Chanel extractor disabled.") # ───────────────────── Z A R A H O M E ─────────────────────── def zarahome_extract_method(self, parser, recorder, categories): BASE_API = "https://www.zarahome.com/itxrest/3/catalog/store/85009924/80290000" USER_BRAND = "ZARAHOME" def fetch_json(url): try: return parser.parse(url, return_type="json") except Exception as err: log.warning("Request Error: %s - %s", err, url) alt = url.replace( "ieec2cihslb3-zarahome.central.inditex.grp", "www.zarahome.com" ) if alt != url: log.info("→ retry via public host") return parser.parse(alt, return_type="json") return None for c_idx, category in enumerate(categories, 1): table = [self.headers] log.info("Categories: %s / %s %s", c_idx, len(categories), category) html = parser.parse(category) if html is None: log.warning("Extractor Error: empty page"); continue soup = BeautifulSoup(html, "html.parser") script = soup.select_one("#serverApp-state") if not script: log.warning("Extractor Error: script not found for %s", category) continue state = loads(script.string) cat_key = next(k for k in state if "/category?" in k) cat_info = state[cat_key] ids = [str(p["id"]) for p in cat_info.get("products", [])] summaries = [] # (A) productIds if ids: log.debug("→ pulling %s productIds via API", len(ids)) CHUNK = 20 for p in range(0, len(ids), CHUNK): api = (f"{BASE_API}/productsArray?languageId=-1&" f"productIds={','.join(ids[p:p+CHUNK])}&appId=1") data = fetch_json(api) if not data or "products" not in data: log.debug("Skip chunk (no data)") continue summaries += data["products"] # (B) products в state или рекурсивный обход else: prod_key = next((k for k in state if "/product?" in k), None) if prod_key and "products" in state[prod_key]: log.debug("→ products array found in state") for grp in state[prod_key]["products"]: summaries += grp.get("bundleProductSummaries", []) # ★ если products нет, но есть productIds → пользуемся API elif prod_key and "productIds" in state[prod_key]: ids = state[prod_key]["productIds"] log.debug("→ pulling %s productIds via API (from prod_block)", len(ids)) CHUNK = 60 for p in range(0, len(ids), CHUNK): api = (f"{BASE_API}/productsArray?languageId=-1&" f"productIds={','.join(map(str, ids[p:p+CHUNK]))}&appId=1") data = fetch_json(api) if not data or "products" not in data: log.debug("Skip chunk (no data)") continue summaries += data["products"] else: subcats = cat_info.get("subcategories") or [] if not subcats: log.info("→ no products in this category") continue log.info("→ diving into %s subcategories", len(subcats)) for sub in subcats: sub_url = "https://www.zarahome.com/pl/en/" + sub["url"] sub_html = parser.parse(sub_url) if not sub_html: continue sub_state = loads(BeautifulSoup(sub_html, "html.parser") .select_one("#serverApp-state").string) sub_prod_key = next((k for k in sub_state if "/product?" in k), None) if sub_prod_key and "products" in sub_state[sub_prod_key]: for grp in sub_state[sub_prod_key]["products"]: summaries += grp.get("bundleProductSummaries", []) log.debug("JSON summaries count: %s", len(summaries)) seen_ids = set() for n, prod in enumerate(summaries, 1): prod_id = prod.get("id") short_url = prod.get("productUrl") if not short_url and prod.get("seo"): kw = prod["seo"].get("keyword", "") sid = prod["seo"].get("seoProductId", "") if kw and sid: short_url = f"{kw}-p{sid}.html" prod["productUrl"] = short_url if not short_url or prod_id in seen_ids: continue seen_ids.add(prod_id) log.info("Products: %s / %s %s", n, len(summaries), f"https://www.zarahome.com/pl/{short_url}") variants = prod.get("bundleProductSummaries") or [prod] for vprod in variants: det = vprod["detail"] sec, fam, sub = (vprod.get("sectionNameEN") or "", vprod.get("familyName") or "", vprod.get("subFamilyName") or "") cat_path = "Каталог/ZaraHome/" + "/".join(p for p in (sec, fam, sub) if p) url_full = f"https://www.zarahome.com/pl/en/{vprod.get('productUrl','')}" name = vprod.get("name", "") article = det["displayReference"] root_price = int(vprod.get("price", 0)) / 100 root_wt = vprod.get("weight", "") raw_xmedia = det.get("xmedia") or vprod.get("xmedia") or [] default_idx = det.get("xmediaDefaultSet") if isinstance(raw_xmedia, list) and raw_xmedia: media_sets = [raw_xmedia[default_idx]] if isinstance(default_idx, int) else raw_xmedia elif isinstance(raw_xmedia, dict): media_sets = [raw_xmedia] else: media_sets = [] all_imgs = [f"https://static.zarahome.net/8/photos4{loc['path']}/{m['idMedia']}2.jpg" for loc in media_sets for m in loc["xmediaItems"][0]["medias"]] all_imgs_s = "\n".join(all_imgs) comp_txt = "" if det.get("compositionDetail") and det["compositionDetail"].get("parts"): comp_txt = "\n".join( extract_components_zarahome(det["compositionDetail"]["parts"]) ) care = "\n".join(c["description"] for c in det.get("care", [])) trace = "" colors = det.get("colors") or [{ "id": 0, "name": "DEFAULT", "image": {"url": ""}, "sizes": [{ "visibilityValue": "SHOW", "name": "", "description": "", "weight": root_wt, "price": vprod.get("price", 0) }] }] #serial = 0 for clr in colors: clr_code = clr.get("id") clr_name = clr.get("name", "") clr_image = "" if clr.get("image") and clr["image"].get("url"): clr_image = f"https://static.zarahome.net/8/photos4{clr['image']['url']}_3_1_5.jpg" clr_sets = [loc for loc in media_sets if loc.get("colorCode") == clr_code] or media_sets clr_imgs = [f"https://static.zarahome.net/8/photos4{loc['path']}/{m['idMedia']}2.jpg" for loc in clr_sets for m in loc["xmediaItems"][0]["medias"]] clr_imgs_s = "\n".join(clr_imgs) for size in clr["sizes"]: vis = size.get("visibilityValue", "UNKNOWN") price = int(size.get("price") or vprod.get("price", 0)) / 100 weight = size.get("weight") or root_wt # ── страна изготовления (если есть в size) country = size.get("country") or "" trace_local = f"Made in {country}" if country else trace size_name = size.get("name", "") size_descr = size.get("description", "") size_full = f"{size_descr} ({size_name})" if size_descr else size_name # ── SKU / PartNumber берём из size ─────────────── sku_val = size.get("sku", "") partnumber_val = size.get("partnumber", "") table.append([ url_full, article, sku_val, partnumber_val, name, det.get("longDescription", ""), clr_image, clr_name, size_full, price, weight, vis, all_imgs_s, clr_imgs_s, comp_txt, care, trace_local, cat_path, USER_BRAND ]) csv_name = category.split("/")[-1] clean_table = filter_duplicates(table, self.headers) recorder.record(csv_name, clean_table) #csv_name = category.split("/")[-1] #recorder.record(csv_name, table) # ──────────────────────────────────────────────────────────────────── def get_extractor(): with open(abspath("parse_settings.json"), "r", encoding="utf-8") as fh: return Extractor(load(fh))