from json import load, loads
from os.path import abspath
from pathlib import Path, PurePath
from re import split, search, sub
import json
import textwrap
import time

from bs4 import BeautifulSoup
from lxml import etree


def extract_components_zarahome(parts):
    """Flatten a Zara Home composition block into readable lines.

    For example, a single part such as
    {"description": "OUTER SHELL",
     "areas": [{"description": "MAIN FABRIC", "percentageArea": "100%",
                "components": [{"material": "cotton", "percentage": "80%"}]}]}
    yields ["MAIN FABRIC (100%)", "80% cotton"]; the part description itself
    is only prepended when more than one part is present.
    """
    composition = []
    for part in parts:
        if part.get("areas") and part.get("description"):
            if len(parts) != 1:
                composition.append(part["description"])
            for area in part["areas"]:
                area_name = area["description"]
                percentage_area = area["percentageArea"]
                composition.append(f"{area_name} ({percentage_area})")
                for component in area["components"]:
                    material = component["material"]
                    percentage = component["percentage"]
                    composition.append(f"{percentage} {material}")
        elif part.get("components") and part.get("description"):
            if len(parts) != 1:
                composition.append(part["description"])
            for component in part["components"]:
                material = component["material"]
                percentage = component["percentage"]
                composition.append(f"{percentage} {material}")
    return composition


# class that extracts the required data
class Extractor:
    def __init__(self, json_data):
        self.methods = {
            "": (self.default_extract_method, []),
            "zarahome": (self.zarahome_extract_method, [
                "Краткое описание", "Артикул", "Название товара или услуги",
                "Полное описание", "Образец цвета", "Свойство: Цвет",
                "Свойство: Размер", "Цена закупки", "Свойство: Вес(г)",
                "Изображения", "Изображения варианта", "Параметр: Состав",
                "Параметр: Уход", "Параметр: Происхождение",
                "Размещение на сайте", "Свойство: Бренд"
            ]),
            "eobuwie": (self.eobuwie_extract_method, [
                "Краткое описание", "Артикул", "Свойство: Размер",
                "Полное описание(Таблица)", "Название товара или услуги",
                "Изображения", "Размещение на сайте", "Цена", "Наличие"
            ]),
            "decathlon": (self.decathlon_extract_method, [
                "Краткое описание", "Артикул", "Название товара или услуги",
                "Полное описание", "Наличие", "Свойство: Цвет",
                "Свойство: Размер", "Цена закупки", "Параметр: Вес(г)",
                "Изображения варианта", "Размещение на сайте"
            ]),
            "zara": (self.zara_extract_method, [
                "Краткое описание", "Артикул", "Название товара или услуги",
                "Наличие", "Образец цвета", "Свойство: Цвет",
                "Свойство: Размер", "Цена закупки", "Изображения",
                "Параметр: Состав", "Параметр: Уход",
                "Параметр: Происхождение", "Размещение на сайте",
                "Свойство: Бренд"
            ]),
            "chanel": (self.chanel_extract_method, [
                "Краткое описание", "Артикул", "Наличие", "Свойство: Цвет",
                "Свойство: Размер", "Цена закупки", "Изображения",
                "Размещение на сайте", "Свойство: Бренд"
            ])
        }
        self.method = json_data["method"]
        self.tags = json_data["tags"]
        # copy the header template so custom tag columns do not mutate it
        self.headers = list(self.methods[self.method][1])
        for tag in self.tags:
            self.headers.insert(tag["column_number"], tag["column_name"])

    def extract(self, parser, recorder, categories):
        self.methods[self.method][0](parser, recorder, categories)

    def default_extract_method(self, parser, recorder, categories):
        pass

    def tags_extract(self, soup, row):
        dom_tree = etree.HTML(str(soup))
        for tag in self.tags:
            xpath_result = dom_tree.xpath(tag["xpath"])
            column_data = ""
            if len(xpath_result):
                for element in xpath_result:
                    # accumulate the text of every matched node
                    column_data += ''.join(element.itertext()).strip() + "\n"
            row.insert(tag["column_number"], column_data)

    def chanel_extract_method(self, parser, recorder, categories):
        BASE_URL = "https://www.chanel.com"
        for i, category in enumerate(categories):
            table = [self.headers]
            print(f"Categories: {i + 1} / {len(categories)}", category)
            continue_loop = True
            category_page = 1
            request_elements_count = 24
            product_number = 1
            category_pattern = r"\/pl\/[\w\d]+\/"
            location = "chanel/" + search(category_pattern, category)[0].replace("pl", "").replace("/", "")
            while continue_loop:
                category_data = parser.parse(
                    f"{category}?requestType=ajax&page={category_page}&totalElementsCount={request_elements_count}",
                    return_type="json"
                )
                if not category_data["next"]:
                    continue_loop = False
                products_count = category_data["totalProducts"]
                for product in category_data["dataLayer"]["productList"].values():
                    first_variant = True
                    article_pattern = r"\/p\/[\d\w]+/"
                    base_link = BASE_URL + product["quickviewPopin"]["page"]
                    print(f"Products: {product_number} / {products_count}", base_link)
                    product_number += 1
                    links = [base_link]
                    while len(links):
                        product_url = links.pop(0)
                        product_page = parser.parse(product_url)
                        if product_page is None:
                            continue
                        soup = BeautifulSoup(product_page, "html.parser")
                        if first_variant:
                            first_variant = False
                            variants_links = soup.select(".link.js-tabpanel-anchor")
                            replace_pattern = r"\/p\/.+$"
                            for variant_link in variants_links:
                                article = variant_link.get("data-value")
                                if article not in product_url:
                                    links.append(sub(replace_pattern, f"/p/{article}", product_url))
                        product_url = soup.select("[property=\"og:url\"]")[0].get("content")
                        article = search(article_pattern, product_url)[0].replace("/", "").replace("p", "")
                        product_info = parser.parse(
                            f"{BASE_URL}/pl/yapi/product/{article}?options=basic,vto,variants,stock&site=chanel",
                            return_type="json"
                        )
                        stock = 0
                        if product_info["stock"]["stockLevel"] == "IN_STOCK":
                            stock = 1
                        product_color_name = product_info["color"]["name"]
                        product_size = product_info.get("size")
                        product_price = product_info["buyNow"].get("priceValue")
                        images = "\n".join(map(lambda x: x["url"], product_info["basic"]["images"]))
                        product_brand = "chanel"
                        try:
                            table_data = []
                            table_data.append([
                                product_url, article, stock, product_color_name,
                                product_size, product_price, images, location,
                                product_brand
                            ])
                            self.tags_extract(soup, table_data[-1])
                            table += table_data.copy()
                        except Exception as error:
                            print(f"Extractor Error: {error}")
                # request the next page on the following iteration
                category_page += 1
            csv_name = category.replace(f"{BASE_URL}/pl/", "").replace("/", "_")
            recorder.record(csv_name, table)
product.get("name") if not product_name: continue product_number += 1 seo_keyword = product["seo"]["keyword"] seo_id = product["seo"]["seoProductId"] if not seo_keyword: continue product_url = f"{BASE_POLISH_URL}{seo_keyword}-p{seo_id}.html" print(f"Products: {product_number} / {all_products_count}", product_url) article = product["detail"]["displayReference"] product_color_hex = product["colorInfo"].get("mainColorHexCode") product_color_name = product["detail"]["colors"][0]["name"] product_price = product["price"] / 100 product_brand = product["brand"].get("brandGroupCode") product_page = parser.parse(f"{product_url}?{bm_verify}") if product_page == None: continue soup = BeautifulSoup(product_page, "html.parser") sizes = soup.select("[data-qa-action][role=\"option\"]") images = "\n".join(map(lambda x: x.get("srcset").split(", ")[-1].split(" ")[0], soup.select(f"source[sizes=\"32vw\"]"))) product_id = product["id"] extra_data = parser.parse(f"https://www.zara.com/pl/pl/product/{product_id}/extra-detail?ajax=true", return_type="json") extra_data_extracted = {} for section in extra_data: extra_data_extracted[section["sectionType"]] = "" for component in section["components"]: if component["datatype"] in ["subtitle", "paragraph"]: extra_data_extracted[section["sectionType"]] += component["text"]["value"] + "\n" elif component["datatype"] == "spacer": extra_data_extracted[section["sectionType"]] += "\n" elif component["datatype"] == "iconList": for item in component["items"]: if item["datatype"] == "iconListItem" and item["description"]["datatype"] == "text": extra_data_extracted[section["sectionType"]] += item["description"]["value"] + "\n" materials = extra_data_extracted.get("materials") care = extra_data_extracted.get("care") origin = extra_data_extracted.get("origin") for size in sizes: try: table_data = [] if size.get("data-qa-action") == "size-in-stock": stock = 1 else: stock = 0 product_size = size.select(".product-size-info__main-label")[0].text table_data.append([ product_url, f"{article} - {product_size}", product_name, stock, product_color_hex, product_color_name, product_size, product_price, images, materials, care, origin, location, product_brand ]) self.tags_extract(soup, table_data[-1]) table += table_data.copy() except Exception as error: print(f"Extractor Error: {error}") csv_name = category.split("/")[-1].split("?")[0] recorder.record(csv_name, table) def decathlon_extract_method(self, parser, recorder, categories): BASE_URL = "https://www.decathlon.pl" for i, category in enumerate(categories): table = [self.headers] print(f"Categories: {i + 1} / {len(categories)}", category) continue_loop = True category_from = 0 while continue_loop: category_page = parser.parse(f"{category}?from={category_from}") category_soup = BeautifulSoup(category_page, "html.parser") offers_count = int(category_soup.select("h1 ~ span.count")[0].text.split(" ")[0]) products_links = category_soup.select("[class$=\"model-link\"]") products_links_count = len(products_links) for e, product_link in enumerate(products_links): product_url = BASE_URL + product_link.get("href") print(f"Products: {e + 1 + category_from} / {offers_count}", product_url) product_page = parser.parse(product_url) if product_page == None: continue soup = BeautifulSoup(product_page, "html.parser") meta_script_tags = soup.select("[type=\"application/ld+json\"]") if len(meta_script_tags) <= 1: continue meta_data = loads(meta_script_tags[1].text) path_steps = [] for step in meta_data["itemListElement"]: 
path_steps.append(step["item"]["name"]) product_path = "decathlon/" + "/".join(path_steps) script_json = soup.select("#__dkt")[0] __dkt = loads(script_json.text.replace("__DKT = ", "")) if __dkt["_ctx"]["page"]["id"] != "product": continue models_data = __dkt["_ctx"]["data"][4]["data"]["models"] for model in models_data: color = "" colors = [] if model.get("colors"): for color_info in model["colors"]: colors.append(color_info["label"]) color = " / ".join(colors) images = [] for image_info in model["images"]["product"]: images.append(image_info["url"].replace("/250x250", "")) image_lines = "\n".join(images) product_name = model["webLabel"] product_description = soup.select("[id^=\"ProductFunctionalities\"]") if len(product_description): product_description = product_description[0].encode_contents() else: product_description = "" skus_data = model["skus"] sku_ids = [] for sku in skus_data: sku_ids.append(sku["skuId"]) sku_ids = ",".join(sku_ids) stocks = parser.parse(f"https://www.decathlon.pl/pl/ajax/nfs/stocks/online?skuIds={sku_ids}", return_type="json") for sku in skus_data: try: sku_id = sku["skuId"] stock = stocks[sku_id]["stockOnline"] if stocks.get(sku_id) else "unknown" table_data = [] article = f'{model["modelId"]}-{sku_id}' size = "" if sku.get("size"): size = sku["size"] price = "" if sku.get("price"): price = sku["price"] weight = "" if sku.get("grossWeight"): weight = float(sku["grossWeight"]) table_data.append([ product_url, article, product_name, product_description, stock, color, size, price, weight, image_lines, product_path ]) self.tags_extract(soup, table_data[-1]) table += table_data.copy() except Exception as error: print(f"Extractor Error: {error}") if offers_count == products_links_count + category_from: continue_loop = False else: category_from += products_links_count csv_name = "_".join(category.split("/")[4:]).replace(":", "-").replace("?", "_").replace("=", "_") recorder.record(csv_name, table) def eobuwie_extract_method(self, parser, recorder, categories): for i, category in enumerate(categories): table = [self.headers] print(f"Categories: {i + 1} / {len(categories)}", category) category_page = 1 category_marka = category.split(":")[2].split("?")[0] category_type = category.split("/")[4] while True: category_products_data = parser.parse(f"https://eobuwie.com.pl/t-api/rest/search/eobuwie/v5/search?channel=eobuwie¤cy=PLN&locale=pl_PL&limit=100&page={category_page}&filters[marka][in][]={category_marka}&categories[]={category_type}&select[]=url_key&select[]=product_group_associated&select[]=images&select[]=final_price&select[]=footwear_size&select_locales[]=pl_PL", return_type="json") total = category_products_data["total"] products = category_products_data["products"] for e, product in enumerate(products): short_url = product["values"]["url_key"]["value"]["pl_PL"] product_url = f"https://eobuwie.com.pl/p/{short_url}" print(f"Products: {e + 1 + ((category_page - 1) * 100)} / {total}", product_url) product_page = parser.parse(product_url) if product_page == None: continue soup = BeautifulSoup(product_page, "html.parser") links = soup.select(".breadcrumb-list .text-link")[2:] product_location = "/".join(list(map(lambda x: x.text, links))) product_group = "" if product["values"].get("product_group_associated") and product["values"]["product_group_associated"].get("value"): product_group = product["values"]["product_group_associated"]["value"] product_name = soup.select("[data-test-id=\"product-name\"]")[0].text.strip() product_name = split(r"\d", product_name)[0] 
product_name = f"{product_name} - {product_group}" images_list = [] if product["values"].get("images") and product["values"]["images"].get("value"): for image in product["values"]["images"]["value"]: if image.get("url"): images_list.append(f'https://img.modivo.cloud/eob_product_1800w_1800h({image["url"]}.jpg,webp)') images_list = "\n".join(images_list) for i, variant in enumerate(product["variants"].values()): try: table_data = [] size_url = variant["size"] variant_url = f"{product_url}?size={size_url}" article = variant["id"] size_name = "" if variant["values"].get("footwear_size"): size_name = variant["values"]["footwear_size"]["value"]["label"] description = "" location = f"Каталог/Обувь и аксессуары/{product_location}" availability = variant["stock_quantity"] if variant["stock_quantity"]: price = variant["offers"][0]["final_price"]["amount"] else: price = product["values"]["final_price"]["value"]["pl_PL"]["PLN"]["amount"] table_data.append([ variant_url, article, size_name, description, product_name, images_list, location, price, availability ]) self.tags_extract(soup, table_data[-1]) table += table_data.copy() except Exception as error: print(f"Extractor Error: {error}") if category_page * 100 >= total: break category_page += 1 csv_name = category.split("/")[-1].replace(":", "-").replace("?", "_").replace("=", "_") recorder.record(csv_name, table) # ──────────────────────────────────────────────────────────────── # ZARA HOME — обновлённый метод # ──────────────────────────────────────────────────────────────── def zarahome_extract_method(self, parser, recorder, categories): BASE_API = "https://www.zarahome.com/itxrest/3/catalog/store/85009924/80290000" USER_BRAND = "ZARAHOME" for i, category in enumerate(categories): table = [self.headers] print(f"Categories: {i + 1} / {len(categories)} {category}") # ── HTML категории ─────────────────────────────────────── html = parser.parse(category) if html is None: print("Extractor Error: empty category page"); continue soup = BeautifulSoup(html, "html.parser") script = soup.select_one("#serverApp-state") if not script: print("Extractor Error: script#serverApp-state not found"); continue try: state = loads(script.string) except Exception as e: print(f"Extractor Error: bad JSON ({e})"); continue # ── category_id ────────────────────────────────────────── cdata = state.get("inditex-data", {}) cat_id = (cdata.get("iCategoryId") or cdata.get("categoryId") or cdata.get("iCategoryJSON", {}).get("id")) if not cat_id: for k in state: m = search(r"/category/(\d+)/product", k) if m: cat_id = m.group(1); break if not cat_id: print("Extractor Error: cannot detect category_id"); continue # ── блок с продуктами или их ID ───────────────────────── key = next((k for k in state if f"/category/{cat_id}/product" in k), None) if not key: print("Extractor Error: products block not found"); continue prod_block = state[key] summaries = [] # ★ СТАРАЯ схема: в JSON уже есть ["products"] if "products" in prod_block: for grp in prod_block["products"]: for s in grp["bundleProductSummaries"]: summaries.append({ "productUrl": s.get("productUrl", ""), "__full": None, # полного JSON пока нет "detail": s["detail"] # нужен reference }) # ★ НОВАЯ схема: есть только ID-шки, тянем их пачками else: ids = (prod_block.get("productIds") or prod_block.get("sortedProductIds") or prod_block.get("sortedProductIdsByPricesAsc") or []) print(f"→ pulling {len(ids)} products via API") CHUNK = 1 for p in range(0, len(ids), CHUNK): ids_chunk = ",".join(map(str, ids[p:p+CHUNK])) api = 
(f"{BASE_API}/productsArray" f"?languageId=-1&productIds={ids_chunk}&appId=1") data = parser.parse(api, return_type="json") # печатаем красиво (ANSI-символы не экранируем, чтобы было читаемо) print("\n=== RAW API JSON ===") print(textwrap.indent(json.dumps(data, ensure_ascii=False, indent=2), " ")) print("=== END ===\n") #### Печать в файл fname = PurePath(api).parts[-1].split("?")[0] # productsArray ts = int(time.time()) Path(f"/Users/valis/Yandex.Disk.localized/Python3/Parsing ZARAHOME/src_2024-09-05/records_folderdebug_{fname}_{ts}.json").write_text( json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8" ) print(f"→ RAW saved to debug_{fname}_{ts}.json") for prod in data.get("products", []): summaries.append({ "productUrl": prod.get("productUrl", ""), "__full": prod # уже полный JSON }) # ── путь категории для итоговой таблицы ───────────────── cat_json = cdata.get("iCategoryJSON", {}) cat_title = "/".join(cat_json.get("parentNames", []) + [cat_json.get("name", "")]) cat_path = f"Каталог/ZaraHome/{cat_title}" seen = set() for n, summary in enumerate(summaries, 1): short_url = summary.get("productUrl") if not short_url or short_url in seen: continue seen.add(short_url) print(f"Products: {n} / {len(summaries)} " f"https://www.zarahome.com/pl/{short_url}") # ── получаем полный JSON товара ───────────────────── prod = summary.get("__full") if prod is None: # старая схема ref_id = summary["detail"]["reference"].split("-")[0] api = (f"{BASE_API}/productsArray" f"?languageId=-1&referenceIds={ref_id}&appId=1") data = parser.parse(api, return_type="json") if not data or "products" not in data: print(f"Skip (no data) → {short_url}"); continue prod = data["products"][0] det = prod["detail"] url_full = f"https://www.zarahome.com/pl/en/{prod.get('productUrl','')}" article = det["displayReference"] name = prod["name"] descr = det["longDescription"] # ── перед блоком "все изображения" ─────────────────────────────── print("DETAIL KEYS:", list(det.keys())[:20]) # покажем первые 20 ключей print( textwrap.indent( json.dumps(det, ensure_ascii=False, indent=2), # полный JSON prefix=" " # немного отступа ) ) # ───────────────────────────────────────────────────────────────── # ── ВСЕ ИЗОБРАЖЕНИЯ ────────────────────────────────────────────── # raw_xmedia → либо список set-ов, либо None raw_xmedia = (det.get("xmedia") or prod.get("xmedia") or []) # default_idx → целое число (индекс) либо None default_idx = det.get("xmediaDefaultSet") # получаем список наборов, которые надо разобрать if isinstance(raw_xmedia, list) and raw_xmedia: if isinstance(default_idx, int): media_sets = [raw_xmedia[default_idx]] # только дефолтный else: media_sets = raw_xmedia # все наборы elif isinstance(raw_xmedia, dict): media_sets = [raw_xmedia] # иногда словарь else: media_sets = [] all_imgs = [ f"https://static.zarahome.net/8/photos4{loc['path']}/{m['idMedia']}2.jpg" for loc in media_sets for m in loc["xmediaItems"][0]["medias"] ] all_imgs_s = "\n".join(all_imgs) # состав colors_list = det.get("colors") or [] # может быть [] #### colors_list = det.get("colors") or [] if not colors_list: # псевдо-цвет colors_list = [{ "id": 0, "name": "DEFAULT", "image": {"url": ""}, "sizes": [{ "visibilityValue": "SHOW", "name": "", "description": "", "weight": prod.get("weight", ""), "price": prod.get("price", 0) }] }] ## comp_block = det.get("compositionDetail") or \ (colors_list[0].get("compositionDetail") if colors_list else None) comp_txt = "" if comp_block and comp_block.get("parts"): comp_txt = "\n".join( 
extract_components_zarahome(comp_block["parts"]) ) # уход care = "\n".join(c["description"] for c in det["care"]) # traceability trace = "" if colors_list and colors_list[0].get("traceability"): trace = "\n".join( f"{v['name']}\n" + "\n".join(v["country"]) for v in colors_list[0]["traceability"].values() if isinstance(v, dict) and v.get("country") and v.get("name") ) # ── цвета / размеры ───────────────────────────────── serial = 0 rows = [] if not colors_list: # у товара вообще нет вариантов цвета continue # переходим к следующему товару for clr in colors_list: if clr["image"] is None: continue clr_code = clr.get("id") clr_name = clr.get("name", "") # безопасно строим картинку: если поля нет — остаётся пусто clr_image = "" if clr.get("image") and clr["image"].get("url"): clr_image = f"https://static.zarahome.net/8/photos4{clr['image']['url']}_3_1_5.jpg" # ── ИЗОБРАЖЕНИЯ ЭТОГО ЦВЕТА ───────────────────────────────────── raw_xmedia = (det.get("xmedia") or prod.get("xmedia") or []) default_idx = det.get("xmediaDefaultSet") if isinstance(raw_xmedia, list) and raw_xmedia: media_sets = [raw_xmedia[default_idx]] if isinstance(default_idx, int) else raw_xmedia elif isinstance(raw_xmedia, dict): media_sets = [raw_xmedia] else: media_sets = [] clr_imgs = [ f"https://static.zarahome.net/8/photos4{loc['path']}/{m['idMedia']}2.jpg" for loc in media_sets if loc.get("colorCode") == clr_code for m in loc["xmediaItems"][0]["medias"] ] clr_imgs_s = "\n".join(clr_imgs) for size in clr["sizes"]: if size["visibilityValue"] != "SHOW": continue suffix = "" if serial == 0 else f"-{serial}" serial += 1 size_name = size["name"] size_descr = size["description"] size_full = f"{size_descr} ({size_name})" if size_descr else size_name weight = size.get("weight") or prod.get("weight", "") buy_price = int(size.get("price") or prod.get("price", 0)) / 100 rows.append([ url_full, f"{article}{suffix}", name, descr, clr_image, clr_name, size_full, buy_price, weight, all_imgs_s, clr_imgs_s, comp_txt, care, trace, cat_path, USER_BRAND ]) table += rows # ── сохраняем категорию ──────────────────────────────── csv_name = category.split("/")[-1] recorder.record(csv_name, table) def get_extractor(): with open(abspath("parse_settings.json"), "r", encoding="utf-8") as file: return Extractor(load(file))