from json import load, loads
from os.path import abspath
from bs4 import BeautifulSoup
from lxml import etree
from re import split, search, sub
import json, textwrap, time
from pathlib import Path, PurePath


def extract_components_zarahome(parts):
    # Flatten a Zara Home composition block ("parts") into a list of text lines:
    # part/area names with their share, followed by "percentage material" entries.
    composition = []

    for part in parts:
        if part.get("areas") and part.get("description"):
            if len(parts) != 1:
                composition.append(part["description"])
            for area in part["areas"]:
                area_name = area["description"]
                percentage_area = area["percentageArea"]

                composition.append(f"{area_name} ({percentage_area})")
                for component in area["components"]:
                    material = component["material"]
                    percentage = component["percentage"]

                    composition.append(f"{percentage} {material}")
        elif part.get("components") and part.get("description"):
            if len(parts) != 1:
                composition.append(part["description"])
            for component in part["components"]:
                material = component["material"]
                percentage = component["percentage"]

                composition.append(f"{percentage} {material}")

    return composition
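
# A small illustrative sketch of the payload this helper expects; the field
# names come from the accesses above, while the concrete values are invented:
#
#     sample_parts = [{
#         "description": "LINING",
#         "components": [{"material": "COTTON", "percentage": "100%"}]
#     }]
#     extract_components_zarahome(sample_parts)
#     # -> ["100% COTTON"]  (the part name is only added when there are several parts)
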
# Class that extracts the required data fields for each supported store
class Extractor:
    def __init__(self, json_data):
        self.methods = {
            "": (self.default_extract_method, []),
            "zarahome": (self.zarahome_extract_method, [
                "Краткое описание",
                "Артикул",
                "Название товара или услуги",
                "Полное описание",
                "Образец цвета",
                "Свойство: Цвет",
                "Свойство: Размер",
                "Цена закупки",
                "Свойство: Вес(г)",
                "Изображения",
                "Изображения варианта",
                "Параметр: Состав",
                "Параметр: Уход",
                "Параметр: Происхождение",
                "Размещение на сайте",
                "Свойство: Бренд"
            ]),
            "eobuwie": (self.eobuwie_extract_method, [
                "Краткое описание",
                "Артикул",
                "Свойство: Размер",
                "Полное описание(Таблица)",
                "Название товара или услуги",
                "Изображения",
                "Размещение на сайте",
                "Цена",
                "Наличие"
            ]),
            "decathlon": (self.decathlon_extract_method, [
                "Краткое описание",
                "Артикул",
                "Название товара или услуги",
                "Полное описание",
                "Наличие",
                "Свойство: Цвет",
                "Свойство: Размер",
                "Цена закупки",
                "Параметр: Вес(г)",
                "Изображения варианта",
                "Размещение на сайте"
            ]),
            "zara": (self.zara_extract_method, [
                "Краткое описание",
                "Артикул",
                "Название товара или услуги",
                "Наличие",
                "Образец цвета",
                "Свойство: Цвет",
                "Свойство: Размер",
                "Цена закупки",
                "Изображения",
                "Параметр: Состав",
                "Параметр: Уход",
                "Параметр: Происхождение",
                "Размещение на сайте",
                "Свойство: Бренд"
            ]),
            "chanel": (self.chanel_extract_method, [
                "Краткое описание",
                "Артикул",
                "Наличие",
                "Свойство: Цвет",
                "Свойство: Размер",
                "Цена закупки",
                "Изображения",
                "Размещение на сайте",
                "Свойство: Бренд"
            ])
        }
        self.method = json_data["method"]
        self.tags = json_data["tags"]

        self.headers = self.methods[self.method][1]

        # User-defined XPath columns are spliced into the header row.
        for tag in self.tags:
            self.headers.insert(tag["column_number"], tag["column_name"])
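
    # A sketch of the parse_settings.json structure this constructor expects,
    # inferred from the fields read here and in tags_extract(); the concrete
    # values are illustrative only:
    #
    #     {
    #         "method": "zara",
    #         "tags": [
    #             {"column_number": 3, "column_name": "Extra column", "xpath": "//h1"}
    #         ]
    #     }
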
    def extract(self, parser, recorder, categories):
        self.methods[self.method][0](parser, recorder, categories)

    def default_extract_method(self, parser, recorder, categories):
        # No-op fallback; it takes the same arguments as the real extract
        # methods so that extract() can call it uniformly.
        pass

    def tags_extract(self, soup, row):

        dom_tree = etree.HTML(str(soup))

        for tag in self.tags:

            xpath_result = dom_tree.xpath(tag["xpath"])

            column_data = ""

            if len(xpath_result):
                for element in xpath_result:
                    # Accumulate the text of every matched node, one line per match.
                    column_data += ''.join(element.itertext()).strip() + "\n"

            row.insert(tag["column_number"], column_data)
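
    # Illustrative sketch (values invented): with self.tags set to
    #     [{"column_name": "Heading", "column_number": 2, "xpath": "//h1"}]
    # a row like ["url", "SKU", "Name"] becomes
    #     ["url", "SKU", "<joined text of every h1>", "Name"]
    # after tags_extract(soup, row).
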
    def chanel_extract_method(self, parser, recorder, categories):

        BASE_URL = "https://www.chanel.com"

        for i, category in enumerate(categories):
            table = [self.headers]

            print(f"Categories: {i + 1} / {len(categories)}", category)

            continue_loop = True
            category_page = 1
            request_elements_count = 24
            product_number = 1

            category_pattern = r"\/pl\/[\w\d]+\/"
            location = "chanel/" + search(category_pattern, category)[0].replace("pl", "").replace("/", "")

            while continue_loop:

                category_data = parser.parse(f"{category}?requestType=ajax&page={category_page}&totalElementsCount={request_elements_count}", return_type="json")

                if not category_data["next"]:
                    continue_loop = False

                products_count = category_data["totalProducts"]

                for product in category_data["dataLayer"]["productList"].values():

                    first_variant = True
                    article_pattern = r"\/p\/[\d\w]+/"
                    base_link = BASE_URL + product["quickviewPopin"]["page"]

                    print(f"Products: {product_number} / {products_count}", base_link)
                    product_number += 1

                    links = [base_link]

                    while len(links):

                        product_url = links.pop(0)
                        product_page = parser.parse(product_url)

                        if product_page is None:
                            continue

                        soup = BeautifulSoup(product_page, "html.parser")

                        # On the first variant, queue the URLs of the other colour variants.
                        if first_variant:
                            first_variant = False

                            variants_links = soup.select(".link.js-tabpanel-anchor")
                            replace_pattern = r"\/p\/.+$"

                            for variant_link in variants_links:
                                article = variant_link.get("data-value")

                                if article not in product_url:
                                    links.append(sub(replace_pattern, f"/p/{article}", product_url))

                        product_url = soup.select("[property=\"og:url\"]")[0].get("content")
                        article = search(article_pattern, product_url)[0].replace("/", "").replace("p", "")

                        product_info = parser.parse(f"{BASE_URL}/pl/yapi/product/{article}?options=basic,vto,variants,stock&site=chanel", return_type="json")

                        stock = 0
                        if product_info["stock"]["stockLevel"] == "IN_STOCK":
                            stock = 1

                        product_color_name = product_info["color"]["name"]
                        product_size = product_info.get("size")
                        product_price = product_info["buyNow"].get("priceValue")
                        images = "\n".join(map(lambda x: x["url"], product_info["basic"]["images"]))
                        product_brand = "chanel"

                        try:
                            table_data = []

                            table_data.append([
                                product_url,
                                article,
                                stock,
                                product_color_name,
                                product_size,
                                product_price,
                                images,
                                location,
                                product_brand
                            ])

                            self.tags_extract(soup, table_data[-1])

                            table += table_data.copy()

                        except Exception as error:
                            print(f"Extractor Error: {error}")

                # Advance to the next page of the category listing.
                category_page += 1

            csv_name = category.replace(f"{BASE_URL}/pl/", "").replace("/", "_")
            recorder.record(csv_name, table)

    def zara_extract_method(self, parser, recorder, categories):

        BASE_URL = "https://www.zara.com"
        BASE_POLISH_URL = "https://www.zara.com/pl/en/"

        for i, category in enumerate(categories):
            table = [self.headers]

            print(f"Categories: {i + 1} / {len(categories)}", category)

            category_page = parser.parse(category)
            category_soup = BeautifulSoup(category_page, "html.parser")

            # Follow the bot-verification redirect from the meta refresh tag.
            verify_url = category_soup.select("[http-equiv=\"refresh\"]")[0].get("content").split("'")[1]
            bm_verify = verify_url.split("?")[-1]

            category_page = parser.parse(BASE_URL + verify_url)
            category_soup = BeautifulSoup(category_page, "html.parser")

            tag_script_inner = category_soup.select("[type=\"text/javascript\"][data-compress=\"true\"]")[0].text
            analytics_data = loads(search(r"zara\.analyticsData\s?=\s?{.+};", tag_script_inner)[0].split("=")[1].replace(";", ""))

            category_id = analytics_data["catGroupId"]
            category_products = parser.parse(f"{BASE_POLISH_URL}category/{category_id}/products?ajax=true", return_type="json")

            location = "ZARA/" + "/".join(category.split("/")[5].split("-")[:2]).upper()

            # First pass: count the products so progress can be reported.
            all_products_count = 0
            for element in category_products["productGroups"][0]["elements"]:
                products = element.get("commercialComponents")
                if not products:
                    continue
                for product in products:
                    if not product.get("name"):
                        continue
                    all_products_count += 1

            product_number = 0

            for element in category_products["productGroups"][0]["elements"]:

                products = element.get("commercialComponents")

                if not products:
                    continue

                for product in products:

                    product_name = product.get("name")

                    if not product_name:
                        continue

                    product_number += 1

                    seo_keyword = product["seo"]["keyword"]
                    seo_id = product["seo"]["seoProductId"]

                    if not seo_keyword:
                        continue

                    product_url = f"{BASE_POLISH_URL}{seo_keyword}-p{seo_id}.html"

                    print(f"Products: {product_number} / {all_products_count}", product_url)

                    article = product["detail"]["displayReference"]
                    product_color_hex = product["colorInfo"].get("mainColorHexCode")
                    product_color_name = product["detail"]["colors"][0]["name"]
                    product_price = product["price"] / 100
                    product_brand = product["brand"].get("brandGroupCode")

                    product_page = parser.parse(f"{product_url}?{bm_verify}")

                    if product_page is None:
                        continue

                    soup = BeautifulSoup(product_page, "html.parser")

                    sizes = soup.select("[data-qa-action][role=\"option\"]")
                    images = "\n".join(map(lambda x: x.get("srcset").split(", ")[-1].split(" ")[0], soup.select("source[sizes=\"32vw\"]")))

                    product_id = product["id"]
                    extra_data = parser.parse(f"https://www.zara.com/pl/pl/product/{product_id}/extra-detail?ajax=true", return_type="json")

                    # Collect materials / care / origin text from the extra-detail sections.
                    extra_data_extracted = {}

                    for section in extra_data:
                        extra_data_extracted[section["sectionType"]] = ""

                        for component in section["components"]:
                            if component["datatype"] in ["subtitle", "paragraph"]:
                                extra_data_extracted[section["sectionType"]] += component["text"]["value"] + "\n"

                            elif component["datatype"] == "spacer":
                                extra_data_extracted[section["sectionType"]] += "\n"

                            elif component["datatype"] == "iconList":
                                for item in component["items"]:
                                    if item["datatype"] == "iconListItem" and item["description"]["datatype"] == "text":
                                        extra_data_extracted[section["sectionType"]] += item["description"]["value"] + "\n"

                    materials = extra_data_extracted.get("materials")
                    care = extra_data_extracted.get("care")
                    origin = extra_data_extracted.get("origin")

                    for size in sizes:
                        try:
                            table_data = []

                            if size.get("data-qa-action") == "size-in-stock":
                                stock = 1
                            else:
                                stock = 0

                            product_size = size.select(".product-size-info__main-label")[0].text

                            table_data.append([
                                product_url,
                                f"{article} - {product_size}",
                                product_name,
                                stock,
                                product_color_hex,
                                product_color_name,
                                product_size,
                                product_price,
                                images,
                                materials,
                                care,
                                origin,
                                location,
                                product_brand
                            ])

                            self.tags_extract(soup, table_data[-1])

                            table += table_data.copy()

                        except Exception as error:
                            print(f"Extractor Error: {error}")

            csv_name = category.split("/")[-1].split("?")[0]
            recorder.record(csv_name, table)

    def decathlon_extract_method(self, parser, recorder, categories):

        BASE_URL = "https://www.decathlon.pl"

        for i, category in enumerate(categories):
            table = [self.headers]

            print(f"Categories: {i + 1} / {len(categories)}", category)

            continue_loop = True
            category_from = 0

            while continue_loop:

                category_page = parser.parse(f"{category}?from={category_from}")
                category_soup = BeautifulSoup(category_page, "html.parser")

                offers_count = int(category_soup.select("h1 ~ span.count")[0].text.split(" ")[0])

                products_links = category_soup.select("[class$=\"model-link\"]")
                products_links_count = len(products_links)

                for e, product_link in enumerate(products_links):

                    product_url = BASE_URL + product_link.get("href")

                    print(f"Products: {e + 1 + category_from} / {offers_count}", product_url)

                    product_page = parser.parse(product_url)

                    if product_page is None:
                        continue

                    soup = BeautifulSoup(product_page, "html.parser")

                    meta_script_tags = soup.select("[type=\"application/ld+json\"]")

                    if len(meta_script_tags) <= 1:
                        continue

                    meta_data = loads(meta_script_tags[1].text)

                    # Build the category path from the breadcrumb structured data.
                    path_steps = []
                    for step in meta_data["itemListElement"]:
                        path_steps.append(step["item"]["name"])

                    product_path = "decathlon/" + "/".join(path_steps)

                    script_json = soup.select("#__dkt")[0]
                    __dkt = loads(script_json.text.replace("__DKT = ", ""))

                    if __dkt["_ctx"]["page"]["id"] != "product":
                        continue

                    models_data = __dkt["_ctx"]["data"][4]["data"]["models"]

                    for model in models_data:

                        color = ""
                        colors = []

                        if model.get("colors"):
                            for color_info in model["colors"]:
                                colors.append(color_info["label"])

                            color = " / ".join(colors)

                        images = []
                        for image_info in model["images"]["product"]:
                            images.append(image_info["url"].replace("/250x250", ""))

                        image_lines = "\n".join(images)

                        product_name = model["webLabel"]

                        product_description = soup.select("[id^=\"ProductFunctionalities\"]")
                        if len(product_description):
                            product_description = product_description[0].encode_contents()
                        else:
                            product_description = ""

                        skus_data = model["skus"]

                        sku_ids = []
                        for sku in skus_data:
                            sku_ids.append(sku["skuId"])

                        sku_ids = ",".join(sku_ids)

                        stocks = parser.parse(f"https://www.decathlon.pl/pl/ajax/nfs/stocks/online?skuIds={sku_ids}", return_type="json")

                        for sku in skus_data:
                            try:

                                sku_id = sku["skuId"]

                                stock = stocks[sku_id]["stockOnline"] if stocks.get(sku_id) else "unknown"

                                table_data = []

                                article = f'{model["modelId"]}-{sku_id}'

                                size = ""
                                if sku.get("size"):
                                    size = sku["size"]

                                price = ""
                                if sku.get("price"):
                                    price = sku["price"]

                                weight = ""
                                if sku.get("grossWeight"):
                                    weight = float(sku["grossWeight"])

                                table_data.append([
                                    product_url,
                                    article,
                                    product_name,
                                    product_description,
                                    stock,
                                    color,
                                    size,
                                    price,
                                    weight,
                                    image_lines,
                                    product_path
                                ])

                                self.tags_extract(soup, table_data[-1])

                                table += table_data.copy()

                            except Exception as error:
                                print(f"Extractor Error: {error}")

                # Stop when every offer in the category has been visited.
                if offers_count == products_links_count + category_from:
                    continue_loop = False
                else:
                    category_from += products_links_count

            csv_name = "_".join(category.split("/")[4:]).replace(":", "-").replace("?", "_").replace("=", "_")
            recorder.record(csv_name, table)

    def eobuwie_extract_method(self, parser, recorder, categories):

        for i, category in enumerate(categories):
            table = [self.headers]

            print(f"Categories: {i + 1} / {len(categories)}", category)

            category_page = 1

            category_marka = category.split(":")[2].split("?")[0]
            category_type = category.split("/")[4]

            while True:

                category_products_data = parser.parse(f"https://eobuwie.com.pl/t-api/rest/search/eobuwie/v5/search?channel=eobuwie&currency=PLN&locale=pl_PL&limit=100&page={category_page}&filters[marka][in][]={category_marka}&categories[]={category_type}&select[]=url_key&select[]=product_group_associated&select[]=images&select[]=final_price&select[]=footwear_size&select_locales[]=pl_PL", return_type="json")

                total = category_products_data["total"]
                products = category_products_data["products"]

                for e, product in enumerate(products):

                    short_url = product["values"]["url_key"]["value"]["pl_PL"]
                    product_url = f"https://eobuwie.com.pl/p/{short_url}"

                    print(f"Products: {e + 1 + ((category_page - 1) * 100)} / {total}", product_url)

                    product_page = parser.parse(product_url)

                    if product_page is None:
                        continue

                    soup = BeautifulSoup(product_page, "html.parser")

                    links = soup.select(".breadcrumb-list .text-link")[2:]
                    product_location = "/".join(list(map(lambda x: x.text, links)))

                    product_group = ""
                    if product["values"].get("product_group_associated") and product["values"]["product_group_associated"].get("value"):
                        product_group = product["values"]["product_group_associated"]["value"]

                    product_name = soup.select("[data-test-id=\"product-name\"]")[0].text.strip()
                    product_name = split(r"\d", product_name)[0]
                    product_name = f"{product_name} - {product_group}"

                    images_list = []
                    if product["values"].get("images") and product["values"]["images"].get("value"):
                        for image in product["values"]["images"]["value"]:
                            if image.get("url"):
                                images_list.append(f'https://img.modivo.cloud/eob_product_1800w_1800h({image["url"]}.jpg,webp)')

                    images_list = "\n".join(images_list)

                    # Iterate over every size variant of the product.
                    for variant in product["variants"].values():
                        try:
                            table_data = []

                            size_url = variant["size"]
                            variant_url = f"{product_url}?size={size_url}"

                            article = variant["id"]

                            size_name = ""
                            if variant["values"].get("footwear_size"):
                                size_name = variant["values"]["footwear_size"]["value"]["label"]

                            description = ""
                            location = f"Каталог/Обувь и аксессуары/{product_location}"

                            availability = variant["stock_quantity"]

                            if variant["stock_quantity"]:
                                price = variant["offers"][0]["final_price"]["amount"]
                            else:
                                price = product["values"]["final_price"]["value"]["pl_PL"]["PLN"]["amount"]

                            table_data.append([
                                variant_url,
                                article,
                                size_name,
                                description,
                                product_name,
                                images_list,
                                location,
                                price,
                                availability
                            ])

                            self.tags_extract(soup, table_data[-1])

                            table += table_data.copy()

                        except Exception as error:
                            print(f"Extractor Error: {error}")

                if category_page * 100 >= total:
                    break

                category_page += 1

            csv_name = category.split("/")[-1].replace(":", "-").replace("?", "_").replace("=", "_")
            recorder.record(csv_name, table)

    # ────────────────────────────────────────────────────────────────
    # ZARA HOME - updated method
    # ────────────────────────────────────────────────────────────────
    def zarahome_extract_method(self, parser, recorder, categories):

        BASE_API = "https://www.zarahome.com/itxrest/3/catalog/store/85009924/80290000"
        USER_BRAND = "ZARAHOME"

        for i, category in enumerate(categories):
            table = [self.headers]
            print(f"Categories: {i + 1} / {len(categories)} {category}")

            # ── category HTML ────────────────────────────────────────
            html = parser.parse(category)
            if html is None:
                print("Extractor Error: empty category page")
                continue
            soup = BeautifulSoup(html, "html.parser")

            script = soup.select_one("#serverApp-state")
            if not script:
                print("Extractor Error: script#serverApp-state not found")
                continue
            try:
                state = loads(script.string)
            except Exception as e:
                print(f"Extractor Error: bad JSON ({e})")
                continue

            # ── category_id ──────────────────────────────────────────
            cdata = state.get("inditex-data", {})
            cat_id = (cdata.get("iCategoryId")
                      or cdata.get("categoryId")
                      or cdata.get("iCategoryJSON", {}).get("id"))
            if not cat_id:
                for k in state:
                    m = search(r"/category/(\d+)/product", k)
                    if m:
                        cat_id = m.group(1)
                        break
            if not cat_id:
                print("Extractor Error: cannot detect category_id")
                continue

            # ── block with the products or their IDs ─────────────────
            key = next((k for k in state if f"/category/{cat_id}/product" in k), None)
            if not key:
                print("Extractor Error: products block not found")
                continue
            prod_block = state[key]

            summaries = []

            # ★ OLD schema: the JSON already contains ["products"]
            if "products" in prod_block:
                for grp in prod_block["products"]:
                    for s in grp["bundleProductSummaries"]:
                        summaries.append({
                            "productUrl": s.get("productUrl", ""),
                            "__full": None,         # full product JSON not fetched yet
                            "detail": s["detail"]   # the reference is needed later
                        })

            # ★ NEW schema: only product IDs are present, pull them in batches
            else:
                ids = (prod_block.get("productIds")
                       or prod_block.get("sortedProductIds")
                       or prod_block.get("sortedProductIdsByPricesAsc")
                       or [])
                print(f"→ pulling {len(ids)} products via API")
                CHUNK = 1  # number of IDs per request
                for p in range(0, len(ids), CHUNK):
                    ids_chunk = ",".join(map(str, ids[p:p+CHUNK]))
                    api = (f"{BASE_API}/productsArray"
                           f"?languageId=-1&productIds={ids_chunk}&appId=1")
                    data = parser.parse(api, return_type="json")

                    # Pretty-print the raw response (non-ASCII characters are not
                    # escaped, so the output stays readable)
                    print("\n=== RAW API JSON ===")
                    print(textwrap.indent(json.dumps(data, ensure_ascii=False, indent=2), " "))
                    print("=== END ===\n")

                    # Dump the raw response to a debug file (hard-coded absolute path)
                    fname = PurePath(api).parts[-1].split("?")[0]  # productsArray
                    ts = int(time.time())
                    Path(f"/Users/valis/Yandex.Disk.localized/Python3/Parsing ZARAHOME/src_2024-09-05/records_folderdebug_{fname}_{ts}.json").write_text(
                        json.dumps(data, ensure_ascii=False, indent=2),
                        encoding="utf-8"
                    )
                    print(f"→ RAW saved to debug_{fname}_{ts}.json")

                    for prod in data.get("products", []):
                        summaries.append({
                            "productUrl": prod.get("productUrl", ""),
                            "__full": prod  # already the full JSON
                        })

            # ── category path for the output table ──────────────────
            cat_json = cdata.get("iCategoryJSON", {})
            cat_title = "/".join(cat_json.get("parentNames", []) +
                                 [cat_json.get("name", "")])
            cat_path = f"Каталог/ZaraHome/{cat_title}"

            seen = set()
            for n, summary in enumerate(summaries, 1):

                short_url = summary.get("productUrl")
                if not short_url or short_url in seen:
                    continue
                seen.add(short_url)
                print(f"Products: {n} / {len(summaries)} "
                      f"https://www.zarahome.com/pl/{short_url}")

                # ── fetch the full product JSON ──────────────────────
                prod = summary.get("__full")
                if prod is None:  # old schema
                    ref_id = summary["detail"]["reference"].split("-")[0]
                    api = (f"{BASE_API}/productsArray"
                           f"?languageId=-1&referenceIds={ref_id}&appId=1")
                    data = parser.parse(api, return_type="json")

                    if not data or "products" not in data:
                        print(f"Skip (no data) → {short_url}")
                        continue
                    prod = data["products"][0]

                det = prod["detail"]

                url_full = f"https://www.zarahome.com/pl/en/{prod.get('productUrl','')}"
                article = det["displayReference"]
                name = prod["name"]
                descr = det["longDescription"]

                # ── debug dump before the "all images" block ─────────────────────
                print("DETAIL KEYS:", list(det.keys())[:20])  # show the first 20 keys
                print(
                    textwrap.indent(
                        json.dumps(det, ensure_ascii=False, indent=2),  # full JSON
                        prefix=" "  # slight indent
                    )
                )
                # ─────────────────────────────────────────────────────────────────

                # ── ALL IMAGES ───────────────────────────────────────────────────
                # raw_xmedia is either a list of media sets or None
                raw_xmedia = (det.get("xmedia") or
                              prod.get("xmedia") or
                              [])

                # default_idx is an integer index or None
                default_idx = det.get("xmediaDefaultSet")

                # build the list of media sets that need to be walked
                if isinstance(raw_xmedia, list) and raw_xmedia:
                    if isinstance(default_idx, int):
                        media_sets = [raw_xmedia[default_idx]]  # only the default set
                    else:
                        media_sets = raw_xmedia  # all sets
                elif isinstance(raw_xmedia, dict):
                    media_sets = [raw_xmedia]  # occasionally a single dict
                else:
                    media_sets = []

                all_imgs = [
                    f"https://static.zarahome.net/8/photos4{loc['path']}/{m['idMedia']}2.jpg"
                    for loc in media_sets
                    for m in loc["xmediaItems"][0]["medias"]
                ]
                all_imgs_s = "\n".join(all_imgs)

                # composition
                colors_list = det.get("colors") or []  # may be []

                if not colors_list:  # synthesize a pseudo-colour so the loops below still run
                    colors_list = [{
                        "id": 0,
                        "name": "DEFAULT",
                        "image": {"url": ""},
                        "sizes": [{
                            "visibilityValue": "SHOW",
                            "name": "",
                            "description": "",
                            "weight": prod.get("weight", ""),
                            "price": prod.get("price", 0)
                        }]
                    }]

                comp_block = det.get("compositionDetail") or \
                             (colors_list[0].get("compositionDetail") if colors_list else None)

                comp_txt = ""
                if comp_block and comp_block.get("parts"):
                    comp_txt = "\n".join(
                        extract_components_zarahome(comp_block["parts"])
                    )

                # care instructions
                care = "\n".join(c["description"] for c in det["care"])

                # traceability
                trace = ""
                if colors_list and colors_list[0].get("traceability"):
                    trace = "\n".join(
                        f"{v['name']}\n" + "\n".join(v["country"])
                        for v in colors_list[0]["traceability"].values()
                        if isinstance(v, dict) and v.get("country") and v.get("name")
                    )

                # ── colours / sizes ──────────────────────────────────
                serial = 0
                rows = []
                if not colors_list:  # the product has no colour variants at all
                    continue  # move on to the next product

                for clr in colors_list:

                    if clr.get("image") is None:
                        continue

                    clr_code = clr.get("id")
                    clr_name = clr.get("name", "")
                    # build the swatch URL defensively: leave it empty if the field is missing
                    clr_image = ""
                    if clr.get("image") and clr["image"].get("url"):
                        clr_image = f"https://static.zarahome.net/8/photos4{clr['image']['url']}_3_1_5.jpg"

                    # ── images for this colour ───────────────────────────────────
                    raw_xmedia = (det.get("xmedia") or
                                  prod.get("xmedia") or
                                  [])

                    default_idx = det.get("xmediaDefaultSet")

                    if isinstance(raw_xmedia, list) and raw_xmedia:
                        media_sets = [raw_xmedia[default_idx]] if isinstance(default_idx, int) else raw_xmedia
                    elif isinstance(raw_xmedia, dict):
                        media_sets = [raw_xmedia]
                    else:
                        media_sets = []

                    clr_imgs = [
                        f"https://static.zarahome.net/8/photos4{loc['path']}/{m['idMedia']}2.jpg"
                        for loc in media_sets
                        if loc.get("colorCode") == clr_code
                        for m in loc["xmediaItems"][0]["medias"]
                    ]
                    clr_imgs_s = "\n".join(clr_imgs)

                    for size in clr["sizes"]:
                        if size["visibilityValue"] != "SHOW":
                            continue
                        suffix = "" if serial == 0 else f"-{serial}"
                        serial += 1

                        size_name = size["name"]
                        size_descr = size["description"]
                        size_full = f"{size_descr} ({size_name})" if size_descr else size_name
                        weight = size.get("weight") or prod.get("weight", "")
                        buy_price = int(size.get("price") or prod.get("price", 0)) / 100

                        rows.append([
                            url_full,
                            f"{article}{suffix}",
                            name,
                            descr,
                            clr_image,
                            clr_name,
                            size_full,
                            buy_price,
                            weight,
                            all_imgs_s,
                            clr_imgs_s,
                            comp_txt,
                            care,
                            trace,
                            cat_path,
                            USER_BRAND
                        ])

                table += rows

            # ── save the category ────────────────────────────────────
            csv_name = category.split("/")[-1]
            recorder.record(csv_name, table)


def get_extractor():
    with open(abspath("parse_settings.json"), "r", encoding="utf-8") as file:
        return Extractor(load(file))
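
# A minimal usage sketch. The Parser and Recorder classes are not defined in
# this module, so the import paths and constructor signatures below are
# hypothetical placeholders for whatever this project actually provides:
#
#     if __name__ == "__main__":
#         from parsing import Parser        # hypothetical: not part of this module
#         from recording import Recorder    # hypothetical: not part of this module
#
#         extractor = get_extractor()  # reads parse_settings.json
#         extractor.extract(Parser(), Recorder(), ["https://www.zarahome.com/pl/en/..."])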