From 00048eb2bd306e95acbc8b201b77d07a4480af8a Mon Sep 17 00:00:00 2001 From: va1is Date: Wed, 1 Oct 2025 16:21:00 +0300 Subject: [PATCH] =?UTF-8?q?IKEA-api-=D0=BF=D0=B5=D1=80=D0=B5=D0=BD=D0=BE?= =?UTF-8?q?=D1=81=20=D0=BD=D0=B0=20=D0=BF=D1=80=D0=BE=D0=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Парсер_IKEA/main.py | 26 +- Парсер_IKEA/main0.py | 723 ++++++++++++++++++++++++++++++++++ Парсер_IKEA/main_win proxy.py | 2 +- 3 files changed, 741 insertions(+), 10 deletions(-) create mode 100644 Парсер_IKEA/main0.py diff --git a/Парсер_IKEA/main.py b/Парсер_IKEA/main.py index fb7d810..960db85 100644 --- a/Парсер_IKEA/main.py +++ b/Парсер_IKEA/main.py @@ -1,3 +1,4 @@ + #!/usr/bin/env python3 # -*- coding: utf-8 -*- @@ -24,7 +25,7 @@ POST_LOG = os.path.join(RECORDS_DIR, "post_log.txt") Ответ: 1 (да) / 0 (нет). Пустой ввод = 1. ''' -POST_URL = os.getenv("IKEA_POST_URL", "http://localhost:3005/parser/data") +POST_URL = os.getenv("IKEA_POST_URL", "http://172.25.4.101:3005/parser/data") POST_API_KEY = os.getenv("IKEA_POST_API_KEY", "") POST_TIMEOUT = 20 BATCH_SIZE = 50 @@ -248,7 +249,9 @@ def format_dimensions(raw_dim_props, with_html=False, translated=False): br = "
" if with_html else "\n" title = translate_token("Wymiary") if translated else "Wymiary" - lines.append(f"{title}" if with_html else title) + + lines.append(f"{title}" if with_html else title) + #lines.append(f"{title}" if with_html else title) for d in raw_dim_props.get("dimensions", []): name = d.get("name", "") @@ -265,7 +268,8 @@ def format_dimensions(raw_dim_props, with_html=False, translated=False): pack = (raw_dim_props.get("packaging") or {}) pack_title = translate_token("Opakowanie") if translated else "Opakowanie" lines.append(br if with_html else "") - lines.append(f"{pack_title}" if with_html else pack_title) + lines.append(f"{pack_title}" if with_html else pack_title) + #lines.append(f"{pack_title}" if with_html else pack_title) content = (pack.get("contentProps") or {}).get("packages") or [] for pkg in content: @@ -322,7 +326,8 @@ def format_product_details(raw_details, add_summary_desc="", with_html=False, sk out.append(br if with_html else "") t1 = "Informacje o produkcie" - out.append(f"{t1}" if with_html else t1) + out.append(f"{t1}" if with_html else t1) + #out.append(f"{t1}" if with_html else t1) pd = (raw_details.get("productDescriptionProps") or {}) paragraphs = pd.get("paragraphs") or [] for p in paragraphs: @@ -343,7 +348,8 @@ def format_product_details(raw_details, add_summary_desc="", with_html=False, sk if gk: out.append(br if with_html else "") t2 = "Dobrze wiedzieć" - out.append(f"{t2}" if with_html else t2) + out.append(f"{t2}" if with_html else t2) + #out.append(f"{t2}" if with_html else t2) for item in gk: txt = item.get("text") if txt: @@ -356,7 +362,8 @@ def format_product_details(raw_details, add_summary_desc="", with_html=False, sk t3 = "Materiały i pielęgnacja" if mats or care: out.append(br if with_html else "") - out.append(f"{t3}" if with_html else t3) + out.append(f"{t3}" if with_html else t3) + #out.append(f"{t3}" if with_html else t3) if mats: out.append("Materiały") @@ -385,7 +392,8 @@ def format_product_details(raw_details, add_summary_desc="", with_html=False, sk if sc: out.append(br if with_html else "") t4 = "Bezpieczeństwo i zgodność z przepisami" - out.append(f"{t4}" if with_html else t4) + out.append(f"{t4}" if with_html else t4) + #out.append(f"{t4}" if with_html else t4) for s in sc: txt = s.get("text") if txt: @@ -597,8 +605,8 @@ def build_variant(row: dict) -> dict: } return { - #"category": {"name": category_name}, - "category": {"name": "TEST/IKEA"}, + "category": {"name": category_name}, + #"category": {"name": "TEST/IKEA"}, "brand": {"name": "ikea"}, "variant": variant, } diff --git a/Парсер_IKEA/main0.py b/Парсер_IKEA/main0.py new file mode 100644 index 0000000..9fcb4a1 --- /dev/null +++ b/Парсер_IKEA/main0.py @@ -0,0 +1,723 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os, json, re, math, time, html, requests, datetime +from bs4 import BeautifulSoup +from openpyxl import Workbook + +# ───────────────────────── ПУТИ / ФАЙЛЫ ─────────────────────────── +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +RECORDS_DIR = os.path.join(BASE_DIR, "records_folder") +os.makedirs(RECORDS_DIR, exist_ok=True) + +INPUT_FILE = os.path.join(BASE_DIR, "product_links.txt") +OUTPUT_FILE = os.path.join(RECORDS_DIR, "records.xlsx") +DICT_FILE = os.path.join(BASE_DIR, "dictionary_main.txt") +EXCL_FILE = os.path.join(BASE_DIR, "exclusion_materials.txt") +POST_LOG = os.path.join(RECORDS_DIR, "post_log.txt") + +# ───────────────────────── НАСТРОЙКИ POST ───────────────────────── +''' +На старте спросим: +- сохранять ли JSON батчи на диск +- отправлять ли батчи на API + +Ответ: 1 (да) / 0 (нет). Пустой ввод = 1. +''' +POST_URL = os.getenv("IKEA_POST_URL", "http://172.25.4.101:3005/parser/data") +POST_API_KEY = os.getenv("IKEA_POST_API_KEY", "") +POST_TIMEOUT = 20 +BATCH_SIZE = 50 + +# ───────────────────────── НАСТРОЙКИ САЙТА ──────────────────────── +HEADERS = {"User-Agent": "Mozilla/5.0"} +CSS_SELECTOR = ".pip-product__subgrid.product-pip.js-product-pip" + +BLOCKS = [ + "buyModule", + "productSummary", + "pipPricePackage", + "productInformationSection", + "keyFacts", + "stockcheckSection", + "availabilityGroup", + "productGallery", +] + +''' +Whitelist колонок для Excel. +''' +KEEP_COLUMNS = [ + "availabilityGroup.serverOnlineSellable", + "availabilityGroup.storeHeader", + "buyModule.onlineSellable", + "buyModule.productName", + "buyModule.productPrice", + "buyModule.productType", + "keyFacts.ariaLabels", + "keyFacts.gaLabel", + "keyFacts.keyFacts", + "keyFacts.keyFacts_formatted", + "pipPricePackage.measurementText", + "pipPricePackage.productDescription", + "productGallery.urls", + "productInformationSection.dimensionProps", + "productInformationSection.dimensionProps_formatted", + "productInformationSection.dimensionProps_formatted_html_translated", + "productInformationSection.productDetailsProps", + "productInformationSection.productDetailsProps_formatted", + "productInformationSection.productDetailsProps_formatted_html", + "productSummary.description", + "productSummary.visibleItemNo", + "stockcheckSection.packagingProps", + "stockcheckSection.typeName", + "total brutto", + "prductVariantColorMeasure", + "categoryBreadcrumb", + "originalName", # ### NEW: колонка для Excel + "url", +] + +# ───────────────────────── УТИЛИТЫ I/O ──────────────────────────── +def ask_bool(prompt: str, default: str = "1") -> bool: + ''' + Спрашивает 1/0; пустой ввод → default. + ''' + try: + val = input(f"{prompt} (1=yes, 0=no) [{default}]: ").strip() or default + except EOFError: + val = default + return val == "1" + +def _post_log(msg: str): + '''Пишем строку в post_log.txt (молча игнорируем ошибки).''' + try: + with open(POST_LOG, "a", encoding="utf-8") as f: + f.write(msg.rstrip() + "\n") + except Exception: + pass + +def _now_tag(): + return datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + +def _save_json_batch(payload: dict, batch_index: int): + fname = f"ikea_batch_{_now_tag()}_{batch_index:04d}.json" + fpath = os.path.join(RECORDS_DIR, fname) + with open(fpath, "w", encoding="utf-8") as fh: + json.dump(payload, fh, ensure_ascii=False, indent=2) + print(f"💾 JSON saved: {fname}") + return fpath + +# ───────────────────────── СЛОВАРИ / ФИЛЬТРЫ ────────────────────── +def load_dictionary(path: str) -> dict: + ''' + Читает словарь переводов: + "Wymiary" : "Размеры", + ... + ''' + if not os.path.isfile(path): + return {} + txt = open(path, "r", encoding="utf-8").read() + pairs = re.findall(r'"([^"]+)"\s*:\s*"([^"]+)"', txt) + return {k: v for k, v in pairs} + +DICT = load_dictionary(DICT_FILE) + +def translate_token(token: str) -> str: + return DICT.get(token, token) + +def load_exclusions(path: str) -> set: + ''' + Загружает токены исключений из exclusion_materials.txt: + - можно по одному на строку + - или через запятую + - регистр игнорируем + ''' + if not os.path.isfile(path): + return set() + txt = open(path, "r", encoding="utf-8").read() + # сначала из кавычек, если есть: + quoted = re.findall(r'"([^"]+)"', txt, flags=re.S) + tokens = quoted if quoted else re.split(r"[,;\n\r]+", txt) + return {t.strip().lower() for t in tokens if t.strip()} + +EXCLUSIONS = load_exclusions(EXCL_FILE) + +def materials_from_details_json(details: dict) -> list[str]: + ''' + Извлекаем ВСЕ строки из ключей "material" на любой глубине productDetailsProps. + Встречаются разные схемы, поэтому делаем обход рекурсивно. + ''' + out = [] + def walk(node): + if isinstance(node, dict): + for k, v in node.items(): + if k == "material" and isinstance(v, str): + out.append(v) + else: + walk(v) + elif isinstance(node, list): + for x in node: + walk(x) + walk(details or {}) + return out + +def materials_match_exclusions(details: dict, exclusion_tokens: set) -> bool: + ''' + True — если хоть один токен встречается в любом material (case-insensitive). + ''' + if not exclusion_tokens: + return False + mats = materials_from_details_json(details) + joined = "\n".join(mats).lower() + return any(tok in joined for tok in exclusion_tokens) + +# ───────────────────────── ФОРМАТТЕРЫ ───────────────────────────── +def _parse_json_value(val): + if isinstance(val, (dict, list)) or val is None: + return val + if isinstance(val, str): + s = val.strip() + if not s: + return val + try: + return json.loads(s) + except Exception: + return val + return val + +def flatten_block(block_name, data): + if not isinstance(data, dict): + return {} + flat = {} + for k, v in data.items(): + if block_name == "productGallery" and k == "mediaList": + if isinstance(v, list): + urls = [] + for item in v: + content = item.get("content", {}) + if isinstance(content, dict) and "url" in content: + urls.append(content["url"]) + flat["productGallery.urls"] = "\n".join(urls) + return flat + key = f"{block_name}.{k}" + flat[key] = v + return flat + +def format_keyfacts(raw_keyfacts): + if not isinstance(raw_keyfacts, list): + return "" + out = [] + header_added = False + for el in raw_keyfacts: + lbl = (el or {}).get("label") + name = (el or {}).get("name", "Właściwości") + if not header_added: + out.append(name) + header_added = True + if lbl: + out.append(lbl) + return "\n".join(out) + +def _fmt_float(x): + try: + return f"{float(x):.2f}".rstrip("0").rstrip(".") + except Exception: + return "" + +def _collect_packaging_total_kg(packaging): + total = 0.0 + if not isinstance(packaging, dict): + return total + content = (packaging.get("contentProps") or {}).get("packages") or [] + for pkg in content: + qty = ((pkg.get("quantity") or {}).get("value")) or 1 + ms = pkg.get("measurements") or [] + for block in ms: + if not isinstance(block, list): + continue + weight_lbl = next((m for m in block if (m.get("type") == "weight" or m.get("label") == "Waga")), None) + if weight_lbl and isinstance(weight_lbl.get("value"), (int, float)): + total += float(weight_lbl["value"]) * (qty or 1) + return total + +def format_dimensions(raw_dim_props, with_html=False, translated=False): + if not isinstance(raw_dim_props, dict): + return "" + lines = [] + br = "
" if with_html else "\n" + + title = translate_token("Wymiary") if translated else "Wymiary" + lines.append(f"{title}" if with_html else title) + + for d in raw_dim_props.get("dimensions", []): + name = d.get("name", "") + meas = d.get("measure", "") + if not name and not meas: + continue + if translated: + name_t = translate_token(name) + line = f"{name_t}: {meas}".strip() + else: + line = f"{name}: {meas}".strip() + lines.append(line) + + pack = (raw_dim_props.get("packaging") or {}) + pack_title = translate_token("Opakowanie") if translated else "Opakowanie" + lines.append(br if with_html else "") + lines.append(f"{pack_title}" if with_html else pack_title) + + content = (pack.get("contentProps") or {}).get("packages") or [] + for pkg in content: + name = pkg.get("name") or "" + if name: + lines.append(name) + + art = (pkg.get("articleNumber") or {}).get("value") + if art: + art_lbl = "Numer artykułu" + if translated: + art_lbl = translate_token(art_lbl) + lines.append(art_lbl) + lines.append(f"{art}") + + ms = pkg.get("measurements") or [] + for block in ms: + if not isinstance(block, list): + continue + for m in block: + lbl = m.get("label", "") + txt = m.get("text", "") + if translated: + lbl = translate_token(lbl) if lbl else lbl + if lbl or txt: + lines.append(f"{lbl}: {txt}".strip(": ")) + + q_val = ((pkg.get("quantity") or {}).get("value")) + if q_val: + q_lbl = "Paczka(i)" + if translated: + q_lbl = translate_token(q_lbl) + lines.append(f"{q_lbl}: {q_val}") + + if with_html: + s = br.join([x for x in lines if x is not None]) + s = re.sub(r"(" + re.escape(br) + r"){2,}", br*2, s) + s = s.strip(br) + # ### NEW: страховка — иногда первая "<" теряется в Excel-предпросмотре + if s.startswith("strong>"): + s = "<" + s + return s + return "\n".join([x for x in lines if x is not None]).strip() + +def format_product_details(raw_details, add_summary_desc="", with_html=False, skip_assembly=True): + if not isinstance(raw_details, dict): + return add_summary_desc if with_html else add_summary_desc + + br = "
" if with_html else "\n" + out = [] + + if add_summary_desc: + out.append(add_summary_desc) + out.append(br if with_html else "") + + t1 = "Informacje o produkcie" + out.append(f"{t1}" if with_html else t1) + pd = (raw_details.get("productDescriptionProps") or {}) + paragraphs = pd.get("paragraphs") or [] + for p in paragraphs: + out.append(p) + + dlabel = pd.get("designerLabel") + dname = pd.get("designerName") + if dlabel and dname: + out.append(dlabel) + out.append(dname) + + if raw_details.get("productId"): + out.append("Numer artykułu") + out.append(raw_details["productId"]) + + acc = (raw_details.get("accordionObject") or {}) + gk = ((acc.get("goodToKnow") or {}).get("contentProps") or {}).get("goodToKnow") or [] + if gk: + out.append(br if with_html else "") + t2 = "Dobrze wiedzieć" + out.append(f"{t2}" if with_html else t2) + for item in gk: + txt = item.get("text") + if txt: + out.append(txt) + + mac = (acc.get("materialsAndCare") or {}).get("contentProps") or {} + mats = mac.get("materials") or [] + care = mac.get("careInstructions") or [] + + t3 = "Materiały i pielęgnacja" + if mats or care: + out.append(br if with_html else "") + out.append(f"{t3}" if with_html else t3) + + if mats: + out.append("Materiały") + for m in mats: + ptype = m.get("productType", "") + for mat in (m.get("materials") or []): + material = mat.get("material", "") + if ptype: + out.append(ptype) + if material: + out.append(material) + + if care: + detailsCareText = mac.get("detailsCareText", "Pielęgnacja") + out.append(detailsCareText) + for c in care: + ptype = c.get("productType", "") + texts = c.get("texts") or [] + if ptype: + out.append(ptype) + for t in texts: + out.append(t) + + safety = (raw_details.get("safetyAndCompliance") or {}).get("contentProps") or {} + sc = safety.get("safetyAndCompliance") or [] + if sc: + out.append(br if with_html else "") + t4 = "Bezpieczeństwo i zgodność z przepisami" + out.append(f"{t4}" if with_html else t4) + for s in sc: + txt = s.get("text") + if txt: + out.append(txt) + + ''' + ### Был блок сборки "Instrukcja montażu" — по вашему запросу отключён. + if not skip_assembly: + ... + ''' + + if with_html: + s = br.join([x for x in out if x is not None]) + s = re.sub(r"(" + re.escape(br) + r"){2,}", br*2, s) + return s.strip(br) + return "\n".join([x for x in out if x is not None]).strip() + +def build_variant_color_measure(desc: str, type_name: str, measurement: str) -> str: + s = (desc or "") + t = (type_name or "").strip() + if t: + pattern = r"^\s*" + re.escape(t) + r"[\s,;:\-–—/]*" + s = re.sub(pattern, "", s, flags=re.IGNORECASE) + + if not re.search(r"[0-9A-Za-zА-Яа-яЁёÀ-ž]", s or ""): + s = "" + + s = s.strip() + meas = (measurement or "").strip() + + if not s: + return meas if meas else "" + + s = s[:1].upper() + s[1:] + return f"{s}, {meas}" if meas else s + +# ───────────────────── СКРАПИНГ КАРТОЧКИ ────────────────────────── +def extract_data(url: str) -> dict: + ''' + Возвращает плоский dict с полями KEEP_COLUMNS. + Форматтеры/подсчёты: keyFacts_formatted, dimensionProps_formatted, + dimensionProps_formatted_html_translated, productDetailsProps_formatted, + productDetailsProps_formatted_html, total brutto, prductVariantColorMeasure, categoryBreadcrumb. + ''' + try: + resp = requests.get(url, headers=HEADERS, timeout=15) + resp.raise_for_status() + # 🔎 DEBUG: вывести в консоль базовую информацию об ответе + print("\n=== FETCH DEBUG ===") + print("URL: ", url) + print("Final URL: ", resp.url) + print("Status: ", resp.status_code) + print("ContentType:", resp.headers.get("Content-Type")) + print("Length: ", len(resp.text)) + print("Snippet ↓↓↓") + print(resp.text[:1000]) # покажет первые 1000 символов HTML + soup = BeautifulSoup(resp.text, "html.parser") + + target = soup.select_one(CSS_SELECTOR) + if not target: + return {"url": url, "error": "CSS selector not found"} + + raw = target.get("data-hydration-props") + if not raw: + return {"url": url, "error": "data-hydration-props not found"} + + decoded = html.unescape(raw) + full_json = json.loads(decoded) + + result = {"url": url} + for block in BLOCKS: + result.update(flatten_block(block, full_json.get(block, {}))) + + kf_json = _parse_json_value(result.get("keyFacts.keyFacts")) + dim_json = _parse_json_value(result.get("productInformationSection.dimensionProps")) + det_json = _parse_json_value(result.get("productInformationSection.productDetailsProps")) + + result["keyFacts.keyFacts_formatted"] = format_keyfacts(kf_json) + result["productInformationSection.dimensionProps_formatted"] = format_dimensions(dim_json, with_html=False, translated=False) + html_trans = format_dimensions(dim_json, with_html=True, translated=True) + + # ### NEW: дополнительная страховка — если вдруг нет '<' в начале: + if isinstance(html_trans, str) and html_trans.startswith("strong>"): + html_trans = "<" + html_trans + + result["productInformationSection.dimensionProps_formatted_html_translated"] = html_trans + + total_kg = _collect_packaging_total_kg((dim_json or {}).get("packaging") or {}) + result["total brutto"] = _fmt_float(total_kg) + + summary_desc = result.get("productSummary.description", "") or "" + result["productInformationSection.productDetailsProps_formatted"] = format_product_details(det_json, add_summary_desc=summary_desc, with_html=False, skip_assembly=True) + result["productInformationSection.productDetailsProps_formatted_html"] = format_product_details(det_json, add_summary_desc=summary_desc, with_html=True, skip_assembly=True) + + desc = result.get("pipPricePackage.productDescription", "") or "" + tname = result.get("stockcheckSection.typeName", "") or "" + meas = result.get("pipPricePackage.measurementText", "") or "" + result["prductVariantColorMeasure"] = build_variant_color_measure(desc, tname, meas) + + # breadcrumb + breadcrumb = None + for tag in soup.find_all("script", attrs={"type": lambda t: t and "ld+json" in t}): + try: + data = json.loads(tag.string) + except Exception: + continue + if isinstance(data, list): + data = next((d for d in data if isinstance(d, dict) and d.get("@type") == "BreadcrumbList"), None) + if isinstance(data, dict) and data.get("@type") == "BreadcrumbList": + items = data.get("itemListElement", []) + names = [it.get("name", "") for it in items] + breadcrumb = "/".join(names) + break + if breadcrumb: + result["categoryBreadcrumb"] = breadcrumb + + # применяем whitelist + filtered = {k: result.get(k) for k in KEEP_COLUMNS if k != "originalName"} + + ''' + ### NEW: originalName = productName + " " + typeName (без двойных пробелов) + ''' + pn = (result.get("buyModule.productName") or "").strip() + tn = (result.get("stockcheckSection.typeName") or "").strip() + if pn and tn: + orig_name = f"{pn} {tn}" + else: + orig_name = pn or tn + filtered["originalName"] = orig_name + + return filtered + + except Exception as e: + print(e) + return {"url": url, "error": str(e)} + +# ───────────────────── ПОСТРОЕНИЕ ВАРИАНТА / POST ───────────────── +def _split_color_size(text: str): + if not text: + return "", "" + parts = [p.strip() for p in text.split(",", 1)] + if len(parts) == 2: + return parts[0], parts[1] + return "", parts[0] + +def _ceil_price(v): + try: + return int(math.ceil(float(v))) + except Exception: + return None + +def _ceil_int(v): + try: + return int(math.ceil(float(v))) + except Exception: + return None + +def build_variant(row: dict) -> dict: + category_name = row.get("categoryBreadcrumb") or "" + brand_name = "ikea" + + visible = row.get("productSummary.visibleItemNo") or "" + sku = visible.replace(" ", "") + + csm = (row.get("prductVariantColorMeasure") or "").strip() + color, size = _split_color_size(csm) + if not color and not size: + size = (row.get("pipPricePackage.measurementText") or "").strip() + + cost = _ceil_price(row.get("buyModule.productPrice")) + url = row.get("url") or "" + + ''' + ### NEW: originalName берём из одноимённой колонки (а не только из productName) + ''' + name = row.get("originalName") or row.get("buyModule.productName") or "" + + desc_html = row.get("productInformationSection.productDetailsProps_formatted_html") or "" + + ''' + ### NEW: originalComposition = HTML из dimensionProps_formatted_html_translated + ''' + composition_html = row.get("productInformationSection.dimensionProps_formatted_html_translated") or "" + + imgs = [] + raw_imgs = row.get("productGallery.urls") or "" + if isinstance(raw_imgs, str): + imgs = [x for x in raw_imgs.split("\n") if x.strip()] + + in_stock = bool(row.get("availabilityGroup.serverOnlineSellable")) + if not in_stock: + in_stock = bool(row.get("buyModule.onlineSellable")) + + weight_kg = _ceil_int(row.get("total brutto")) + + variant = { + "status_id": 1, + "color": color.capitalize() if color else "none", + "sku": sku, + "size": size, + "cost": cost, + "originalUrl": url, + "originalName": name, # ← ### NEW: в JSON сохраняем originalName + "originalDescription": desc_html, + "originalComposition": composition_html, # ← ### NEW + "images": imgs, + "inStock": in_stock, + "weight": weight_kg if weight_kg is not None else 0, + } + + return { + "category": {"name": category_name}, + #"category": {"name": "TEST/IKEA"}, + "brand": {"name": "ikea"}, + "variant": variant, + } + +def post_payload(payload: dict) -> dict: + headers = {"Content-Type": "application/json"} + if POST_API_KEY: + headers["Authorization"] = f"Bearer {POST_API_KEY}" + + body = json.dumps(payload, ensure_ascii=False) + _post_log(f"→ POST {POST_URL}\nHeaders: {headers}\nBody: {body}") + + try: + r = requests.post(POST_URL, headers=headers, data=body.encode("utf-8"), timeout=POST_TIMEOUT) + text = r.text + _post_log(f"← {r.status_code}\n{text}\n{'-'*60}") + ok = 200 <= r.status_code < 300 + return {"ok": ok, "status": r.status_code, "response": text} + except Exception as e: + _post_log(f"× ERROR: {e}\n{'-'*60}") + return {"ok": False, "status": None, "error": str(e)} + +# ───────────────────────── СЕРДЦЕ СКРИПТА ───────────────────────── +def safe_cell(val): + if isinstance(val, (dict, list)): + return json.dumps(val, ensure_ascii=False) + return "" if val is None else val + +def main(): + SAVE_JSON = ask_bool("SAVE_JSON (сохранять JSON на диск?)", "1") + SEND_JSON = ask_bool("SEND_JSON (отправлять на API?)", "1") + + # читаем ссылки + with open(INPUT_FILE, "r", encoding="utf-8") as f: + links = [line.strip() for line in f if line.strip()] + print(f"Всего ссылок: {len(links)}") + + # готовим Excel + wb = Workbook() + ws = wb.active + ws.title = "IKEA Products" + ws.append(KEEP_COLUMNS) + + # батч для JSON/API + batch_items = [] + batch_index = 1 + + def flush_batch(): + nonlocal batch_items, batch_index + if not batch_items: + return + payload = {"parserName": "ikea", "items": batch_items} + if SAVE_JSON: + _save_json_batch(payload, batch_index) + if SEND_JSON: + res = post_payload(payload) + ok = res.get("ok") + print(f"POST batch {batch_index}: {'OK' if ok else 'FAIL'} (status={res.get('status')})") + batch_index += 1 + batch_items = [] + + for idx, link in enumerate(links, 1): + print(f"[{idx}/{len(links)}] {link}") + row = extract_data(link) + + ''' + ### NEW: originalName уже сформирован в extract_data и попал в row + ''' + + # пишем в Excel ВСЁ (без фильтров) + ws.append([safe_cell(row.get(col, "")) for col in KEEP_COLUMNS]) + + # ФИЛЬТРЫ для JSON/API + try: + price = float(row.get("buyModule.productPrice") or 0) + except Exception: + price = 0.0 + + try: + total_kg = float(row.get("total brutto") or 0) + except Exception: + total_kg = 0.0 + + details_json = row.get("productInformationSection.productDetailsProps") or {} + + # 1) фильтр цены + if not (20 <= price <= 1500): + pass + # 2) фильтр веса + elif total_kg > 30: + pass + # 3) фильтр материалов + elif materials_match_exclusions(details_json, EXCLUSIONS): + pass + else: + # прошёл фильтры → добавляем в батч + try: + item = build_variant(row) + batch_items.append(item) + except Exception as e: + _post_log(f"× build_variant error for {link}: {e}") + + # авто-сейв Excel каждые 50 строк + if idx % 50 == 0: + wb.save(OUTPUT_FILE) + print(f"💾 autosave: {OUTPUT_FILE}") + + # флаш батча при достижении лимита + if len(batch_items) >= BATCH_SIZE: + flush_batch() + + # финал: дописать Excel и отправить/сохранить остаток батча + wb.save(OUTPUT_FILE) + print(f"\n✅ Excel готов: {OUTPUT_FILE}") + + flush_batch() + print("🎯 Готово.") + +if __name__ == "__main__": + main() + + diff --git a/Парсер_IKEA/main_win proxy.py b/Парсер_IKEA/main_win proxy.py index c832d86..66defd7 100644 --- a/Парсер_IKEA/main_win proxy.py +++ b/Парсер_IKEA/main_win proxy.py @@ -50,7 +50,7 @@ PROXIES_WEB = { REQUEST_TIMEOUT = 20 # ───────────────────────── НАСТРОЙКИ POST ───────────────────────── -POST_URL = os.getenv("IKEA_POST_URL", "http://localhost:3005/parser/data") +POST_URL = os.getenv("IKEA_POST_URL", "http://172.25.4.101:3005/parser/data") POST_API_KEY = os.getenv("IKEA_POST_API_KEY", "") POST_TIMEOUT = 20 BATCH_SIZE = 50