From 8f32eaa3e1ab7e4186b381e61eb74655bf7c0bdf Mon Sep 17 00:00:00 2001
From: va1is
Date: Mon, 25 Aug 2025 16:29:50 +0300
Subject: [PATCH] IKEAwin 2

---
 Парсер_IKEA/main_win.py | 170 +++++++++++++++++++++++++++++-----------
 1 file changed, 124 insertions(+), 46 deletions(-)

diff --git a/Парсер_IKEA/main_win.py b/Парсер_IKEA/main_win.py
index fb5536c..40b1c46 100644
--- a/Парсер_IKEA/main_win.py
+++ b/Парсер_IKEA/main_win.py
@@ -1,13 +1,15 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

-import os, json, re, math, time, html, requests, datetime
+import os, json, re, math, time, html, requests, datetime, http.cookiejar as cookiejar
 from collections import Counter
-from typing import List
+from typing import List, Optional
 from bs4 import BeautifulSoup
 from openpyxl import Workbook
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
+import logging
+import socket

 # ───────────────────────── ПУТИ / ФАЙЛЫ ───────────────────────────
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -19,6 +21,30 @@ OUTPUT_FILE = os.path.join(RECORDS_DIR, "records.xlsx")
 DICT_FILE = os.path.join(BASE_DIR, "dictionary_main.txt")
 EXCL_FILE = os.path.join(BASE_DIR, "exclusion_materials.txt")
 POST_LOG = os.path.join(RECORDS_DIR, "post_log.txt")
+FETCH_LOG = os.path.join(RECORDS_DIR, "fetch_debug.log")
+COOKIES_TXT = os.path.join(BASE_DIR, "cookies.txt")
+
+# ───────────────────────── ЛОГИРОВАНИЕ ────────────────────────────
+logger = logging.getLogger("ikea_parser")
+logger.setLevel(logging.DEBUG)
+# файл — максимум подробностей
+fh = logging.FileHandler(FETCH_LOG, encoding="utf-8")
+fh.setLevel(logging.DEBUG)
+fh.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s"))
+# консоль — INFO
+ch = logging.StreamHandler()
+ch.setLevel(logging.INFO)
+ch.setFormatter(logging.Formatter("%(message)s"))
+logger.addHandler(fh)
+logger.addHandler(ch)
+
+logger.info("=== IKEA parser started ===")
+logger.info(f"BASE_DIR={BASE_DIR}")
+logger.info(f"Python={os.sys.version}")
+try:
+    logger.info(f"Hostname={socket.gethostname()} IP={socket.gethostbyname(socket.gethostname())}")
+except Exception as _e:
+    logger.info("Hostname/IP: unavailable")

 # ───────────────────────── НАСТРОЙКИ POST ─────────────────────────
 POST_URL = os.getenv("IKEA_POST_URL", "http://localhost:3005/parser/data")
@@ -28,7 +54,6 @@ BATCH_SIZE = 50

 # ───────────────────────── НАСТРОЙКИ САЙТА ────────────────────────
 HEADERS = {
-    # Ближе к Windows Chrome
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/124.0.0.0 Safari/537.36",
@@ -85,6 +110,8 @@ KEEP_COLUMNS = [
 def make_session() -> requests.Session:
     s = requests.Session()
     s.headers.update(HEADERS)
+    # игнор системных прокси/mitm переменных окружения Windows
+    s.trust_env = False
     retries = Retry(
         total=5,
         backoff_factor=0.5,
@@ -93,13 +120,23 @@ def make_session() -> requests.Session:
     )
     s.mount("https://", HTTPAdapter(max_retries=retries))
     s.mount("http://", HTTPAdapter(max_retries=retries))
-    # При необходимости задайте рыночные куки (пример, если нужен PL):
-    # s.cookies.set("ikeaMarket", "PL")
-    # s.cookies.set("ikeaCurrency", "PLN")
     return s

 SESSION = make_session()
-SESSION.trust_env = False  # игнорируем системные прокси/сертификаты из переменных окружения Windows
+
+def load_netscape_cookies(session: requests.Session, path: str):
+    if os.path.isfile(path):
+        cj = cookiejar.MozillaCookieJar()
+        try:
+            cj.load(path, ignore_discard=True, ignore_expires=True)
+            session.cookies.update(cj)
+            logger.info(f"🍪 Cookies loaded: {path} ({len(cj)} pcs)")
+        except Exception as e:
+            logger.warning(f"⚠️ Failed to load cookies.txt: {e}")
+    else:
+        logger.info("cookies.txt not found — proceeding without external cookies")
+
+load_netscape_cookies(SESSION, COOKIES_TXT)

 # ───────────────────────── УТИЛИТЫ I/O ────────────────────────────
 def ask_bool(prompt: str, default: str = "1") -> bool:
@@ -124,19 +161,42 @@
     fpath = os.path.join(RECORDS_DIR, fname)
     with open(fpath, "w", encoding="utf-8") as fh:
         json.dump(payload, fh, ensure_ascii=False, indent=2)
-    print(f"💾 JSON saved: {fname}")
+    logger.info(f"💾 JSON saved: {fname}")
     return fpath

-def _save_debug_html(url: str, text: str, prefix: str = "debug"):
+def _safe_name_from_url(url: str) -> str:
+    return re.sub(r"[^a-zA-Z0-9]+", "_", url)[:80]
+
+def _dump_meta(prefix: str, url: str, status: int, elapsed: float, text_len: int, final_url: str, headers: dict, note: str = ""):
     try:
-        safe = re.sub(r"[^a-zA-Z0-9]+", "_", url)[:80]
-        fname = f"{prefix}_{_now_tag()}_{safe}.html"
-        fpath = os.path.join(RECORDS_DIR, fname)
+        base = f"{prefix}_{_now_tag()}_{_safe_name_from_url(url)}"
+        meta = os.path.join(RECORDS_DIR, base + ".meta.txt")
+        with open(meta, "w", encoding="utf-8") as fh:
+            fh.write(f"URL: {url}\n")
+            fh.write(f"FINAL_URL: {final_url}\n")
+            fh.write(f"STATUS: {status}\n")
+            fh.write(f"ELAPSED_SEC: {elapsed:.3f}\n")
+            fh.write(f"RESP_LEN: {text_len}\n")
+            fh.write(f"NOTE: {note}\n")
+            fh.write("HEADERS:\n")
+            for k, v in headers.items():
+                hv = v if isinstance(v, str) else str(v)
+                fh.write(f" {k}: {hv}\n")
+    except Exception as e:
+        logger.debug(f"Meta dump failed: {e}")
+
+def _save_debug_html(url: str, text: str, prefix: str = "debug", note: str = "", status: Optional[int] = None, elapsed: Optional[float] = None, headers: Optional[dict] = None, final_url: Optional[str] = None):
+    try:
+        base = f"{prefix}_{_now_tag()}_{_safe_name_from_url(url)}"
+        fpath = os.path.join(RECORDS_DIR, base + ".html")
         with open(fpath, "w", encoding="utf-8") as fh:
             fh.write(text)
-        print(f"🧪 Saved HTML snapshot: {fname}")
-    except Exception:
-        pass
+        logger.info(f"🧪 Saved HTML snapshot: {os.path.basename(fpath)}")
+        # мета рядом
+        if status is not None and headers is not None and final_url is not None and elapsed is not None:
+            _dump_meta(prefix, url, status, elapsed, len(text or ""), final_url, headers, note=note)
+    except Exception as e:
+        logger.debug(f"HTML dump failed: {e}")

 # ───────────────────────── СЛОВАРИ / ФИЛЬТРЫ ──────────────────────
 def load_dictionary(path: str) -> dict:
@@ -427,28 +487,48 @@ def build_variant_color_measure(desc: str, type_name: str, measurement: str) ->
     return f"{s}, {meas}" if meas else s

 # ───────────────────── СКРАПИНГ КАРТОЧКИ ──────────────────────────
-def extract_data(url: str) -> dict:
+def extract_data(url: str, force_dump: bool = False) -> dict:
     try:
-        resp = SESSION.get(url, timeout=20, allow_redirects=True)
+        logger.debug(f"GET {url}")
+        t0 = time.time()
+        resp = SESSION.get(url, timeout=25, allow_redirects=True)
+        elapsed = time.time() - t0
         status = resp.status_code
-        if status != 200 or not resp.text or "data-hydration-props" not in resp.text:
-            _save_debug_html(url, resp.text, prefix=f"resp{status}")
+        final_url = str(getattr(resp, "url", url))
+        text_len = len(resp.text or "")
+        logger.info(f"HTTP {status} {final_url} ({elapsed:.2f}s, {text_len} bytes)")
+
+        # Всегда сохраняем первые (force_dump=True) или любую «сомнительную» страницу
+        need_dump = force_dump or status != 200 or ("data-hydration-props" not in resp.text)
+        if need_dump:
+            note = "force_dump" if force_dump else ("no_hydration" if "data-hydration-props" not in resp.text else f"status_{status}")
+            _save_debug_html(url, resp.text, prefix=f"resp{status}", note=note, status=status, elapsed=elapsed, headers=resp.headers, final_url=final_url)
+
         resp.raise_for_status()

         soup = BeautifulSoup(resp.text, "html.parser")
-
         target = soup.select_one(CSS_SELECTOR)
         if not target:
-            _save_debug_html(url, resp.text, prefix="no_selector")
+            logger.warning("CSS selector NOT FOUND")
+            _save_debug_html(url, resp.text, prefix="no_selector", note="css_selector_missing", status=status, elapsed=elapsed, headers=resp.headers, final_url=final_url)
             return {"url": url, "error": "CSS selector not found", "http_status": status}

         raw = target.get("data-hydration-props")
         if not raw:
-            _save_debug_html(url, resp.text, prefix="no_hydration")
+            logger.warning("data-hydration-props NOT FOUND")
+            _save_debug_html(url, resp.text, prefix="no_hydration", note="attribute_missing", status=status, elapsed=elapsed, headers=resp.headers, final_url=final_url)
             return {"url": url, "error": "data-hydration-props not found", "http_status": status}

-        decoded = html.unescape(raw)
-        full_json = json.loads(decoded)
+        decoded = html.unescape(raw)
+        try:
+            full_json = json.loads(decoded)
+        except Exception as je:
+            logger.error(f"JSON decode error: {je}")
+            # сохраним кусок для анализа
+            sample_name = f"bad_json_{_now_tag()}_{_safe_name_from_url(url)}.txt"
+            with open(os.path.join(RECORDS_DIR, sample_name), "w", encoding="utf-8") as fh:
+                fh.write(decoded[:20000])
+            return {"url": url, "error": f"json decode error: {je}", "http_status": status}

         result = {"url": url}
         for block in BLOCKS:
@@ -497,7 +577,7 @@

         # применяем whitelist
         filtered = {k: result.get(k) for k in KEEP_COLUMNS if k != "originalName"}
-        # originalName = productName + " " + typeName (без двойных пробелов)
+        # originalName = productName + " " + typeName
         pn = (result.get("buyModule.productName") or "").strip()
         tn = (result.get("stockcheckSection.typeName") or "").strip()
         if pn and tn:
@@ -509,6 +589,7 @@

         return filtered
     except Exception as e:
+        logger.error(f"Request error for {url}: {e}")
         return {"url": url, "error": str(e), "http_status": None}

 # ───────────────────── ПОСТРОЕНИЕ ВАРИАНТА / POST ─────────────────
@@ -533,9 +614,6 @@ def _ceil_int(v):
     return None

 def build_variant(row: dict) -> dict:
-    category_name = row.get("categoryBreadcrumb") or ""
-    brand_name = "ikea"
-
     visible = row.get("productSummary.visibleItemNo") or ""
     sku = visible.replace(" ", "")

@@ -556,9 +634,7 @@
     if isinstance(raw_imgs, str):
         imgs = [x for x in raw_imgs.split("\n") if x.strip()]

-    in_stock = bool(row.get("availabilityGroup.serverOnlineSellable"))
-    if not in_stock:
-        in_stock = bool(row.get("buyModule.onlineSellable"))
+    in_stock = bool(row.get("availabilityGroup.serverOnlineSellable")) or bool(row.get("buyModule.onlineSellable"))

     weight_kg = _ceil_int(row.get("total brutto"))

@@ -578,8 +654,7 @@
     }

     return {
-        # Временно по вашему запросу:
-        "category": {"name": "TEST/IKEA"},
+        "category": {"name": "TEST/IKEA"},  # временно по вашему ТЗ
         "brand": {"name": "ikea"},
         "variant": variant,
     }
@@ -617,6 +692,7 @@ def _clean_url(u: str) -> str:
     return u

 def main():
+    logger.info(f"POST_URL={POST_URL} OUTPUT_FILE={OUTPUT_FILE}")
     SAVE_JSON = ask_bool("SAVE_JSON (сохранять JSON на диск?)", "1")
     SEND_JSON = ask_bool("SEND_JSON (отправлять на API?)", "1")

@@ -624,7 +700,9 @@
     with open(INPUT_FILE, "r", encoding="utf-8-sig", newline="") as f:
         raw_lines = f.readlines()
     links = [_clean_url(x) for x in raw_lines if _clean_url(x)]
-    print(f"Всего ссылок: {len(links)}")
+    logger.info(f"Всего ссылок: {len(links)}")
+    if not links:
+        logger.warning("Список ссылок пуст — проверьте product_links.txt")

     # готовим Excel
     wb = Workbook()
@@ -648,15 +726,15 @@
         if SEND_JSON:
             res = post_payload(payload)
             ok = res.get("ok")
-            print(f"POST batch {batch_index}: {'OK' if ok else 'FAIL'} (status={res.get('status')})")
+            logger.info(f"POST batch {batch_index}: {'OK' if ok else 'FAIL'} (status={res.get('status')})")
         batch_index += 1
         batch_items = []

     for idx, link in enumerate(links, 1):
-        print(f"[{idx}/{len(links)}] {link}")
-        row = extract_data(link)
+        logger.info(f"[{idx}/{len(links)}] {link}")
+        force_dump = idx <= 3  # ← Принудительно сохраняем HTML для первых 3 ссылок
+        row = extract_data(link, force_dump=force_dump)

-        # учёт статусов
         st = row.get("http_status")
         if st is None and "error" in row:
             STATUS_COUNTER["err"] += 1
@@ -680,22 +758,23 @@

         details_json = row.get("productInformationSection.productDetailsProps") or {}
         if not (20 <= price <= 1500):
-            pass
+            logger.debug(f"Skip by price: {price}")
         elif total_kg > 30:
-            pass
+            logger.debug(f"Skip by weight: {total_kg} kg")
         elif materials_match_exclusions(details_json, EXCLUSIONS):
-            pass
+            logger.debug("Skip by exclusions (materials)")
         else:
             try:
                 item = build_variant(row)
                 batch_items.append(item)
             except Exception as e:
+                logger.error(f"build_variant error for {link}: {e}")
                 _post_log(f"× build_variant error for {link}: {e}")

         # авто-сейв Excel каждые 50 строк
         if idx % 50 == 0:
             wb.save(OUTPUT_FILE)
-            print(f"💾 autosave: {OUTPUT_FILE}")
+            logger.info(f"💾 autosave: {OUTPUT_FILE}")

         # флаш батча при достижении лимита
         if len(batch_items) >= BATCH_SIZE:
@@ -703,13 +782,12 @@

     # финал
     wb.save(OUTPUT_FILE)
-    print(f"\n✅ Excel готов: {OUTPUT_FILE}")
+    logger.info(f"\n✅ Excel готов: {OUTPUT_FILE}")
     flush_batch()

-    # сводка по HTTP
-    print("HTTP stats:", dict(STATUS_COUNTER))
-    print("🎯 Готово.")
+    logger.info(f"HTTP stats: {dict(STATUS_COUNTER)}")
+    logger.info("🎯 Готово.")


 if __name__ == "__main__":
     main()