IKEAwin 2

This commit is contained in:
va1is 2025-08-25 16:29:50 +03:00
parent 05c36f2ffe
commit 8f32eaa3e1


@@ -1,13 +1,15 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os, json, re, math, time, html, requests, datetime
import os, sys, json, re, math, time, html, requests, datetime, http.cookiejar as cookiejar
from collections import Counter
from typing import List
from typing import List, Optional
from bs4 import BeautifulSoup
from openpyxl import Workbook
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import logging
import socket
# ───────────────────────── PATHS / FILES ───────────────────────────
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -19,6 +21,30 @@ OUTPUT_FILE = os.path.join(RECORDS_DIR, "records.xlsx")
DICT_FILE = os.path.join(BASE_DIR, "dictionary_main.txt")
EXCL_FILE = os.path.join(BASE_DIR, "exclusion_materials.txt")
POST_LOG = os.path.join(RECORDS_DIR, "post_log.txt")
FETCH_LOG = os.path.join(RECORDS_DIR, "fetch_debug.log")
COOKIES_TXT = os.path.join(BASE_DIR, "cookies.txt")
# ───────────────────────── LOGGING ────────────────────────────────
logger = logging.getLogger("ikea_parser")
logger.setLevel(logging.DEBUG)
# file handler: maximum detail
fh = logging.FileHandler(FETCH_LOG, encoding="utf-8")
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s"))
# console handler: INFO
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(logging.Formatter("%(message)s"))
logger.addHandler(fh)
logger.addHandler(ch)
logger.info("=== IKEA parser started ===")
logger.info(f"BASE_DIR={BASE_DIR}")
logger.info(f"Python={os.sys.version}")
try:
logger.info(f"Hostname={socket.gethostname()} IP={socket.gethostbyname(socket.gethostname())}")
except Exception as _e:
logger.info("Hostname/IP: unavailable")
# ───────────────────────── POST SETTINGS ─────────────────────────
POST_URL = os.getenv("IKEA_POST_URL", "http://localhost:3005/parser/data")
@@ -28,7 +54,6 @@ BATCH_SIZE = 50
# ───────────────────────── SITE SETTINGS ──────────────────────────
HEADERS = {
# Mimic a Windows Chrome browser
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/124.0.0.0 Safari/537.36",
@@ -85,6 +110,8 @@ KEEP_COLUMNS = [
def make_session() -> requests.Session:
s = requests.Session()
s.headers.update(HEADERS)
# ignore Windows system proxy/MITM environment variables
s.trust_env = False
retries = Retry(
total=5,
backoff_factor=0.5,
@@ -93,13 +120,23 @@ def make_session() -> requests.Session:
)
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
# If needed, set market cookies (example for the PL market):
# s.cookies.set("ikeaMarket", "PL")
# s.cookies.set("ikeaCurrency", "PLN")
return s
SESSION = make_session()
SESSION.trust_env = False  # ignore system proxies/certificates from Windows environment variables
def load_netscape_cookies(session: requests.Session, path: str):
if os.path.isfile(path):
cj = cookiejar.MozillaCookieJar()
try:
cj.load(path, ignore_discard=True, ignore_expires=True)
session.cookies.update(cj)
logger.info(f"🍪 Cookies loaded: {path} ({len(cj)} pcs)")
except Exception as e:
logger.warning(f"⚠️ Failed to load cookies.txt: {e}")
else:
logger.info("cookies.txt not found — proceeding without external cookies")
load_netscape_cookies(SESSION, COOKIES_TXT)
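# A minimal sketch of the Netscape format that MozillaCookieJar expects in cookies.txt
# (tab-separated fields: domain, include-subdomains flag, path, secure flag, expiry, name, value).
# The lines below are illustrative values only, not real IKEA cookies:
#   .ikea.com	TRUE	/	TRUE	1767225600	ikeaMarket	PL
#   .ikea.com	TRUE	/	TRUE	1767225600	ikeaCurrency	PLN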
# ───────────────────────── I/O UTILITIES ──────────────────────────
def ask_bool(prompt: str, default: str = "1") -> bool:
@@ -124,19 +161,42 @@ def _save_json_batch(payload: dict, batch_index: int):
fpath = os.path.join(RECORDS_DIR, fname)
with open(fpath, "w", encoding="utf-8") as fh:
json.dump(payload, fh, ensure_ascii=False, indent=2)
print(f"💾 JSON saved: {fname}")
logger.info(f"💾 JSON saved: {fname}")
return fpath
def _save_debug_html(url: str, text: str, prefix: str = "debug"):
def _safe_name_from_url(url: str) -> str:
return re.sub(r"[^a-zA-Z0-9]+", "_", url)[:80]
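# e.g. (illustrative URL) "https://www.ikea.com/pl/pl/p/some-item-12345/" -> "https_www_ikea_com_pl_pl_p_some_item_12345_"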
def _dump_meta(prefix: str, url: str, status: int, elapsed: float, text_len: int, final_url: str, headers: dict, note: str = ""):
try:
safe = re.sub(r"[^a-zA-Z0-9]+", "_", url)[:80]
fname = f"{prefix}_{_now_tag()}_{safe}.html"
fpath = os.path.join(RECORDS_DIR, fname)
base = f"{prefix}_{_now_tag()}_{_safe_name_from_url(url)}"
meta = os.path.join(RECORDS_DIR, base + ".meta.txt")
with open(meta, "w", encoding="utf-8") as fh:
fh.write(f"URL: {url}\n")
fh.write(f"FINAL_URL: {final_url}\n")
fh.write(f"STATUS: {status}\n")
fh.write(f"ELAPSED_SEC: {elapsed:.3f}\n")
fh.write(f"RESP_LEN: {text_len}\n")
fh.write(f"NOTE: {note}\n")
fh.write("HEADERS:\n")
for k, v in headers.items():
hv = v if isinstance(v, str) else str(v)
fh.write(f" {k}: {hv}\n")
except Exception as e:
logger.debug(f"Meta dump failed: {e}")
def _save_debug_html(url: str, text: str, prefix: str = "debug", note: str = "", status: Optional[int] = None, elapsed: Optional[float] = None, headers: Optional[dict] = None, final_url: Optional[str] = None):
try:
base = f"{prefix}_{_now_tag()}_{_safe_name_from_url(url)}"
fpath = os.path.join(RECORDS_DIR, base + ".html")
with open(fpath, "w", encoding="utf-8") as fh:
fh.write(text)
print(f"🧪 Saved HTML snapshot: {fname}")
except Exception:
pass
logger.info(f"🧪 Saved HTML snapshot: {os.path.basename(fpath)}")
# write the meta file alongside the HTML
if status is not None and headers is not None and final_url is not None and elapsed is not None:
_dump_meta(prefix, url, status, elapsed, len(text or ""), final_url, headers, note=note)
except Exception as e:
logger.debug(f"HTML dump failed: {e}")
# ───────────────────────── DICTIONARIES / FILTERS ──────────────────
def load_dictionary(path: str) -> dict:
@@ -427,28 +487,48 @@ def build_variant_color_measure(desc: str, type_name: str, measurement: str) ->
return f"{s}, {meas}" if meas else s
# ───────────────────── PRODUCT PAGE SCRAPING ──────────────────────
def extract_data(url: str) -> dict:
def extract_data(url: str, force_dump: bool = False) -> dict:
try:
resp = SESSION.get(url, timeout=20, allow_redirects=True)
logger.debug(f"GET {url}")
t0 = time.time()
resp = SESSION.get(url, timeout=25, allow_redirects=True)
elapsed = time.time() - t0
status = resp.status_code
if status != 200 or not resp.text or "data-hydration-props" not in resp.text:
_save_debug_html(url, resp.text, prefix=f"resp{status}")
final_url = str(getattr(resp, "url", url))
text_len = len(resp.text or "")
logger.info(f"HTTP {status} {final_url} ({elapsed:.2f}s, {text_len} bytes)")
# Always dump the first pages (force_dump=True) and any "suspicious" page
need_dump = force_dump or status != 200 or ("data-hydration-props" not in resp.text)
if need_dump:
note = "force_dump" if force_dump else ("no_hydration" if "data-hydration-props" not in resp.text else f"status_{status}")
_save_debug_html(url, resp.text, prefix=f"resp{status}", note=note, status=status, elapsed=elapsed, headers=resp.headers, final_url=final_url)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
target = soup.select_one(CSS_SELECTOR)
if not target:
_save_debug_html(url, resp.text, prefix="no_selector")
logger.warning("CSS selector NOT FOUND")
_save_debug_html(url, resp.text, prefix="no_selector", note="css_selector_missing", status=status, elapsed=elapsed, headers=resp.headers, final_url=final_url)
return {"url": url, "error": "CSS selector not found", "http_status": status}
raw = target.get("data-hydration-props")
if not raw:
_save_debug_html(url, resp.text, prefix="no_hydration")
logger.warning("data-hydration-props NOT FOUND")
_save_debug_html(url, resp.text, prefix="no_hydration", note="attribute_missing", status=status, elapsed=elapsed, headers=resp.headers, final_url=final_url)
return {"url": url, "error": "data-hydration-props not found", "http_status": status}
decoded = html.unescape(raw)
try:
full_json = json.loads(decoded)
except Exception as je:
logger.error(f"JSON decode error: {je}")
# save a sample for analysis
sample_name = f"bad_json_{_now_tag()}_{_safe_name_from_url(url)}.txt"
with open(os.path.join(RECORDS_DIR, sample_name), "w", encoding="utf-8") as fh:
fh.write(decoded[:20000])
return {"url": url, "error": f"json decode error: {je}", "http_status": status}
result = {"url": url}
for block in BLOCKS:
@@ -497,7 +577,7 @@ def extract_data(url: str) -> dict:
# apply the whitelist
filtered = {k: result.get(k) for k in KEEP_COLUMNS if k != "originalName"}
# originalName = productName + " " + typeName (no double spaces)
# originalName = productName + " " + typeName
pn = (result.get("buyModule.productName") or "").strip()
tn = (result.get("stockcheckSection.typeName") or "").strip()
if pn and tn:
@@ -509,6 +589,7 @@ def extract_data(url: str) -> dict:
return filtered
except Exception as e:
logger.error(f"Request error for {url}: {e}")
return {"url": url, "error": str(e), "http_status": None}
# ───────────────────── VARIANT BUILDING / POST ────────────────────
@@ -533,9 +614,6 @@ def _ceil_int(v):
return None
def build_variant(row: dict) -> dict:
category_name = row.get("categoryBreadcrumb") or ""
brand_name = "ikea"
visible = row.get("productSummary.visibleItemNo") or ""
sku = visible.replace(" ", "")
@@ -556,9 +634,7 @@ def build_variant(row: dict) -> dict:
if isinstance(raw_imgs, str):
imgs = [x for x in raw_imgs.split("\n") if x.strip()]
in_stock = bool(row.get("availabilityGroup.serverOnlineSellable"))
if not in_stock:
in_stock = bool(row.get("buyModule.onlineSellable"))
in_stock = bool(row.get("availabilityGroup.serverOnlineSellable")) or bool(row.get("buyModule.onlineSellable"))
weight_kg = _ceil_int(row.get("total brutto"))
@@ -578,8 +654,7 @@ def build_variant(row: dict) -> dict:
}
return {
# Temporary, per your request:
"category": {"name": "TEST/IKEA"},
"category": {"name": "TEST/IKEA"}, # временно по вашему ТЗ
"brand": {"name": "ikea"},
"variant": variant,
}
@@ -617,6 +692,7 @@ def _clean_url(u: str) -> str:
return u
def main():
logger.info(f"POST_URL={POST_URL} OUTPUT_FILE={OUTPUT_FILE}")
SAVE_JSON = ask_bool("SAVE_JSON (save JSON to disk?)", "1")
SEND_JSON = ask_bool("SEND_JSON (send to the API?)", "1")
@@ -624,7 +700,9 @@ def main():
with open(INPUT_FILE, "r", encoding="utf-8-sig", newline="") as f:
raw_lines = f.readlines()
links = [_clean_url(x) for x in raw_lines if _clean_url(x)]
print(f"Всего ссылок: {len(links)}")
logger.info(f"Всего ссылок: {len(links)}")
if not links:
logger.warning("Список ссылок пуст — проверьте product_links.txt")
# prepare the Excel workbook
wb = Workbook()
@@ -648,15 +726,15 @@ def main():
if SEND_JSON:
res = post_payload(payload)
ok = res.get("ok")
print(f"POST batch {batch_index}: {'OK' if ok else 'FAIL'} (status={res.get('status')})")
logger.info(f"POST batch {batch_index}: {'OK' if ok else 'FAIL'} (status={res.get('status')})")
batch_index += 1
batch_items = []
for idx, link in enumerate(links, 1):
print(f"[{idx}/{len(links)}] {link}")
row = extract_data(link)
logger.info(f"[{idx}/{len(links)}] {link}")
force_dump = idx <= 3  # force an HTML dump for the first 3 links
row = extract_data(link, force_dump=force_dump)
# track HTTP statuses
st = row.get("http_status")
if st is None and "error" in row:
STATUS_COUNTER["err"] += 1
@@ -680,22 +758,23 @@ def main():
details_json = row.get("productInformationSection.productDetailsProps") or {}
if not (20 <= price <= 1500):
pass
logger.debug(f"Skip by price: {price}")
elif total_kg > 30:
pass
logger.debug(f"Skip by weight: {total_kg} kg")
elif materials_match_exclusions(details_json, EXCLUSIONS):
pass
logger.debug("Skip by exclusions (materials)")
else:
try:
item = build_variant(row)
batch_items.append(item)
except Exception as e:
logger.error(f"build_variant error for {link}: {e}")
_post_log(f"× build_variant error for {link}: {e}")
# auto-save Excel every 50 rows
if idx % 50 == 0:
wb.save(OUTPUT_FILE)
print(f"💾 autosave: {OUTPUT_FILE}")
logger.info(f"💾 autosave: {OUTPUT_FILE}")
# flush the batch once the limit is reached
if len(batch_items) >= BATCH_SIZE:
@@ -703,13 +782,12 @@ def main():
# finalize
wb.save(OUTPUT_FILE)
print(f"\n✅ Excel готов: {OUTPUT_FILE}")
logger.info(f"\n✅ Excel готов: {OUTPUT_FILE}")
flush_batch()
# HTTP summary
print("HTTP stats:", dict(STATUS_COUNTER))
print("🎯 Готово.")
logger.info(f"HTTP stats: {dict(STATUS_COUNTER)}")
logger.info("🎯 Готово.")
if __name__ == "__main__":
main()