IKEAwin 2

va1is 2025-08-25 16:29:50 +03:00
parent 05c36f2ffe
commit 8f32eaa3e1


@@ -1,13 +1,15 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-import os, json, re, math, time, html, requests, datetime
+import os, json, re, math, time, html, requests, datetime, http.cookiejar as cookiejar
 from collections import Counter
-from typing import List
+from typing import List, Optional
 from bs4 import BeautifulSoup
 from openpyxl import Workbook
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
+import logging
+import socket

 # ───────────────────────── PATHS / FILES ──────────────────────────
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -19,6 +21,30 @@ OUTPUT_FILE = os.path.join(RECORDS_DIR, "records.xlsx")
 DICT_FILE = os.path.join(BASE_DIR, "dictionary_main.txt")
 EXCL_FILE = os.path.join(BASE_DIR, "exclusion_materials.txt")
 POST_LOG = os.path.join(RECORDS_DIR, "post_log.txt")
+FETCH_LOG = os.path.join(RECORDS_DIR, "fetch_debug.log")
+COOKIES_TXT = os.path.join(BASE_DIR, "cookies.txt")
+
+# ───────────────────────── LOGGING ────────────────────────────────
+logger = logging.getLogger("ikea_parser")
+logger.setLevel(logging.DEBUG)
+# file handler: maximum detail
+fh = logging.FileHandler(FETCH_LOG, encoding="utf-8")
+fh.setLevel(logging.DEBUG)
+fh.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s"))
+# console handler: INFO only
+ch = logging.StreamHandler()
+ch.setLevel(logging.INFO)
+ch.setFormatter(logging.Formatter("%(message)s"))
+logger.addHandler(fh)
+logger.addHandler(ch)
+
+logger.info("=== IKEA parser started ===")
+logger.info(f"BASE_DIR={BASE_DIR}")
+logger.info(f"Python={os.sys.version}")
+try:
+    logger.info(f"Hostname={socket.gethostname()} IP={socket.gethostbyname(socket.gethostname())}")
+except Exception as _e:
+    logger.info("Hostname/IP: unavailable")

 # ───────────────────────── POST SETTINGS ──────────────────────────
 POST_URL = os.getenv("IKEA_POST_URL", "http://localhost:3005/parser/data")
@@ -28,7 +54,6 @@ BATCH_SIZE = 50

 # ───────────────────────── SITE SETTINGS ──────────────────────────
 HEADERS = {
-    # closer to a Windows Chrome profile
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/124.0.0.0 Safari/537.36",
@@ -85,6 +110,8 @@ KEEP_COLUMNS = [
 def make_session() -> requests.Session:
     s = requests.Session()
     s.headers.update(HEADERS)
+    # ignore Windows system proxy/mitm environment variables
+    s.trust_env = False
     retries = Retry(
         total=5,
         backoff_factor=0.5,
@@ -93,13 +120,23 @@ def make_session() -> requests.Session:
     )
     s.mount("https://", HTTPAdapter(max_retries=retries))
     s.mount("http://", HTTPAdapter(max_retries=retries))
-    # If needed, set market cookies here (example for the PL market):
-    # s.cookies.set("ikeaMarket", "PL")
-    # s.cookies.set("ikeaCurrency", "PLN")
     return s

 SESSION = make_session()
-SESSION.trust_env = False  # ignore system proxies/certificates from Windows environment variables
+
+def load_netscape_cookies(session: requests.Session, path: str):
+    if os.path.isfile(path):
+        cj = cookiejar.MozillaCookieJar()
+        try:
+            cj.load(path, ignore_discard=True, ignore_expires=True)
+            session.cookies.update(cj)
+            logger.info(f"🍪 Cookies loaded: {path} ({len(cj)} pcs)")
+        except Exception as e:
+            logger.warning(f"⚠️ Failed to load cookies.txt: {e}")
+    else:
+        logger.info("cookies.txt not found, proceeding without external cookies")
+
+load_netscape_cookies(SESSION, COOKIES_TXT)

 # ───────────────────────── I/O UTILITIES ──────────────────────────
 def ask_bool(prompt: str, default: str = "1") -> bool:
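
For reference, MozillaCookieJar.load expects the classic Netscape cookies.txt layout: a magic comment header followed by one tab-separated record per cookie (domain, subdomain flag, path, secure flag, expiry as a Unix timestamp, name, value). A minimal sketch of a compatible file, reusing the ikeaMarket/ikeaCurrency names from the comment removed above; the domain, expiry and values are made up for illustration:

# Sketch: write and re-read a Netscape-format cookies.txt (illustrative values only).
import http.cookiejar as cookiejar

SAMPLE = "\n".join([
    "# Netscape HTTP Cookie File",
    # fields: domain, subdomain flag, path, secure flag, expiry, name, value (tab-separated)
    ".example.com\tTRUE\t/\tFALSE\t1767225600\tikeaMarket\tPL",
    ".example.com\tTRUE\t/\tFALSE\t1767225600\tikeaCurrency\tPLN",
    "",
])
with open("cookies_sample.txt", "w", encoding="utf-8") as f:
    f.write(SAMPLE)

cj = cookiejar.MozillaCookieJar()
cj.load("cookies_sample.txt", ignore_discard=True, ignore_expires=True)
print(len(cj), "cookies loaded")  # expected: 2 cookies loaded
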
@@ -124,19 +161,42 @@ def _save_json_batch(payload: dict, batch_index: int):
     fpath = os.path.join(RECORDS_DIR, fname)
     with open(fpath, "w", encoding="utf-8") as fh:
         json.dump(payload, fh, ensure_ascii=False, indent=2)
-    print(f"💾 JSON saved: {fname}")
+    logger.info(f"💾 JSON saved: {fname}")
     return fpath

-def _save_debug_html(url: str, text: str, prefix: str = "debug"):
-    try:
-        safe = re.sub(r"[^a-zA-Z0-9]+", "_", url)[:80]
-        fname = f"{prefix}_{_now_tag()}_{safe}.html"
-        fpath = os.path.join(RECORDS_DIR, fname)
-        with open(fpath, "w", encoding="utf-8") as fh:
-            fh.write(text)
-        print(f"🧪 Saved HTML snapshot: {fname}")
-    except Exception:
-        pass
+def _safe_name_from_url(url: str) -> str:
+    return re.sub(r"[^a-zA-Z0-9]+", "_", url)[:80]
+
+def _dump_meta(prefix: str, url: str, status: int, elapsed: float, text_len: int, final_url: str, headers: dict, note: str = ""):
+    try:
+        base = f"{prefix}_{_now_tag()}_{_safe_name_from_url(url)}"
+        meta = os.path.join(RECORDS_DIR, base + ".meta.txt")
+        with open(meta, "w", encoding="utf-8") as fh:
+            fh.write(f"URL: {url}\n")
+            fh.write(f"FINAL_URL: {final_url}\n")
+            fh.write(f"STATUS: {status}\n")
+            fh.write(f"ELAPSED_SEC: {elapsed:.3f}\n")
+            fh.write(f"RESP_LEN: {text_len}\n")
+            fh.write(f"NOTE: {note}\n")
+            fh.write("HEADERS:\n")
+            for k, v in headers.items():
+                hv = v if isinstance(v, str) else str(v)
+                fh.write(f"  {k}: {hv}\n")
+    except Exception as e:
+        logger.debug(f"Meta dump failed: {e}")
+
+def _save_debug_html(url: str, text: str, prefix: str = "debug", note: str = "", status: Optional[int] = None, elapsed: Optional[float] = None, headers: Optional[dict] = None, final_url: Optional[str] = None):
+    try:
+        base = f"{prefix}_{_now_tag()}_{_safe_name_from_url(url)}"
+        fpath = os.path.join(RECORDS_DIR, base + ".html")
+        with open(fpath, "w", encoding="utf-8") as fh:
+            fh.write(text)
+        logger.info(f"🧪 Saved HTML snapshot: {os.path.basename(fpath)}")
+        # write the .meta.txt sidecar alongside the snapshot
+        if status is not None and headers is not None and final_url is not None and elapsed is not None:
+            _dump_meta(prefix, url, status, elapsed, len(text or ""), final_url, headers, note=note)
+    except Exception as e:
+        logger.debug(f"HTML dump failed: {e}")

 # ───────────────────────── DICTIONARIES / FILTERS ─────────────────
 def load_dictionary(path: str) -> dict:
@@ -427,28 +487,48 @@ def build_variant_color_measure(desc: str, type_name: str, measurement: str) ->
     return f"{s}, {meas}" if meas else s

 # ───────────────────── PRODUCT CARD SCRAPING ──────────────────────
-def extract_data(url: str) -> dict:
+def extract_data(url: str, force_dump: bool = False) -> dict:
     try:
-        resp = SESSION.get(url, timeout=20, allow_redirects=True)
+        logger.debug(f"GET {url}")
+        t0 = time.time()
+        resp = SESSION.get(url, timeout=25, allow_redirects=True)
+        elapsed = time.time() - t0
         status = resp.status_code
-        if status != 200 or not resp.text or "data-hydration-props" not in resp.text:
-            _save_debug_html(url, resp.text, prefix=f"resp{status}")
+        final_url = str(getattr(resp, "url", url))
+        text_len = len(resp.text or "")
+        logger.info(f"HTTP {status} {final_url} ({elapsed:.2f}s, {text_len} bytes)")
+
+        # always dump the first pages (force_dump=True) and any suspicious one
+        need_dump = force_dump or status != 200 or ("data-hydration-props" not in resp.text)
+        if need_dump:
+            note = "force_dump" if force_dump else ("no_hydration" if "data-hydration-props" not in resp.text else f"status_{status}")
+            _save_debug_html(url, resp.text, prefix=f"resp{status}", note=note, status=status, elapsed=elapsed, headers=resp.headers, final_url=final_url)
+
         resp.raise_for_status()

         soup = BeautifulSoup(resp.text, "html.parser")
         target = soup.select_one(CSS_SELECTOR)
         if not target:
-            _save_debug_html(url, resp.text, prefix="no_selector")
+            logger.warning("CSS selector NOT FOUND")
+            _save_debug_html(url, resp.text, prefix="no_selector", note="css_selector_missing", status=status, elapsed=elapsed, headers=resp.headers, final_url=final_url)
             return {"url": url, "error": "CSS selector not found", "http_status": status}

         raw = target.get("data-hydration-props")
         if not raw:
-            _save_debug_html(url, resp.text, prefix="no_hydration")
+            logger.warning("data-hydration-props NOT FOUND")
+            _save_debug_html(url, resp.text, prefix="no_hydration", note="attribute_missing", status=status, elapsed=elapsed, headers=resp.headers, final_url=final_url)
             return {"url": url, "error": "data-hydration-props not found", "http_status": status}

         decoded = html.unescape(raw)
-        full_json = json.loads(decoded)
+        try:
+            full_json = json.loads(decoded)
+        except Exception as je:
+            logger.error(f"JSON decode error: {je}")
+            # keep a sample of the payload for analysis
+            sample_name = f"bad_json_{_now_tag()}_{_safe_name_from_url(url)}.txt"
+            with open(os.path.join(RECORDS_DIR, sample_name), "w", encoding="utf-8") as fh:
+                fh.write(decoded[:20000])
+            return {"url": url, "error": f"json decode error: {je}", "http_status": status}

         result = {"url": url}
         for block in BLOCKS:
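
The scraping step relies on IKEA product pages embedding their data as an HTML-escaped JSON blob in the data-hydration-props attribute of a single element. A self-contained sketch of the same select / unescape / json.loads pipeline on a fabricated fragment; the div class used as the selector here is hypothetical and only stands in for the script's CSS_SELECTOR:

# Sketch: parse a hydration-style attribute the way extract_data does (fabricated HTML).
import html, json
from bs4 import BeautifulSoup

FRAGMENT = (
    '<div class="pip-buy-module" '
    'data-hydration-props="{&quot;buyModule&quot;: {&quot;productName&quot;: &quot;DEMO&quot;}}">'
    '</div>'
)

soup = BeautifulSoup(FRAGMENT, "html.parser")
target = soup.select_one("div.pip-buy-module")  # hypothetical selector for this demo
raw = target.get("data-hydration-props")        # html.parser has already decoded the &quot; entities
decoded = html.unescape(raw)                    # a no-op here, kept to mirror the script's pipeline
data = json.loads(decoded)
print(data["buyModule"]["productName"])         # DEMO
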
@@ -497,7 +577,7 @@ def extract_data(url: str) -> dict:
         # apply the column whitelist
         filtered = {k: result.get(k) for k in KEEP_COLUMNS if k != "originalName"}

-        # originalName = productName + " " + typeName (no double spaces)
+        # originalName = productName + " " + typeName
         pn = (result.get("buyModule.productName") or "").strip()
         tn = (result.get("stockcheckSection.typeName") or "").strip()
         if pn and tn:
@@ -509,6 +589,7 @@ def extract_data(url: str) -> dict:
         return filtered

     except Exception as e:
+        logger.error(f"Request error for {url}: {e}")
         return {"url": url, "error": str(e), "http_status": None}

 # ───────────────────── VARIANT BUILD / POST ───────────────────────
@@ -533,9 +614,6 @@ def _ceil_int(v):
     return None

 def build_variant(row: dict) -> dict:
-    category_name = row.get("categoryBreadcrumb") or ""
-    brand_name = "ikea"
-
     visible = row.get("productSummary.visibleItemNo") or ""
     sku = visible.replace(" ", "")
@@ -556,9 +634,7 @@ def build_variant(row: dict) -> dict:
     if isinstance(raw_imgs, str):
         imgs = [x for x in raw_imgs.split("\n") if x.strip()]

-    in_stock = bool(row.get("availabilityGroup.serverOnlineSellable"))
-    if not in_stock:
-        in_stock = bool(row.get("buyModule.onlineSellable"))
+    in_stock = bool(row.get("availabilityGroup.serverOnlineSellable")) or bool(row.get("buyModule.onlineSellable"))

     weight_kg = _ceil_int(row.get("total brutto"))
@@ -578,8 +654,7 @@ def build_variant(row: dict) -> dict:
     }

     return {
-        # Temporarily, per the client's request:
-        "category": {"name": "TEST/IKEA"},
+        "category": {"name": "TEST/IKEA"},  # temporary, per the spec
         "brand": {"name": "ikea"},
         "variant": variant,
     }
@@ -617,6 +692,7 @@ def _clean_url(u: str) -> str:
     return u

 def main():
+    logger.info(f"POST_URL={POST_URL} OUTPUT_FILE={OUTPUT_FILE}")
     SAVE_JSON = ask_bool("SAVE_JSON (save JSON to disk?)", "1")
     SEND_JSON = ask_bool("SEND_JSON (send to the API?)", "1")
@@ -624,7 +700,9 @@ def main():
     with open(INPUT_FILE, "r", encoding="utf-8-sig", newline="") as f:
         raw_lines = f.readlines()
     links = [_clean_url(x) for x in raw_lines if _clean_url(x)]
-    print(f"Total links: {len(links)}")
+    logger.info(f"Total links: {len(links)}")
+    if not links:
+        logger.warning("Link list is empty, check product_links.txt")

     # prepare the Excel workbook
     wb = Workbook()
@@ -648,15 +726,15 @@ def main():
         if SEND_JSON:
             res = post_payload(payload)
             ok = res.get("ok")
-            print(f"POST batch {batch_index}: {'OK' if ok else 'FAIL'} (status={res.get('status')})")
+            logger.info(f"POST batch {batch_index}: {'OK' if ok else 'FAIL'} (status={res.get('status')})")
         batch_index += 1
         batch_items = []

     for idx, link in enumerate(links, 1):
-        print(f"[{idx}/{len(links)}] {link}")
-        row = extract_data(link)
+        logger.info(f"[{idx}/{len(links)}] {link}")
+        force_dump = idx <= 3  # force an HTML dump for the first 3 links
+        row = extract_data(link, force_dump=force_dump)

-        # status bookkeeping
         st = row.get("http_status")
         if st is None and "error" in row:
             STATUS_COUNTER["err"] += 1
@@ -680,22 +758,23 @@ def main():
         details_json = row.get("productInformationSection.productDetailsProps") or {}

         if not (20 <= price <= 1500):
-            pass
+            logger.debug(f"Skip by price: {price}")
         elif total_kg > 30:
-            pass
+            logger.debug(f"Skip by weight: {total_kg} kg")
         elif materials_match_exclusions(details_json, EXCLUSIONS):
-            pass
+            logger.debug("Skip by exclusions (materials)")
         else:
             try:
                 item = build_variant(row)
                 batch_items.append(item)
             except Exception as e:
+                logger.error(f"build_variant error for {link}: {e}")
                 _post_log(f"× build_variant error for {link}: {e}")

         # auto-save the Excel file every 50 rows
         if idx % 50 == 0:
             wb.save(OUTPUT_FILE)
-            print(f"💾 autosave: {OUTPUT_FILE}")
+            logger.info(f"💾 autosave: {OUTPUT_FILE}")

         # flush the batch once the size limit is reached
         if len(batch_items) >= BATCH_SIZE:
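
The branch above keeps a row only when the price falls in the 20 to 1500 band, the gross weight is at most 30 kg, and the materials do not hit the exclusion list. The same decision as a small standalone predicate, with thresholds taken from the code; the helper name and sample values are made up for illustration:

# Sketch: the row filter as a pure function (thresholds mirror the checks above).
def should_keep(price: float, total_kg: float, excluded_material: bool) -> str:
    if not (20 <= price <= 1500):
        return "skip: price"
    if total_kg > 30:
        return "skip: weight"
    if excluded_material:
        return "skip: materials"
    return "keep"

print(should_keep(price=15, total_kg=5, excluded_material=False))    # skip: price
print(should_keep(price=250, total_kg=42, excluded_material=False))  # skip: weight
print(should_keep(price=250, total_kg=12, excluded_material=False))  # keep
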
@@ -703,13 +782,12 @@ def main():
     # wrap up
     wb.save(OUTPUT_FILE)
-    print(f"\n✅ Excel ready: {OUTPUT_FILE}")
+    logger.info(f"\n✅ Excel ready: {OUTPUT_FILE}")
     flush_batch()

-    # HTTP summary
-    print("HTTP stats:", dict(STATUS_COUNTER))
-    print("🎯 Done.")
+    logger.info(f"HTTP stats: {dict(STATUS_COUNTER)}")
+    logger.info("🎯 Done.")

 if __name__ == "__main__":
     main()