IKEAwin 2

parent 05c36f2ffe
commit 8f32eaa3e1
@@ -1,13 +1,15 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os, json, re, math, time, html, requests, datetime
import os, json, re, math, time, html, requests, datetime, http.cookiejar as cookiejar
from collections import Counter
from typing import List
from typing import List, Optional
from bs4 import BeautifulSoup
from openpyxl import Workbook
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import logging
import socket

# ───────────────────────── PATHS / FILES ──────────────────────────
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -19,6 +21,30 @@ OUTPUT_FILE = os.path.join(RECORDS_DIR, "records.xlsx")
DICT_FILE = os.path.join(BASE_DIR, "dictionary_main.txt")
EXCL_FILE = os.path.join(BASE_DIR, "exclusion_materials.txt")
POST_LOG = os.path.join(RECORDS_DIR, "post_log.txt")
FETCH_LOG = os.path.join(RECORDS_DIR, "fetch_debug.log")
COOKIES_TXT = os.path.join(BASE_DIR, "cookies.txt")

# ───────────────────────── LOGGING ────────────────────────────────
logger = logging.getLogger("ikea_parser")
logger.setLevel(logging.DEBUG)
# file handler: maximum detail
fh = logging.FileHandler(FETCH_LOG, encoding="utf-8")
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s"))
# console handler: INFO only
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(logging.Formatter("%(message)s"))
logger.addHandler(fh)
logger.addHandler(ch)

logger.info("=== IKEA parser started ===")
logger.info(f"BASE_DIR={BASE_DIR}")
logger.info(f"Python={os.sys.version}")
try:
    logger.info(f"Hostname={socket.gethostname()} IP={socket.gethostbyname(socket.gethostname())}")
except Exception as _e:
    logger.info("Hostname/IP: unavailable")

# ───────────────────────── POST SETTINGS ──────────────────────────
POST_URL = os.getenv("IKEA_POST_URL", "http://localhost:3005/parser/data")
@@ -28,7 +54,6 @@ BATCH_SIZE = 50

# ───────────────────────── SITE SETTINGS ──────────────────────────
HEADERS = {
    # Close to a Windows Chrome browser
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/124.0.0.0 Safari/537.36",
@@ -85,6 +110,8 @@ KEEP_COLUMNS = [
def make_session() -> requests.Session:
    s = requests.Session()
    s.headers.update(HEADERS)
    # ignore Windows system proxy / MITM environment variables
    s.trust_env = False
    retries = Retry(
        total=5,
        backoff_factor=0.5,
@@ -93,13 +120,23 @@ def make_session() -> requests.Session:
    )
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.mount("http://", HTTPAdapter(max_retries=retries))
    # If needed, set market cookies here (example for the PL market):
    # s.cookies.set("ikeaMarket", "PL")
    # s.cookies.set("ikeaCurrency", "PLN")
    return s

SESSION = make_session()
SESSION.trust_env = False  # ignore system proxies/certificates from Windows environment variables

def load_netscape_cookies(session: requests.Session, path: str):
    if os.path.isfile(path):
        cj = cookiejar.MozillaCookieJar()
        try:
            cj.load(path, ignore_discard=True, ignore_expires=True)
            session.cookies.update(cj)
            logger.info(f"🍪 Cookies loaded: {path} ({len(cj)} pcs)")
        except Exception as e:
            logger.warning(f"⚠️ Failed to load cookies.txt: {e}")
    else:
        logger.info("cookies.txt not found — proceeding without external cookies")

load_netscape_cookies(SESSION, COOKIES_TXT)

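# Note: cookies.txt is read via MozillaCookieJar, i.e. the Netscape/Mozilla export format:
# a "# Netscape HTTP Cookie File" header line followed by tab-separated records of
# domain, include-subdomains flag, path, secure flag, expiry (Unix time), name, value.
# Illustrative record (values are examples only, not taken from a real export):
#   .ikea.com	TRUE	/	TRUE	1767225600	ikeaMarket	PL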
# ───────────────────────── I/O UTILITIES ──────────────────────────
def ask_bool(prompt: str, default: str = "1") -> bool:
@@ -124,19 +161,42 @@ def _save_json_batch(payload: dict, batch_index: int):
    fpath = os.path.join(RECORDS_DIR, fname)
    with open(fpath, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)
    print(f"💾 JSON saved: {fname}")
    logger.info(f"💾 JSON saved: {fname}")
    return fpath

def _save_debug_html(url: str, text: str, prefix: str = "debug"):
def _safe_name_from_url(url: str) -> str:
    return re.sub(r"[^a-zA-Z0-9]+", "_", url)[:80]

def _dump_meta(prefix: str, url: str, status: int, elapsed: float, text_len: int, final_url: str, headers: dict, note: str = ""):
    try:
        safe = re.sub(r"[^a-zA-Z0-9]+", "_", url)[:80]
        fname = f"{prefix}_{_now_tag()}_{safe}.html"
        fpath = os.path.join(RECORDS_DIR, fname)
        base = f"{prefix}_{_now_tag()}_{_safe_name_from_url(url)}"
        meta = os.path.join(RECORDS_DIR, base + ".meta.txt")
        with open(meta, "w", encoding="utf-8") as fh:
            fh.write(f"URL: {url}\n")
            fh.write(f"FINAL_URL: {final_url}\n")
            fh.write(f"STATUS: {status}\n")
            fh.write(f"ELAPSED_SEC: {elapsed:.3f}\n")
            fh.write(f"RESP_LEN: {text_len}\n")
            fh.write(f"NOTE: {note}\n")
            fh.write("HEADERS:\n")
            for k, v in headers.items():
                hv = v if isinstance(v, str) else str(v)
                fh.write(f"  {k}: {hv}\n")
    except Exception as e:
        logger.debug(f"Meta dump failed: {e}")

def _save_debug_html(url: str, text: str, prefix: str = "debug", note: str = "", status: Optional[int] = None, elapsed: Optional[float] = None, headers: Optional[dict] = None, final_url: Optional[str] = None):
    try:
        base = f"{prefix}_{_now_tag()}_{_safe_name_from_url(url)}"
        fpath = os.path.join(RECORDS_DIR, base + ".html")
        with open(fpath, "w", encoding="utf-8") as fh:
            fh.write(text)
        print(f"🧪 Saved HTML snapshot: {fname}")
    except Exception:
        pass
        logger.info(f"🧪 Saved HTML snapshot: {os.path.basename(fpath)}")
        # write the meta file alongside the HTML snapshot
        if status is not None and headers is not None and final_url is not None and elapsed is not None:
            _dump_meta(prefix, url, status, elapsed, len(text or ""), final_url, headers, note=note)
    except Exception as e:
        logger.debug(f"HTML dump failed: {e}")

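# For reference, the .meta.txt companion written by _dump_meta carries one field per line,
# roughly like this (values shown are placeholders):
#   URL: https://www.ikea.com/...
#   FINAL_URL: https://www.ikea.com/...
#   STATUS: 403
#   ELAPSED_SEC: 1.234
#   RESP_LEN: 5120
#   NOTE: no_hydration
#   HEADERS:
#     Content-Type: text/html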
# ───────────────────────── DICTIONARIES / FILTERS ─────────────────
def load_dictionary(path: str) -> dict:
@@ -427,28 +487,48 @@ def build_variant_color_measure(desc: str, type_name: str, measurement: str) ->
    return f"{s}, {meas}" if meas else s

# ───────────────────── PRODUCT PAGE SCRAPING ──────────────────────
def extract_data(url: str) -> dict:
def extract_data(url: str, force_dump: bool = False) -> dict:
    try:
        resp = SESSION.get(url, timeout=20, allow_redirects=True)
        logger.debug(f"GET {url}")
        t0 = time.time()
        resp = SESSION.get(url, timeout=25, allow_redirects=True)
        elapsed = time.time() - t0
        status = resp.status_code
        if status != 200 or not resp.text or "data-hydration-props" not in resp.text:
            _save_debug_html(url, resp.text, prefix=f"resp{status}")
        final_url = str(getattr(resp, "url", url))
        text_len = len(resp.text or "")
        logger.info(f"HTTP {status} {final_url} ({elapsed:.2f}s, {text_len} bytes)")

        # Always dump the first pages (force_dump=True) and any suspicious-looking page
        need_dump = force_dump or status != 200 or ("data-hydration-props" not in resp.text)
        if need_dump:
            note = "force_dump" if force_dump else ("no_hydration" if "data-hydration-props" not in resp.text else f"status_{status}")
            _save_debug_html(url, resp.text, prefix=f"resp{status}", note=note, status=status, elapsed=elapsed, headers=resp.headers, final_url=final_url)

        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")

        target = soup.select_one(CSS_SELECTOR)
        if not target:
            _save_debug_html(url, resp.text, prefix="no_selector")
            logger.warning("CSS selector NOT FOUND")
            _save_debug_html(url, resp.text, prefix="no_selector", note="css_selector_missing", status=status, elapsed=elapsed, headers=resp.headers, final_url=final_url)
            return {"url": url, "error": "CSS selector not found", "http_status": status}

        raw = target.get("data-hydration-props")
        if not raw:
            _save_debug_html(url, resp.text, prefix="no_hydration")
            logger.warning("data-hydration-props NOT FOUND")
            _save_debug_html(url, resp.text, prefix="no_hydration", note="attribute_missing", status=status, elapsed=elapsed, headers=resp.headers, final_url=final_url)
            return {"url": url, "error": "data-hydration-props not found", "http_status": status}

        decoded = html.unescape(raw)
        full_json = json.loads(decoded)
        decoded = html.unescape(raw)
        try:
            full_json = json.loads(decoded)
        except Exception as je:
            logger.error(f"JSON decode error: {je}")
            # save a snippet for later analysis
            sample_name = f"bad_json_{_now_tag()}_{_safe_name_from_url(url)}.txt"
            with open(os.path.join(RECORDS_DIR, sample_name), "w", encoding="utf-8") as fh:
                fh.write(decoded[:20000])
            return {"url": url, "error": f"json decode error: {je}", "http_status": status}

        result = {"url": url}
        for block in BLOCKS:
@@ -497,7 +577,7 @@ def extract_data(url: str) -> dict:
        # apply the column whitelist
        filtered = {k: result.get(k) for k in KEEP_COLUMNS if k != "originalName"}

        # originalName = productName + " " + typeName (no double spaces)
        # originalName = productName + " " + typeName
        pn = (result.get("buyModule.productName") or "").strip()
        tn = (result.get("stockcheckSection.typeName") or "").strip()
        if pn and tn:
@@ -509,6 +589,7 @@ def extract_data(url: str) -> dict:
        return filtered

    except Exception as e:
        logger.error(f"Request error for {url}: {e}")
        return {"url": url, "error": str(e), "http_status": None}

# ───────────────────── VARIANT BUILDING / POST ────────────────────
@@ -533,9 +614,6 @@ def _ceil_int(v):
        return None

def build_variant(row: dict) -> dict:
    category_name = row.get("categoryBreadcrumb") or ""
    brand_name = "ikea"

    visible = row.get("productSummary.visibleItemNo") or ""
    sku = visible.replace(" ", "")

@@ -556,9 +634,7 @@ def build_variant(row: dict) -> dict:
    if isinstance(raw_imgs, str):
        imgs = [x for x in raw_imgs.split("\n") if x.strip()]

    in_stock = bool(row.get("availabilityGroup.serverOnlineSellable"))
    if not in_stock:
        in_stock = bool(row.get("buyModule.onlineSellable"))
    in_stock = bool(row.get("availabilityGroup.serverOnlineSellable")) or bool(row.get("buyModule.onlineSellable"))

    weight_kg = _ceil_int(row.get("total brutto"))

@@ -578,8 +654,7 @@ def build_variant(row: dict) -> dict:
    }

    return {
        # Temporarily, as requested:
        "category": {"name": "TEST/IKEA"},
        "category": {"name": "TEST/IKEA"},  # temporary, per your spec
        "brand": {"name": "ikea"},
        "variant": variant,
    }
@@ -617,6 +692,7 @@ def _clean_url(u: str) -> str:
    return u

def main():
    logger.info(f"POST_URL={POST_URL} OUTPUT_FILE={OUTPUT_FILE}")
    SAVE_JSON = ask_bool("SAVE_JSON (save JSON to disk?)", "1")
    SEND_JSON = ask_bool("SEND_JSON (send to the API?)", "1")

@@ -624,7 +700,9 @@ def main():
    with open(INPUT_FILE, "r", encoding="utf-8-sig", newline="") as f:
        raw_lines = f.readlines()
    links = [_clean_url(x) for x in raw_lines if _clean_url(x)]
    print(f"Total links: {len(links)}")
    logger.info(f"Total links: {len(links)}")
    if not links:
        logger.warning("Link list is empty, check product_links.txt")

    # prepare the Excel workbook
    wb = Workbook()
@@ -648,15 +726,15 @@ def main():
        if SEND_JSON:
            res = post_payload(payload)
            ok = res.get("ok")
            print(f"POST batch {batch_index}: {'OK' if ok else 'FAIL'} (status={res.get('status')})")
            logger.info(f"POST batch {batch_index}: {'OK' if ok else 'FAIL'} (status={res.get('status')})")
        batch_index += 1
        batch_items = []

    for idx, link in enumerate(links, 1):
        print(f"[{idx}/{len(links)}] {link}")
        row = extract_data(link)
        logger.info(f"[{idx}/{len(links)}] {link}")
        force_dump = idx <= 3  # ← force an HTML dump for the first 3 links
        row = extract_data(link, force_dump=force_dump)

        # track HTTP statuses
        st = row.get("http_status")
        if st is None and "error" in row:
            STATUS_COUNTER["err"] += 1
@@ -680,22 +758,23 @@ def main():
        details_json = row.get("productInformationSection.productDetailsProps") or {}

        if not (20 <= price <= 1500):
            pass
            logger.debug(f"Skip by price: {price}")
        elif total_kg > 30:
            pass
            logger.debug(f"Skip by weight: {total_kg} kg")
        elif materials_match_exclusions(details_json, EXCLUSIONS):
            pass
            logger.debug("Skip by exclusions (materials)")
        else:
            try:
                item = build_variant(row)
                batch_items.append(item)
            except Exception as e:
                logger.error(f"build_variant error for {link}: {e}")
                _post_log(f"× build_variant error for {link}: {e}")

        # autosave Excel every 50 rows
        if idx % 50 == 0:
            wb.save(OUTPUT_FILE)
            print(f"💾 autosave: {OUTPUT_FILE}")
            logger.info(f"💾 autosave: {OUTPUT_FILE}")

        # flush the batch once it reaches the limit
        if len(batch_items) >= BATCH_SIZE:
@@ -703,13 +782,12 @@ def main():

    # final save
    wb.save(OUTPUT_FILE)
    print(f"\n✅ Excel ready: {OUTPUT_FILE}")
    logger.info(f"\n✅ Excel ready: {OUTPUT_FILE}")

    flush_batch()

    # HTTP summary
    print("HTTP stats:", dict(STATUS_COUNTER))
    print("🎯 Done.")
    logger.info(f"HTTP stats: {dict(STATUS_COUNTER)}")
    logger.info("🎯 Done.")

if __name__ == "__main__":
    main()
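Since the run now depends on an optional cookies.txt, a quick standalone check that the file parses before a long scrape can look like this. A minimal sketch only: it assumes cookies.txt sits in the current directory and was exported in Netscape format, and it uses the same loader class the parser relies on (MozillaCookieJar).

    #!/usr/bin/env python3
    # Sanity-check cookies.txt independently of the parser.
    import http.cookiejar as cookiejar

    cj = cookiejar.MozillaCookieJar()
    cj.load("cookies.txt", ignore_discard=True, ignore_expires=True)
    print(f"{len(cj)} cookies loaded")
    for c in list(cj)[:5]:
        print(c.domain, c.name)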