IKEA-json+наличие
This commit is contained in:
parent
d32356bd0c
commit
63c4c9b14f
1
.gitignore
vendored
1
.gitignore
vendored
@ -32,3 +32,4 @@ records_folder
|
|||||||
Ignore_Temp
|
Ignore_Temp
|
||||||
/Processing/Files-todo
|
/Processing/Files-todo
|
||||||
/Users/valis/MacOS_Parsers/Parser_NEXT/out
|
/Users/valis/MacOS_Parsers/Parser_NEXT/out
|
||||||
|
json_raw
|
||||||
4
requirements.txt
Normal file
4
requirements.txt
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
requests
|
||||||
|
beautifulsoup4
|
||||||
|
openpyxl
|
||||||
|
pandas
|
||||||
113
Парсер_IKEA/fetch_log.txt
Normal file
113
Парсер_IKEA/fetch_log.txt
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
[2025-10-06 16:52:50] [1/1] https://www.ikea.com/pl/pl/cat/kolekcja-vinterfest-47706/
|
||||||
|
[2025-10-06 16:52:50] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=47706
|
||||||
|
[2025-10-06 16:52:50] → Status: 200
|
||||||
|
[2025-10-06 16:52:50] ✅ JSON сохранён: cat_47706_20251006_165250.json
|
||||||
|
[2025-10-06 16:52:50] 🎯 Готово.
|
||||||
|
[2025-10-06 17:02:28] [1/2] https://www.ikea.com/pl/pl/cat/kolekcja-vinterfest-47706/
|
||||||
|
[2025-10-06 17:02:28] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=47706
|
||||||
|
[2025-10-06 17:02:28] → Status: 200
|
||||||
|
[2025-10-06 17:02:29] ✅ JSON сохранён: cat_47706_20251006_170228.json
|
||||||
|
[2025-10-06 17:02:29] [2/2] https://www.ikea.com/pl/pl/cat/pokrycia-podnozek-i-puf-57654/
|
||||||
|
[2025-10-06 17:02:29] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=57654
|
||||||
|
[2025-10-06 17:02:29] → Status: 200
|
||||||
|
[2025-10-06 17:02:29] ✅ JSON сохранён: cat_57654_20251006_170229.json
|
||||||
|
[2025-10-06 17:02:29] 🎯 Готово.
|
||||||
|
[2025-10-06 17:46:53] [1/2] https://www.ikea.com/pl/pl/cat/kolekcja-vinterfest-47706/
|
||||||
|
[2025-10-06 17:46:53] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=47706
|
||||||
|
[2025-10-06 17:46:53] → Status: 200
|
||||||
|
[2025-10-06 17:46:53] ✅ 204 товаров добавлено из категории 47706
|
||||||
|
[2025-10-06 17:46:53] [2/2] https://www.ikea.com/pl/pl/cat/pokrycia-podnozek-i-puf-57654/
|
||||||
|
[2025-10-06 17:46:53] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=57654
|
||||||
|
[2025-10-06 17:46:53] → Status: 200
|
||||||
|
[2025-10-06 17:46:53] ✅ 46 товаров добавлено из категории 57654
|
||||||
|
[2025-10-06 17:46:53] 💾 JSON сохранён → flattened_products.json (250 записей)
|
||||||
|
[2025-10-06 17:46:53] 📊 Excel сохранён → flattened_products.xlsx
|
||||||
|
[2025-10-06 17:46:53] 🎯 Готово.
|
||||||
|
[2025-10-06 17:46:53] [1/2] https://www.ikea.com/pl/pl/cat/kolekcja-vinterfest-47706/
|
||||||
|
[2025-10-06 17:46:53] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=47706
|
||||||
|
[2025-10-06 17:46:54] → Status: 200
|
||||||
|
[2025-10-06 17:46:54] ✅ 204 товаров добавлено из категории 47706
|
||||||
|
[2025-10-06 17:46:54] [2/2] https://www.ikea.com/pl/pl/cat/pokrycia-podnozek-i-puf-57654/
|
||||||
|
[2025-10-06 17:46:54] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=57654
|
||||||
|
[2025-10-06 17:46:54] → Status: 200
|
||||||
|
[2025-10-06 17:46:54] ✅ 46 товаров добавлено из категории 57654
|
||||||
|
[2025-10-06 17:46:54] 💾 JSON сохранён → flattened_products.json (250 записей)
|
||||||
|
[2025-10-06 17:46:54] 📊 Excel сохранён → flattened_products.xlsx
|
||||||
|
[2025-10-06 17:46:54] 🎯 Готово.
|
||||||
|
[2025-10-06 17:50:05] [1/2] https://www.ikea.com/pl/pl/cat/kolekcja-vinterfest-47706/
|
||||||
|
[2025-10-06 17:50:05] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=47706
|
||||||
|
[2025-10-06 17:50:06] → Status: 200
|
||||||
|
[2025-10-06 17:50:06] ✅ 204 товаров добавлено из категории 47706
|
||||||
|
[2025-10-06 17:50:06] [2/2] https://www.ikea.com/pl/pl/cat/pokrycia-podnozek-i-puf-57654/
|
||||||
|
[2025-10-06 17:50:06] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=57654
|
||||||
|
[2025-10-06 17:50:06] → Status: 200
|
||||||
|
[2025-10-06 17:50:06] ✅ 46 товаров добавлено из категории 57654
|
||||||
|
[2025-10-06 17:50:06] 💾 JSON сохранён → flattened_products.json (250 записей)
|
||||||
|
[2025-10-06 17:50:06] 📊 Excel сохранён → flattened_products.xlsx
|
||||||
|
[2025-10-06 17:50:06] 🎯 Готово.
|
||||||
|
[2025-10-06 17:50:06] [1/2] https://www.ikea.com/pl/pl/cat/kolekcja-vinterfest-47706/
|
||||||
|
[2025-10-06 17:50:06] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=47706
|
||||||
|
[2025-10-06 17:50:06] → Status: 200
|
||||||
|
[2025-10-06 17:50:06] ✅ 204 товаров добавлено из категории 47706
|
||||||
|
[2025-10-06 17:50:06] [2/2] https://www.ikea.com/pl/pl/cat/pokrycia-podnozek-i-puf-57654/
|
||||||
|
[2025-10-06 17:50:06] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=57654
|
||||||
|
[2025-10-06 17:50:06] → Status: 200
|
||||||
|
[2025-10-06 17:50:06] ✅ 46 товаров добавлено из категории 57654
|
||||||
|
[2025-10-06 17:50:06] 💾 JSON сохранён → flattened_products.json (250 записей)
|
||||||
|
[2025-10-06 17:50:06] 📊 Excel сохранён → flattened_products.xlsx
|
||||||
|
[2025-10-06 17:50:06] 🎯 Готово.
|
||||||
|
[2025-10-06 17:53:05] [1/2] https://www.ikea.com/pl/pl/cat/kolekcja-vinterfest-47706/
|
||||||
|
[2025-10-06 17:53:05] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=47706
|
||||||
|
[2025-10-06 17:53:05] → Status: 200
|
||||||
|
[2025-10-06 17:53:05] ✅ 204 товаров добавлено из категории 47706
|
||||||
|
[2025-10-06 17:53:05] [2/2] https://www.ikea.com/pl/pl/cat/pokrycia-podnozek-i-puf-57654/
|
||||||
|
[2025-10-06 17:53:05] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=57654
|
||||||
|
[2025-10-06 17:53:06] → Status: 200
|
||||||
|
[2025-10-06 17:53:06] ✅ 46 товаров добавлено из категории 57654
|
||||||
|
[2025-10-06 17:53:06] 💾 JSON сохранён → flattened_products.json (250 записей)
|
||||||
|
[2025-10-06 17:53:06] 📊 Excel сохранён → flattened_products.xlsx
|
||||||
|
[2025-10-06 17:53:06] 🎯 Готово.
|
||||||
|
[2025-10-06 17:53:06] [1/2] https://www.ikea.com/pl/pl/cat/kolekcja-vinterfest-47706/
|
||||||
|
[2025-10-06 17:53:06] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=47706
|
||||||
|
[2025-10-06 17:53:06] → Status: 200
|
||||||
|
[2025-10-06 17:53:06] ✅ 204 товаров добавлено из категории 47706
|
||||||
|
[2025-10-06 17:53:06] [2/2] https://www.ikea.com/pl/pl/cat/pokrycia-podnozek-i-puf-57654/
|
||||||
|
[2025-10-06 17:53:06] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=57654
|
||||||
|
[2025-10-06 17:53:06] → Status: 200
|
||||||
|
[2025-10-06 17:53:06] ✅ 46 товаров добавлено из категории 57654
|
||||||
|
[2025-10-06 17:53:06] 💾 JSON сохранён → flattened_products.json (250 записей)
|
||||||
|
[2025-10-06 17:53:06] 📊 Excel сохранён → flattened_products.xlsx
|
||||||
|
[2025-10-06 17:53:06] 🎯 Готово.
|
||||||
|
[2025-10-06 17:59:15] [1/2] https://www.ikea.com/pl/pl/cat/kolekcja-vinterfest-47706/
|
||||||
|
[2025-10-06 17:59:15] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=47706
|
||||||
|
[2025-10-06 17:59:16] → Status: 200
|
||||||
|
[2025-10-06 17:59:16] ✅ 204 товаров добавлено из категории 47706
|
||||||
|
[2025-10-06 17:59:16] [2/2] https://www.ikea.com/pl/pl/cat/pokrycia-podnozek-i-puf-57654/
|
||||||
|
[2025-10-06 17:59:16] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=57654
|
||||||
|
[2025-10-06 17:59:16] → Status: 200
|
||||||
|
[2025-10-06 17:59:16] ✅ 46 товаров добавлено из категории 57654
|
||||||
|
[2025-10-06 17:59:16] 💾 JSON сохранён → flattened_products.json (250 записей)
|
||||||
|
[2025-10-06 17:59:16] 📊 Excel сохранён → flattened_products.xlsx
|
||||||
|
[2025-10-06 17:59:16] 🎯 Готово.
|
||||||
|
[2025-10-06 17:59:16] [1/2] https://www.ikea.com/pl/pl/cat/kolekcja-vinterfest-47706/
|
||||||
|
[2025-10-06 17:59:16] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=47706
|
||||||
|
[2025-10-06 17:59:16] → Status: 200
|
||||||
|
[2025-10-06 17:59:16] ✅ 204 товаров добавлено из категории 47706
|
||||||
|
[2025-10-06 17:59:16] [2/2] https://www.ikea.com/pl/pl/cat/pokrycia-podnozek-i-puf-57654/
|
||||||
|
[2025-10-06 17:59:16] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=57654
|
||||||
|
[2025-10-06 17:59:17] → Status: 200
|
||||||
|
[2025-10-06 17:59:17] ✅ 46 товаров добавлено из категории 57654
|
||||||
|
[2025-10-06 17:59:17] 💾 JSON сохранён → flattened_products.json (250 записей)
|
||||||
|
[2025-10-06 17:59:17] 📊 Excel сохранён → flattened_products.xlsx
|
||||||
|
[2025-10-06 17:59:17] 🎯 Готово.
|
||||||
|
[2025-10-06 18:00:56] [1/2] https://www.ikea.com/pl/pl/cat/kolekcja-vinterfest-47706/
|
||||||
|
[2025-10-06 18:00:56] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=47706
|
||||||
|
[2025-10-06 18:00:57] → Status: 200
|
||||||
|
[2025-10-06 18:00:57] ✅ 204 товаров добавлено из категории 47706
|
||||||
|
[2025-10-06 18:00:57] [2/2] https://www.ikea.com/pl/pl/cat/pokrycia-podnozek-i-puf-57654/
|
||||||
|
[2025-10-06 18:00:57] POST https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507 category_id=57654
|
||||||
|
[2025-10-06 18:00:57] → Status: 200
|
||||||
|
[2025-10-06 18:00:57] ✅ 46 товаров добавлено из категории 57654
|
||||||
|
[2025-10-06 18:00:57] 💾 JSON сохранён → flattened_products.json (250 записей)
|
||||||
|
[2025-10-06 18:00:57] 📊 Excel сохранён → flattened_products.xlsx
|
||||||
|
[2025-10-06 18:00:57] 🎯 Готово.
|
||||||
185
Парсер_IKEA/ikea_collect_product_linksAND-mininfo.py
Normal file
185
Парсер_IKEA/ikea_collect_product_linksAND-mininfo.py
Normal file
@ -0,0 +1,185 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
import datetime
|
||||||
|
import pathlib
|
||||||
|
import re
|
||||||
|
from openpyxl import Workbook
|
||||||
|
|
||||||
|
# ──────────────── ПУТИ ────────────────
|
||||||
|
BASE_DIR = pathlib.Path(__file__).resolve().parent
|
||||||
|
CAT_FILE = BASE_DIR / "leaf_categories.txt" # список категорий IKEA
|
||||||
|
OUT_DIR = BASE_DIR / "json_raw"
|
||||||
|
OUT_DIR.mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
LOG_FILE = BASE_DIR / "fetch_log.txt"
|
||||||
|
|
||||||
|
OUT_JSON = OUT_DIR / "flattened_products.json"
|
||||||
|
OUT_XLSX = OUT_DIR / "flattened_products.xlsx"
|
||||||
|
|
||||||
|
# ──────────────── API ────────────────
|
||||||
|
SEARCH_URL = "https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507"
|
||||||
|
HEADERS = {
|
||||||
|
"User-Agent": "Mozilla/5.0",
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
}
|
||||||
|
|
||||||
|
# ──────────────── ВСПОМОГАТЕЛЬНОЕ ────────────────
|
||||||
|
def log(msg: str):
    """Print a timestamped message and append the same line to LOG_FILE.

    Args:
        msg: Text to record; it is prefixed with ``[YYYY-MM-DD HH:MM:SS] ``.
    """
    stamp = datetime.datetime.now().strftime("[%Y-%m-%d %H:%M:%S] ")
    entry = stamp + msg
    print(entry)
    # Append mode: the log accumulates across runs.
    with LOG_FILE.open("a", encoding="utf-8") as log_handle:
        log_handle.write(entry + "\n")
|
||||||
|
|
||||||
|
def fetch_category_json(category_id: str) -> dict:
    """POST a category listing query to SEARCH_URL and return the decoded JSON.

    Args:
        category_id: Numeric category id (as a string) placed in
            ``searchParameters.input``.

    Returns:
        The response body as decoded by ``Response.json()``.

    Raises:
        requests.HTTPError: via ``raise_for_status()`` on an error status.
    """
    # Experiment flags are sent explicitly as null.
    experiment_flags = {
        "listing_3547_filter_hnf_sticky": None,
        "listing_3332_collapsed_filter_bar": None,
        "discount_percentage": None,
        "listing_3790_simplify_rating_stars": None,
    }
    visitor_attributes = {
        "market": "pl",
        "device": "desktop",
        "deviceVendor": "Apple",
        "deviceType": "desktop",
        "isLoggedIn": False,
        "environment": "prod",
        "browser": "Chrome",
        "os": "Mac OS",
        "language": "pl",
        "feedMarket": "pl-PL",
        "locale": "pl-PL",
        "customerType": "guest",
        "isEntranceVisit": False,
        "pip_to_pip_src": "",
    }
    # A single window of up to 1000 entries starting at offset 0 is requested;
    # no further pages are fetched by this function.
    primary_area = {
        "component": "PRIMARY_AREA",
        "columns": 4,
        "types": {
            "main": "PRODUCT",
            "breakouts": ["PLANNER", "LOGIN_REMINDER", "MATTRESS_WARRANTY"],
        },
        "filterConfig": {"max-num-filters": 6},
        "window": {"size": 1000, "offset": 0},
        "forceFilterCalculation": True,
    }
    # NOTE(review): "zip" and "store" presumably select the location used for
    # availability data - confirm against the API.
    body = {
        "searchParameters": {"input": category_id, "type": "CATEGORY"},
        "zip": "05-090",
        "store": "188",
        "isUserLoggedIn": False,
        "optimizely": experiment_flags,
        "optimizelyAttributes": visitor_attributes,
        "components": [primary_area],
    }

    log(f"POST {SEARCH_URL} category_id={category_id}")
    response = requests.post(SEARCH_URL, headers=HEADERS, json=body, timeout=30)
    log(f"→ Status: {response.status_code}")
    response.raise_for_status()
    return response.json()
|
||||||
|
|
||||||
|
def _product_row(prod: dict, category_path: str) -> dict:
    """Build one flat row from a product (or variant) dict.

    Keys that are missing or explicitly null in the API payload are treated
    as empty, so a sparse entry yields empty strings instead of raising.
    """
    availability = prod.get("availability") or []
    first = (availability[0] or {}) if len(availability) > 0 else None
    second = (availability[1] or {}) if len(availability) > 1 else None

    # `.get(key, default)` does not replace an explicit null, hence `or {}`.
    current_price = (prod.get("salesPrice") or {}).get("current") or {}

    return {
        "id": prod.get("id") or prod.get("itemNoGlobal"),
        "pipUrl": prod.get("pipUrl", ""),
        "availability_0_status": first.get("status") if first is not None else "",
        "availability_1_status": second.get("status") if second is not None else "",
        "availability_1_store": second.get("store") if second is not None else "",
        "price": current_price.get("wholeNumber", ""),
        "category_path": category_path,
    }


def extract_products(data: dict) -> list[dict]:
    """Extract products and their variants from a search response.

    Walks ``data["results"][*]["items"][*]["product"]``. For every product one
    row is emitted, followed by one row per entry in
    ``product["gprDescription"]["variants"]``. Variant rows reuse the parent
    product's category path.

    Args:
        data: Decoded JSON response.

    Returns:
        A list of flat dicts with the keys ``id``, ``pipUrl``,
        ``availability_0_status``, ``availability_1_status``,
        ``availability_1_store``, ``price`` and ``category_path``.
    """
    rows = []

    for result in data.get("results") or []:
        for item in result.get("items") or []:
            product = item.get("product")
            if not product:
                continue

            # Category breadcrumb, joined the same way for product and variants.
            category_path = " / ".join(
                (crumb.get("name") or "")
                for crumb in (product.get("categoryPath") or [])
            )

            # Main product first, then its variants.
            rows.append(_product_row(product, category_path))
            variants = (product.get("gprDescription") or {}).get("variants") or []
            rows.extend(_product_row(variant, category_path) for variant in variants)

    return rows
|
||||||
|
|
||||||
|
# ──────────────── MAIN ────────────────
|
||||||
|
def main():
    """Fetch every category listed in CAT_FILE and save the flattened products.

    Each non-empty line of CAT_FILE is treated as a category URL whose trailing
    ``-<digits>`` is the category id. Products from all categories are
    collected into one list, written to OUT_JSON and then to an Excel sheet at
    OUT_XLSX. A failure on one category is logged and does not stop the run.
    """
    if not CAT_FILE.exists():
        print("✖ Файл leaf_categories.txt не найден.")
        return

    raw_lines = CAT_FILE.read_text(encoding="utf-8").splitlines()
    categories = [entry for entry in (line.strip() for line in raw_lines) if entry]
    if not categories:
        print("✖ Нет категорий для обработки.")
        return

    total = len(categories)
    all_products = []

    for position, url in enumerate(categories, 1):
        log(f"[{position}/{total}] {url}")
        id_match = re.search(r"-([0-9]+)/?$", url.rstrip("/"))
        if id_match is None:
            log("⚠️  Не найден ID категории в URL")
            continue

        cat_id = id_match.group(1)
        try:
            items = extract_products(fetch_category_json(cat_id))
            all_products.extend(items)
            log(f"✅ {len(items)} товаров добавлено из категории {cat_id}")
        except Exception as e:
            # One bad category must not abort the whole run.
            log(f"❌ Ошибка при категории {cat_id}: {e}")

    if not all_products:
        log("⚠️ Нет товаров для сохранения.")
        return

    # Save JSON
    with OUT_JSON.open("w", encoding="utf-8") as json_handle:
        json.dump(all_products, json_handle, ensure_ascii=False, indent=2)
    log(f"💾 JSON сохранён → {OUT_JSON.name} ({len(all_products)} записей)")

    # Save Excel: the header row is taken from the first record's keys.
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "IKEA_flat"
    headers = list(all_products[0])
    sheet.append(headers)
    for record in all_products:
        sheet.append([record.get(column, "") for column in headers])
    workbook.save(OUT_XLSX)
    log(f"📊 Excel сохранён → {OUT_XLSX.name}")

    log("🎯 Готово.")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
111
Парсер_IKEA/ikea_collect_product_linksANDinfo-fullJSON.py
Normal file
111
Парсер_IKEA/ikea_collect_product_linksANDinfo-fullJSON.py
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
import datetime
|
||||||
|
import pathlib
|
||||||
|
import re
|
||||||
|
|
||||||
|
# ──────────────── ПУТИ ────────────────
|
||||||
|
BASE_DIR = pathlib.Path(__file__).resolve().parent
|
||||||
|
CAT_FILE = BASE_DIR / "leaf_categories.txt" # список категорий IKEA
|
||||||
|
OUT_DIR = BASE_DIR / "json_raw"
|
||||||
|
OUT_DIR.mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
LOG_FILE = BASE_DIR / "fetch_log.txt"
|
||||||
|
|
||||||
|
# ──────────────── API ────────────────
|
||||||
|
SEARCH_URL = "https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507"
|
||||||
|
HEADERS = {
|
||||||
|
"User-Agent": "Mozilla/5.0",
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
}
|
||||||
|
|
||||||
|
# ──────────────── ВСПОМОГАТЕЛЬНОЕ ────────────────
|
||||||
|
def log(msg: str):
    """Echo a timestamped message to stdout and append it to LOG_FILE.

    Args:
        msg: Message text; a ``[YYYY-MM-DD HH:MM:SS] `` prefix is added.
    """
    prefix = datetime.datetime.now().strftime("[%Y-%m-%d %H:%M:%S] ")
    line = prefix + msg
    print(line)
    # Opened in append mode so earlier entries are kept.
    with LOG_FILE.open("a", encoding="utf-8") as sink:
        sink.write(line + "\n")
|
||||||
|
|
||||||
|
def fetch_category_json(category_id: str) -> dict:
    """Query SEARCH_URL for one category and return the raw decoded JSON.

    Args:
        category_id: Category id string sent as ``searchParameters.input``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: raised by ``raise_for_status()`` on an error status.
    """
    # All experiment flags are sent as null, in this order.
    optimizely_flags = dict.fromkeys(
        (
            "listing_3547_filter_hnf_sticky",
            "listing_3332_collapsed_filter_bar",
            "discount_percentage",
            "listing_3790_simplify_rating_stars",
        ),
        None,
    )
    optimizely_attributes = {
        "market": "pl",
        "device": "desktop",
        "deviceVendor": "Apple",
        "deviceType": "desktop",
        "isLoggedIn": False,
        "environment": "prod",
        "browser": "Chrome",
        "os": "Mac OS",
        "language": "pl",
        "feedMarket": "pl-PL",
        "locale": "pl-PL",
        "customerType": "guest",
        "isEntranceVisit": False,
        "pip_to_pip_src": "",
    }
    # One window of up to 1000 entries from offset 0; nothing beyond that
    # window is requested here.
    listing_component = {
        "component": "PRIMARY_AREA",
        "columns": 4,
        "types": {
            "main": "PRODUCT",
            "breakouts": ["PLANNER", "LOGIN_REMINDER", "MATTRESS_WARRANTY"],
        },
        "filterConfig": {"max-num-filters": 6},
        "window": {"size": 1000, "offset": 0},
        "forceFilterCalculation": True,
    }
    request_body = {
        "searchParameters": {"input": category_id, "type": "CATEGORY"},
        "zip": "05-090",
        "store": "188",
        "isUserLoggedIn": False,
        "optimizely": optimizely_flags,
        "optimizelyAttributes": optimizely_attributes,
        "components": [listing_component],
    }

    log(f"POST {SEARCH_URL} category_id={category_id}")
    reply = requests.post(SEARCH_URL, headers=HEADERS, json=request_body, timeout=30)
    log(f"→ Status: {reply.status_code}")
    reply.raise_for_status()
    return reply.json()
|
||||||
|
|
||||||
|
# ──────────────── MAIN ────────────────
|
||||||
|
def main():
    """Download the raw search JSON for every category listed in CAT_FILE.

    Each non-empty line of CAT_FILE is a category URL ending in ``-<digits>``.
    The response for each category is written to OUT_DIR as
    ``cat_<id>_<YYYYmmdd_HHMMSS>.json``. Errors for one category are logged and
    the loop continues with the next one.
    """
    if not CAT_FILE.exists():
        print("✖ Файл leaf_categories.txt не найден.")
        return

    categories = []
    for raw_line in CAT_FILE.read_text(encoding="utf-8").splitlines():
        cleaned = raw_line.strip()
        if cleaned:
            categories.append(cleaned)
    if not categories:
        print("✖ Нет категорий для обработки.")
        return

    total = len(categories)
    for position, url in enumerate(categories, 1):
        log(f"[{position}/{total}] {url}")
        id_match = re.search(r"-([0-9]+)/?$", url.rstrip("/"))
        if id_match is None:
            log("⚠️  Не найден ID категории в URL")
            continue
        cat_id = id_match.group(1)
        try:
            data = fetch_category_json(cat_id)
            stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
            target = OUT_DIR / f"cat_{cat_id}_{stamp}.json"
            with target.open("w", encoding="utf-8") as out_handle:
                json.dump(data, out_handle, ensure_ascii=False, indent=2)
            log(f"✅ JSON сохранён: {target.name}")
        except Exception as e:
            # Keep going: a single failing category should not end the run.
            log(f"❌ Ошибка при категории {cat_id}: {e}")

    log("🎯 Готово.")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Binary file not shown.
1277
Парсер_IKEA/leaf_categories copy 2.txt
Normal file
1277
Парсер_IKEA/leaf_categories copy 2.txt
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because one or more lines are too long
@ -449,7 +449,7 @@ def extract_data(url: str) -> dict:
|
|||||||
print("ContentType:", resp.headers.get("Content-Type"))
|
print("ContentType:", resp.headers.get("Content-Type"))
|
||||||
print("Length: ", len(resp.text))
|
print("Length: ", len(resp.text))
|
||||||
print("Snippet ↓↓↓")
|
print("Snippet ↓↓↓")
|
||||||
print(resp.text[:1000]) # покажет первые 1000 символов HTML
|
print(resp.text[:40000]) # покажет первые 1000 символов HTML
|
||||||
soup = BeautifulSoup(resp.text, "html.parser")
|
soup = BeautifulSoup(resp.text, "html.parser")
|
||||||
|
|
||||||
target = soup.select_one(CSS_SELECTOR)
|
target = soup.select_one(CSS_SELECTOR)
|
||||||
|
|||||||
911
Парсер_IKEA/main_win proxy_all-1.0.py
Normal file
911
Парсер_IKEA/main_win proxy_all-1.0.py
Normal file
@ -0,0 +1,911 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# ikea_pipeline.py — Фаза 1 (API → flattened) + Фаза 2 (PIP → records)
|
||||||
|
# v1.0
|
||||||
|
|
||||||
|
import os, json, re, math, time, html, requests, datetime, pathlib
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from openpyxl import Workbook, load_workbook
|
||||||
|
|
||||||
|
# ───────────────────────── ПУТИ / ПАПКИ ───────────────────────────
|
||||||
|
BASE_DIR = pathlib.Path(__file__).resolve().parent
|
||||||
|
RECORDS_DIR = BASE_DIR / "records_folder"
|
||||||
|
JSON_DIR = BASE_DIR / "json_raw"
|
||||||
|
RECORDS_DIR.mkdir(exist_ok=True)
|
||||||
|
JSON_DIR.mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
# Файлы ввода/вывода
|
||||||
|
CAT_FILE = BASE_DIR / "leaf_categories.txt" # вход: список URL категорий
|
||||||
|
OUT_JSON = JSON_DIR / "flattened_products.json" # выход фазы 1 (json)
|
||||||
|
OUT_XLSX = JSON_DIR / "flattened_products.xlsx" # выход фазы 1 (xlsx)
|
||||||
|
OUTPUT_FILE = RECORDS_DIR / "records.xlsx" # выход фазы 2 (xlsx)
|
||||||
|
POST_LOG = RECORDS_DIR / "post_log.txt" # лог POST пакетов
|
||||||
|
|
||||||
|
DICT_FILE = BASE_DIR / "dictionary_main.txt"
|
||||||
|
EXCL_FILE = BASE_DIR / "exclusion_materials.txt"
|
||||||
|
|
||||||
|
# ───────────────────────── ПРОКСИ (общий) ────────────────────────
|
||||||
|
# Используется и для Фазы 1 (API POST), и для Фазы 2 (GET карточек).
|
||||||
|
# NOTE(review): the proxy username/password below are committed in plain text.
# They are kept only as backward-compatible defaults; set the IKEA_PROXY_*
# environment variables instead, then rotate the credentials and remove the
# literals from the repository.
PROXY_SCHEME = os.getenv("IKEA_PROXY_SCHEME", "http")
PROXY_USER = os.getenv("IKEA_PROXY_USER", "vdE9MRLB")
PROXY_PASS = os.getenv("IKEA_PROXY_PASS", "YW9ZvHLU")
PROXY_HOST = os.getenv("IKEA_PROXY_HOST", "146.19.76.243")
PROXY_PORT = int(os.getenv("IKEA_PROXY_PORT", "63276"))

# The "user:pass@" part is added only when both values are non-empty.
_AUTH = f"{PROXY_USER}:{PROXY_PASS}@" if PROXY_USER and PROXY_PASS else ""
PROXY_URL = f"{PROXY_SCHEME}://{_AUTH}{PROXY_HOST}:{PROXY_PORT}"
# The same proxy URL is used for both http and https traffic.
PROXIES_WEB = {"http": PROXY_URL, "https": PROXY_URL}
|
||||||
|
|
||||||
|
# ───────────────────────── НАСТРОЙКИ POST (Фаза 2) ───────────────
|
||||||
|
POST_URL = os.getenv("IKEA_POST_URL", "http://172.25.4.101:3005/parser/data")
|
||||||
|
POST_API_KEY = os.getenv("IKEA_POST_API_KEY", "")
|
||||||
|
POST_TIMEOUT = 20
|
||||||
|
BATCH_SIZE = 50
|
||||||
|
|
||||||
|
# ───────────────────────── НАСТРОЙКИ IKEA API (Фаза 1) ───────────
|
||||||
|
SEARCH_URL = "https://sik.search.blue.cdtapps.com/pl/pl/search?c=listaf&v=20250507"
|
||||||
|
API_HEADERS = {
|
||||||
|
"User-Agent": "Mozilla/5.0",
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
}
|
||||||
|
REQUEST_TIMEOUT = 30
|
||||||
|
|
||||||
|
# ───────────────────────── НАСТРОЙКИ PIP (Фаза 2) ────────────────
|
||||||
|
HEADERS = {
|
||||||
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/126.0.0.0 Safari/537.36",
|
||||||
|
"Accept-Language": "pl-PL,pl;q=0.9,en;q=0.8,ru;q=0.7",
|
||||||
|
}
|
||||||
|
CSS_SELECTOR = ".pip-product__subgrid.product-pip.js-product-pip"
|
||||||
|
REQUEST_TIMEOUT_GET = 20
|
||||||
|
|
||||||
|
BLOCKS = [
|
||||||
|
"buyModule",
|
||||||
|
"productSummary",
|
||||||
|
"pipPricePackage",
|
||||||
|
"productInformationSection",
|
||||||
|
"keyFacts",
|
||||||
|
"stockcheckSection",
|
||||||
|
"availabilityGroup",
|
||||||
|
"productGallery",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Столбцы для Excel (Фаза 2, карточка) + мы добавим flat.* (Фаза 1)
|
||||||
|
KEEP_COLUMNS = [
|
||||||
|
"availabilityGroup.serverOnlineSellable",
|
||||||
|
"availabilityGroup.storeHeader",
|
||||||
|
"buyModule.onlineSellable",
|
||||||
|
"buyModule.productName",
|
||||||
|
"buyModule.productPrice",
|
||||||
|
"buyModule.productType",
|
||||||
|
"keyFacts.ariaLabels",
|
||||||
|
"keyFacts.gaLabel",
|
||||||
|
"keyFacts.keyFacts",
|
||||||
|
"keyFacts.keyFacts_formatted",
|
||||||
|
"pipPricePackage.measurementText",
|
||||||
|
"pipPricePackage.productDescription",
|
||||||
|
"productGallery.urls",
|
||||||
|
"productInformationSection.dimensionProps",
|
||||||
|
"productInformationSection.dimensionProps_formatted",
|
||||||
|
"productInformationSection.dimensionProps_formatted_html_translated",
|
||||||
|
"productInformationSection.productDetailsProps",
|
||||||
|
"productInformationSection.productDetailsProps_formatted",
|
||||||
|
"productInformationSection.productDetailsProps_formatted_html",
|
||||||
|
"productInformationSection.dimensionsOnly_formatted_html_translated",
|
||||||
|
"productSummary.description",
|
||||||
|
"productSummary.visibleItemNo",
|
||||||
|
"stockcheckSection.packagingProps",
|
||||||
|
"stockcheckSection.typeName",
|
||||||
|
"total brutto",
|
||||||
|
"prductVariantColorMeasure",
|
||||||
|
"categoryBreadcrumb",
|
||||||
|
"originalName",
|
||||||
|
"url",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Доп. столбцы из фазы 1, которые вливаем в итоговый records.xlsx
|
||||||
|
FLAT_EXTRA_COLS = [
|
||||||
|
"flat.id",
|
||||||
|
"flat.price",
|
||||||
|
"flat.availability_0_status",
|
||||||
|
"flat.availability_1_status",
|
||||||
|
"flat.availability_1_store",
|
||||||
|
"flat.category_path",
|
||||||
|
]
|
||||||
|
|
||||||
|
# ───────────────────────── УТИЛИТЫ ───────────────────────────────
|
||||||
|
def _now_tag():
|
||||||
|
return datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
|
||||||
|
def ask_bool(prompt: str, default: str = "1") -> bool:
    """Ask a yes/no question on stdin and return the answer as a bool.

    Args:
        prompt: Question text shown before the ``(1=yes, 0=no)`` hint.
        default: Answer used when the user enters nothing or stdin hits EOF.

    Returns:
        True only when the effective answer is exactly ``"1"``.
    """
    question = f"{prompt} (1=yes, 0=no) [{default}]: "
    try:
        answer = input(question).strip()
    except EOFError:
        # No stdin available: fall back to the default.
        answer = ""
    if not answer:
        answer = default
    return answer == "1"
|
||||||
|
|
||||||
|
def _post_log(msg: str):
    """Append one line to POST_LOG on a best-effort basis.

    Trailing whitespace is stripped from ``msg`` and a single newline is
    added. Any failure is reported on stderr instead of being raised, so
    logging problems never interrupt the caller but are no longer invisible.

    Args:
        msg: Text to append to the log file.
    """
    try:
        with open(POST_LOG, "a", encoding="utf-8") as f:
            f.write(msg.rstrip() + "\n")
    except Exception as exc:
        # Deliberately non-fatal: only surface the problem.
        import sys

        print(f"[post_log] could not write to {POST_LOG}: {exc}", file=sys.stderr)
|
||||||
|
|
||||||
|
def log(msg: str):
    """Print ``msg`` to stdout prefixed with ``[YYYY-MM-DD HH:MM:SS] ``."""
    prefix = datetime.datetime.now().strftime("[%Y-%m-%d %H:%M:%S] ")
    print("".join([prefix, msg]))
|
||||||
|
|
||||||
|
# ───────────────────────── СЛОВАРИ / ФИЛЬТРЫ (Фаза 2) ────────────
|
||||||
|
def load_dictionary(path: pathlib.Path) -> dict:
    """Load ``"key": "value"`` pairs from a text file into a dict.

    Args:
        path: File to read (UTF-8).

    Returns:
        A dict of every ``"key": "value"`` pair found by regex in the file;
        an empty dict when the file does not exist. When a key repeats, the
        last occurrence wins.
    """
    if not path.exists():
        return {}
    content = path.read_text(encoding="utf-8")
    return dict(re.findall(r'"([^"]+)"\s*:\s*"([^"]+)"', content))
|
||||||
|
|
||||||
|
DICT = load_dictionary(DICT_FILE)
|
||||||
|
|
||||||
|
def translate_token(token: str) -> str:
    """Return the DICT translation of ``token``, or ``token`` itself if absent."""
    return DICT[token] if token in DICT else token
|
||||||
|
|
||||||
|
def load_exclusions(path: pathlib.Path) -> set:
    """Read exclusion tokens from a text file.

    If the file contains any double-quoted strings, those are the tokens;
    otherwise the text is split on commas, semicolons and line breaks.

    Args:
        path: File to read (UTF-8).

    Returns:
        A set of stripped, lower-cased, non-empty tokens; an empty set when
        the file does not exist.
    """
    if not path.exists():
        return set()
    raw = path.read_text(encoding="utf-8")
    candidates = re.findall(r'"([^"]+)"', raw, flags=re.S)
    if not candidates:
        candidates = re.split(r"[,;\n\r]+", raw)
    normalized = (piece.strip().lower() for piece in candidates)
    return {piece for piece in normalized if piece}
|
||||||
|
|
||||||
|
EXCLUSIONS = load_exclusions(EXCL_FILE)
|
||||||
|
|
||||||
|
def materials_from_details_json(details: dict) -> list[str]:
    """Collect every string stored under a ``"material"`` key, at any depth.

    Dicts and lists are traversed recursively in iteration order. A
    ``"material"`` entry whose value is not a string is itself traversed
    rather than collected.

    Args:
        details: Nested dict/list structure; a falsy value yields ``[]``.

    Returns:
        The material strings in the order they are encountered.
    """
    def _iter_materials(node):
        if isinstance(node, dict):
            for key, value in node.items():
                if key == "material" and isinstance(value, str):
                    yield value
                else:
                    yield from _iter_materials(value)
        elif isinstance(node, list):
            for child in node:
                yield from _iter_materials(child)

    return list(_iter_materials(details or {}))
|
||||||
|
|
||||||
|
def materials_match_exclusions(details: dict, exclusion_tokens: set) -> bool:
    """Tell whether any exclusion token occurs in the product's materials.

    The material strings gathered by ``materials_from_details_json`` are
    joined with newlines and lower-cased; each token is then checked as a
    substring of that text.

    Args:
        details: Nested product-details structure.
        exclusion_tokens: Tokens to look for.

    Returns:
        True if at least one token is a substring of the lower-cased
        materials text; False otherwise or when there are no tokens.
    """
    if not exclusion_tokens:
        return False
    haystack = "\n".join(materials_from_details_json(details)).lower()
    for token in exclusion_tokens:
        if token in haystack:
            return True
    return False
|
||||||
|
|
||||||
|
# ───────────────────────── ФОРМАТТЕРЫ (Фаза 2) ───────────────────
|
||||||
|
def _parse_json_value(val):
|
||||||
|
if isinstance(val, (dict, list)) or val is None:
|
||||||
|
return val
|
||||||
|
if isinstance(val, str):
|
||||||
|
s = val.strip()
|
||||||
|
if not s:
|
||||||
|
return val
|
||||||
|
try:
|
||||||
|
return json.loads(s)
|
||||||
|
except Exception:
|
||||||
|
return val
|
||||||
|
return val
|
||||||
|
|
||||||
|
def flatten_block(block_name, data):
    """Flatten one block into ``"<block_name>.<key>"`` entries.

    For the ``productGallery`` block, the ``mediaList`` key is special: the
    ``content.url`` of each media entry is collected and stored, newline
    joined, under ``"productGallery.urls"``, and the function returns at that
    point. Media entries that are not dicts are skipped instead of raising.

    Args:
        block_name: Name used as the key prefix.
        data: Block contents; anything that is not a dict yields ``{}``.

    Returns:
        A flat dict of the block's values.
    """
    if not isinstance(data, dict):
        return {}
    flat = {}
    for k, v in data.items():
        if block_name == "productGallery" and k == "mediaList":
            if isinstance(v, list):
                urls = []
                for item in v:
                    # Entries may be null or otherwise malformed; ignore them.
                    content = item.get("content", {}) if isinstance(item, dict) else None
                    if isinstance(content, dict) and "url" in content:
                        urls.append(content["url"])
                flat["productGallery.urls"] = "\n".join(urls)
            # NOTE(review): the early return is assumed to sit in the
            # mediaList branch (indentation was lost in the reviewed view);
            # keys that follow mediaList are therefore not emitted - confirm.
            return flat
        flat[f"{block_name}.{k}"] = v
    return flat
|
||||||
|
|
||||||
|
def format_keyfacts(raw_keyfacts):
    """Render a key-facts list as newline-separated text.

    The first line is the ``name`` of the first entry (``"Właściwości"`` when
    that entry has no ``name`` key); it is followed by every non-empty
    ``label`` in order. Falsy entries are treated as empty dicts.

    Args:
        raw_keyfacts: List of dict entries; any non-list yields ``""``.

    Returns:
        The joined text, or ``""`` for a non-list or empty list.
    """
    if not isinstance(raw_keyfacts, list):
        return ""
    lines = []
    for position, fact in enumerate(raw_keyfacts):
        entry = fact or {}
        if position == 0:
            # Only the first entry contributes the header line.
            lines.append(entry.get("name", "Właściwości"))
        label = entry.get("label")
        if label:
            lines.append(label)
    return "\n".join(lines)
|
||||||
|
|
||||||
|
def _fmt_float(x):
|
||||||
|
try:
|
||||||
|
return f"{float(x):.2f}".rstrip("0").rstrip(".")
|
||||||
|
except Exception:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def _collect_packaging_total_kg(packaging):
|
||||||
|
total = 0.0
|
||||||
|
if not isinstance(packaging, dict):
|
||||||
|
return total
|
||||||
|
content = (packaging.get("contentProps") or {}).get("packages") or []
|
||||||
|
for pkg in content:
|
||||||
|
qty = ((pkg.get("quantity") or {}).get("value")) or 1
|
||||||
|
ms = pkg.get("measurements") or []
|
||||||
|
for block in ms:
|
||||||
|
if not isinstance(block, list):
|
||||||
|
continue
|
||||||
|
weight_lbl = next((m for m in block if (m.get("type") == "weight" or m.get("label") == "Waga")), None)
|
||||||
|
if weight_lbl and isinstance(weight_lbl.get("value"), (int, float)):
|
||||||
|
total += float(weight_lbl["value"]) * (qty or 1)
|
||||||
|
return total
|
||||||
|
|
||||||
|
def format_dimensions(raw_dim_props, with_html=False, translated=False):
    """Render product dimensions followed by packaging details.

    The output has a "Wymiary" section (one ``name: measure`` line per
    dimension) and an "Opakowanie" section listing, per package: its name,
    article number, measurement lines and package count.

    Args:
        raw_dim_props: Dict with optional ``dimensions`` (list of dicts) and
            ``packaging`` (dict with ``contentProps.packages``).
        with_html: Join lines with ``<br/>`` and wrap section titles in
            ``<b>`` instead of producing plain text.
        translated: Pass titles and labels through ``translate_token``.

    Returns:
        The formatted text, or "" when *raw_dim_props* is not a dict.
    """
    if not isinstance(raw_dim_props, dict):
        return ""
    br = "<br/>" if with_html else "\n"

    def _tr(token):
        return translate_token(token) if translated else token

    def _bold(text):
        return f"<b>{text}</b>" if with_html else text

    lines = [_bold(_tr("Wymiary"))]

    for d in raw_dim_props.get("dimensions") or []:
        if not isinstance(d, dict):
            continue
        name = d.get("name") or ""
        meas = d.get("measure") or ""
        if not name and not meas:
            continue
        lines.append(f"{_tr(name)}: {meas}".strip())

    # Spacer + packaging header are always emitted, even with no packages.
    lines.append(br if with_html else "")
    lines.append(_bold(_tr("Opakowanie")))

    pack = raw_dim_props.get("packaging")
    content_props = pack.get("contentProps") if isinstance(pack, dict) else None
    packages = content_props.get("packages") if isinstance(content_props, dict) else None
    if not isinstance(packages, list):
        packages = []

    for pkg in packages:
        if not isinstance(pkg, dict):
            continue

        name = pkg.get("name") or ""
        if name:
            lines.append(str(name))

        article = pkg.get("articleNumber")
        art = article.get("value") if isinstance(article, dict) else None
        if art:
            lines.append(_tr("Numer artykułu"))
            lines.append(f"{art}")

        measurements = pkg.get("measurements")
        for block in measurements if isinstance(measurements, list) else []:
            if not isinstance(block, list):
                continue
            for m in block:
                if not isinstance(m, dict):
                    continue
                lbl = m.get("label") or ""
                txt = m.get("text") or ""
                if translated and lbl:
                    lbl = translate_token(lbl)
                if lbl or txt:
                    # Drop the dangling ": " when either side is empty.
                    lines.append(f"{lbl}: {txt}".strip(": "))

        quantity = pkg.get("quantity")
        q_val = quantity.get("value") if isinstance(quantity, dict) else None
        if q_val:
            lines.append(f"{_tr('Paczka(i)')}: {q_val}")

    if with_html:
        s = br.join(lines)
        token = re.escape(br)
        # Collapse runs of line breaks to at most one empty line.
        s = re.sub(rf"(?:{token}){{2,}}", br * 2, s)
        # Trim whole leading/trailing <br/> tokens only. str.strip("<br/>")
        # treats its argument as a character set and used to eat the opening
        # "<b>" and closing "</b>" of the section titles.
        return re.sub(rf"^(?:{token})+|(?:{token})+$", "", s)
    return "\n".join(lines).strip()
|
||||||
|
|
||||||
|
def format_product_details(raw_details, add_summary_desc="", with_html=False, skip_assembly=True):
    """Render the product-details block as text or simple HTML.

    Sections, in order: optional summary, "Informacje o produkcie"
    (paragraphs, designer, article number), "Dobrze wiedzieć",
    "Materiały i pielęgnacja" and the safety/compliance section. Sections
    other than the first are emitted only when they have content.

    Args:
        raw_details: The ``productDetailsProps`` dict.
        add_summary_desc: Optional summary placed before the first section.
        with_html: Join lines with ``<br/>`` and wrap section titles in
            ``<b>`` instead of producing plain text.
        skip_assembly: Accepted for interface compatibility; this function
            does not read it.

    Returns:
        The formatted text. When *raw_details* is not a dict,
        *add_summary_desc* is returned unchanged.
    """
    if not isinstance(raw_details, dict):
        return add_summary_desc
    br = "<br/>" if with_html else "\n"
    spacer = br if with_html else ""

    def _bold(text):
        return f"<b>{text}</b>" if with_html else text

    out = []

    if add_summary_desc:
        out.append(add_summary_desc)
        out.append(spacer)

    out.append(_bold("Informacje o produkcie"))
    pd = raw_details.get("productDescriptionProps") or {}
    out.extend(pd.get("paragraphs") or [])

    dlabel = pd.get("designerLabel")
    dname = pd.get("designerName")
    if dlabel and dname:
        out.extend([dlabel, dname])

    if raw_details.get("productId"):
        out.extend(["Numer artykułu", raw_details["productId"]])

    acc = raw_details.get("accordionObject") or {}
    gk = ((acc.get("goodToKnow") or {}).get("contentProps") or {}).get("goodToKnow") or []
    if gk:
        out.append(spacer)
        out.append(_bold("Dobrze wiedzieć"))
        for item in gk:
            txt = item.get("text") if isinstance(item, dict) else None
            if txt:
                out.append(txt)

    mac = (acc.get("materialsAndCare") or {}).get("contentProps") or {}
    mats = mac.get("materials") or []
    care = mac.get("careInstructions") or []

    if mats or care:
        out.append(spacer)
        out.append(_bold("Materiały i pielęgnacja"))

    if mats:
        out.append("Materiały")
        for m in mats:
            if not isinstance(m, dict):
                continue
            ptype = m.get("productType", "")
            for mat in m.get("materials") or []:
                material = mat.get("material", "") if isinstance(mat, dict) else ""
                # The product type is repeated before every material line.
                if ptype:
                    out.append(ptype)
                if material:
                    out.append(material)

    if care:
        out.append(mac.get("detailsCareText", "Pielęgnacja"))
        for c in care:
            if not isinstance(c, dict):
                continue
            ptype = c.get("productType", "")
            for t in c.get("texts") or []:
                if ptype:
                    out.append(ptype)
                out.append(t)

    safety = (raw_details.get("safetyAndCompliance") or {}).get("contentProps") or {}
    sc = safety.get("safetyAndCompliance") or []
    if sc:
        out.append(spacer)
        # The heading previously contained a Cyrillic "с" instead of the
        # Polish preposition "z".
        out.append(_bold("Bezpieczeństwo i zgodność z przepisami"))
        for entry in sc:
            txt = entry.get("text") if isinstance(entry, dict) else None
            if txt:
                out.append(txt)

    # Coerce to str so numeric ids or stray non-string values cannot break join.
    parts = [str(x) for x in out if x is not None]
    if with_html:
        s = br.join(parts)
        token = re.escape(br)
        # Collapse runs of line breaks to at most one empty line.
        s = re.sub(rf"(?:{token}){{2,}}", br * 2, s)
        # Trim whole <br/> tokens only. str.strip("<br/>") strips a character
        # set and used to eat leading/trailing "b", "r", "<", "/", ">" of the
        # real text (e.g. "regał" became "egał").
        return re.sub(rf"^(?:{token})+|(?:{token})+$", "", s)
    return "\n".join(parts).strip()
|
||||||
|
|
||||||
|
def build_variant_color_measure(desc: str, type_name: str, measurement: str) -> str:
    """Build a "<Description>, <measurement>" label for a variant.

    A leading *type_name* (case-insensitive, plus any following separators)
    is removed from *desc*; what remains is capitalised and joined with the
    measurement. If nothing alphanumeric remains, only the measurement is
    returned.
    """
    text = desc or ""
    type_prefix = (type_name or "").strip()
    if type_prefix:
        text = re.sub(
            r"^\s*" + re.escape(type_prefix) + r"[\s,;:\-–—/]*",
            "",
            text,
            flags=re.IGNORECASE,
        )

    # Leftovers made only of punctuation/whitespace count as empty.
    has_content = re.search(r"[0-9A-Za-zА-Яа-яЁёÀ-ž]", text or "") is not None
    text = text.strip() if has_content else ""

    meas = (measurement or "").strip()
    if not text:
        return meas

    text = text[:1].upper() + text[1:]
    if meas:
        return f"{text}, {meas}"
    return text
|
||||||
|
|
||||||
|
def format_dimensions_only(raw_dim_props, with_html=False, translated=False):
    """Render only the dimensions ("Wymiary") section, without packaging.

    Used for the ``originalComposition`` field.

    Args:
        raw_dim_props: Dict with an optional ``dimensions`` list of dicts
            carrying ``name`` and ``measure``.
        with_html: Join lines with ``<br/>`` and wrap the title in ``<b>``.
        translated: Pass the title and dimension names through
            ``translate_token``.

    Returns:
        The formatted text, or "" when *raw_dim_props* is not a dict.
    """
    if not isinstance(raw_dim_props, dict):
        return ""
    br = "<br/>" if with_html else "\n"

    def _tr(token):
        return translate_token(token) if translated else token

    title = _tr("Wymiary")
    lines = [f"<b>{title}</b>" if with_html else title]

    for d in raw_dim_props.get("dimensions") or []:
        if not isinstance(d, dict):
            continue
        name = d.get("name") or ""
        meas = d.get("measure") or ""
        if not name and not meas:
            continue
        lines.append(f"{_tr(name)}: {meas}".strip())

    if with_html:
        s = br.join(lines)
        token = re.escape(br)
        s = re.sub(rf"(?:{token}){{2,}}", br * 2, s)
        # Trim whole <br/> tokens only. str.strip("<br/>") strips a character
        # set and used to remove the title's opening "<b>" / closing "</b>".
        return re.sub(rf"^(?:{token})+|(?:{token})+$", "", s)
    return "\n".join(lines).strip()
|
||||||
|
|
||||||
|
# ───────────────────────── ФАЗА 1: IKEA API → FLATTENED ──────────
|
||||||
|
def fetch_category_json(category_id: str) -> dict:
    """POST a category search to the IKEA search API (via the proxy) and return the parsed JSON.

    Raises whatever ``requests`` raises for transport errors, and an HTTP
    error from ``raise_for_status`` for non-success status codes.
    """
    experiments = {
        "listing_3547_filter_hnf_sticky": None,
        "listing_3332_collapsed_filter_bar": None,
        "discount_percentage": None,
        "listing_3790_simplify_rating_stars": None
    }
    visitor_attributes = {
        "market": "pl",
        "device": "desktop",
        "deviceVendor": "Apple",
        "deviceType": "desktop",
        "isLoggedIn": False,
        "environment": "prod",
        "browser": "Chrome",
        "os": "Mac OS",
        "language": "pl",
        "feedMarket": "pl-PL",
        "locale": "pl-PL",
        "customerType": "guest",
        "isEntranceVisit": False,
        "pip_to_pip_src": ""
    }
    # A single request window of up to 1000 items starting at offset 0.
    primary_area = {
        "component": "PRIMARY_AREA",
        "columns": 4,
        "types": {"main": "PRODUCT", "breakouts": ["PLANNER", "LOGIN_REMINDER", "MATTRESS_WARRANTY"]},
        "filterConfig": {"max-num-filters": 6},
        "window": {"size": 1000, "offset": 0},
        "forceFilterCalculation": True
    }
    payload = {
        "searchParameters": {"input": category_id, "type": "CATEGORY"},
        "zip": "05-090",
        "store": "188",
        "isUserLoggedIn": False,
        "optimizely": experiments,
        "optimizelyAttributes": visitor_attributes,
        "components": [primary_area],
    }

    log(f"POST {SEARCH_URL} category_id={category_id}")
    response = requests.post(
        SEARCH_URL,
        headers=API_HEADERS,
        json=payload,
        timeout=REQUEST_TIMEOUT,
        proxies=PROXIES_WEB,
    )
    log(f"→ Status: {response.status_code}")
    response.raise_for_status()
    return response.json()
|
||||||
|
|
||||||
|
def extract_products_from_api(data: dict) -> list[dict]:
    """Flatten a search API response into one row per product and variant.

    Each row carries: ``id``, ``pipUrl``, ``availability_0_status``,
    ``availability_1_status``, ``availability_1_store``, ``price`` and
    ``category_path``. Variants (``gprDescription.variants``) inherit the
    category path of their parent product.

    Missing or null sub-structures (``availability``, ``salesPrice``,
    ``categoryPath``, ``gprDescription``) produce empty fields instead of
    raising, so one odd item does not discard the whole category.

    Args:
        data: Parsed JSON response; products are read from
            ``data["results"][*]["items"][*]["product"]``.

    Returns:
        A list of row dicts, main product first, then its variants.
    """

    def _as_dict(value):
        return value if isinstance(value, dict) else {}

    def _as_list(value):
        return value if isinstance(value, list) else []

    def _extract_one(prod, category_path):
        av = [_as_dict(entry) for entry in _as_list(prod.get("availability"))]
        current = _as_dict(_as_dict(prod.get("salesPrice")).get("current"))
        return {
            "id": prod.get("id") or prod.get("itemNoGlobal") or prod.get("itemNo"),
            "pipUrl": prod.get("pipUrl", ""),
            "availability_0_status": av[0].get("status") if len(av) > 0 else "",
            "availability_1_status": av[1].get("status") if len(av) > 1 else "",
            "availability_1_store": av[1].get("store") if len(av) > 1 else "",
            "price": current.get("wholeNumber", "") or "",
            "category_path": category_path,
        }

    products = []
    for result in _as_list(_as_dict(data).get("results")):
        for item in _as_list(_as_dict(result).get("items")):
            product = _as_dict(item).get("product")
            if not isinstance(product, dict) or not product:
                continue

            category_path = " / ".join(
                str(_as_dict(c).get("name") or "")
                for c in _as_list(product.get("categoryPath"))
            )

            # Main product first, then each of its variants.
            products.append(_extract_one(product, category_path))
            variants = _as_dict(product.get("gprDescription")).get("variants")
            for variant in _as_list(variants):
                if isinstance(variant, dict):
                    products.append(_extract_one(variant, category_path))
    return products
|
||||||
|
|
||||||
|
def phase1_collect_flattened():
    """Phase 1: query the API for every category URL and persist the flat rows.

    Reads category URLs from ``CAT_FILE`` (one per line), extracts the
    numeric category id from the end of each URL, fetches and flattens the
    products, then writes all rows to ``OUT_JSON`` and ``OUT_XLSX``.

    Returns:
        The list of row dicts; an empty list when the category file is
        missing or empty, or when no products were collected.
    """
    if not CAT_FILE.exists():
        log("✖ leaf_categories.txt не найден.")
        return []

    raw_lines = CAT_FILE.read_text(encoding="utf-8").splitlines()
    categories = [stripped for stripped in map(str.strip, raw_lines) if stripped]
    if not categories:
        log("✖ Нет категорий для обработки.")
        return []

    all_products = []
    for idx, url in enumerate(categories, 1):
        log(f"[{idx}/{len(categories)}] {url}")
        id_match = re.search(r"-([0-9]+)/?$", url.rstrip("/"))
        if id_match is None:
            log("⚠️ Не найден ID категории в URL")
            continue
        cat_id = id_match.group(1)
        # A failing category is logged and skipped so the rest still run.
        try:
            items = extract_products_from_api(fetch_category_json(cat_id))
            all_products.extend(items)
            log(f"✅ {len(items)} товаров добавлено из категории {cat_id}")
        except Exception as e:
            log(f"❌ Ошибка при категории {cat_id}: {e}")

    if not all_products:
        log("⚠️ Нет товаров для сохранения.")
        return []

    # JSON dump of every collected row.
    serialized = json.dumps(all_products, ensure_ascii=False, indent=2)
    OUT_JSON.write_text(serialized, encoding="utf-8")
    log(f"💾 JSON сохранён → {OUT_JSON.name} ({len(all_products)} записей)")

    # Excel sheet; the column order comes from the first row's keys.
    wb = Workbook()
    ws = wb.active
    ws.title = "IKEA_flat"
    headers = list(all_products[0])
    ws.append(headers)
    for product in all_products:
        ws.append([product.get(column, "") for column in headers])
    wb.save(OUT_XLSX)
    log(f"📊 Excel сохранён → {OUT_XLSX.name}")

    return all_products
|
||||||
|
|
||||||
|
# ───────────────────────── ФАЗА 2: PIP карточки → records ────────
|
||||||
|
def _ceil_price(v):
|
||||||
|
try:
|
||||||
|
return int(math.ceil(float(v)))
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _ceil_int(v):
|
||||||
|
try:
|
||||||
|
return int(math.ceil(float(v)))
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def build_variant(row: dict) -> dict:
    """Convert one extracted product row into the payload item sent to the API.

    Args:
        row: Flat dict produced by ``extract_data`` (keys such as
            ``productSummary.visibleItemNo``, ``buyModule.productPrice``,
            ``productGallery.urls``, ``total brutto``).

    Returns:
        ``{"category": {...}, "brand": {...}, "variant": {...}}``.
    """
    visible = row.get("productSummary.visibleItemNo") or ""
    sku = str(visible).replace(" ", "")
    category_name = row.get("categoryBreadcrumb") or ""

    cdesc = row.get("pipPricePackage.productDescription") or ""
    tname = row.get("stockcheckSection.typeName") or ""
    meas = row.get("pipPricePackage.measurementText") or ""

    # Derive colour and size from their own sources. The previous approach
    # joined them into "colour, measure" and split on the first comma, which
    # put a measurement-only value into ``color`` and mangled both fields
    # whenever the description or the measurement itself contained a comma
    # (e.g. "2,5 m").
    color = build_variant_color_measure(cdesc, tname, "")
    size = meas.strip()

    cost = _ceil_price(row.get("buyModule.productPrice"))
    url = row.get("url") or ""
    name = row.get("originalName") or row.get("buyModule.productName") or ""
    desc_html = row.get("productInformationSection.productDetailsProps_formatted_html") or ""
    composition_html = row.get("productInformationSection.dimensionsOnly_formatted_html_translated") or ""

    # Gallery URLs are stored as one newline-separated string.
    raw_imgs = row.get("productGallery.urls") or ""
    imgs = [x for x in raw_imgs.split("\n") if x.strip()] if isinstance(raw_imgs, str) else []

    in_stock = bool(row.get("availabilityGroup.serverOnlineSellable")) or bool(row.get("buyModule.onlineSellable"))
    weight_kg = _ceil_int(row.get("total brutto"))

    variant = {
        "status_id": 1,
        "color": color,
        "sku": sku,
        "size": size,
        "cost": cost,
        "originalUrl": url,
        "originalName": name,
        "originalDescription": desc_html,
        "originalComposition": composition_html,
        "images": imgs,
        "inStock": in_stock,
        "weight": weight_kg if weight_kg is not None else 0,
    }

    return {
        "category": {"name": category_name},
        "brand": {"name": "ikea"},
        "variant": variant,
    }
|
||||||
|
|
||||||
|
def post_payload(payload: dict) -> dict:
    """POST *payload* as UTF-8 JSON to ``POST_URL`` and log the exchange.

    Args:
        payload: JSON-serialisable dict.

    Returns:
        ``{"ok": bool, "status": int, "response": str}`` when a response was
        received, or ``{"ok": False, "status": None, "error": str}`` when the
        request itself failed. This function does not raise for request
        errors.
    """
    headers = {"Content-Type": "application/json"}
    if POST_API_KEY:
        headers["Authorization"] = f"Bearer {POST_API_KEY}"

    body = json.dumps(payload, ensure_ascii=False)

    # Never write the bearer token to the log file; log a redacted copy.
    logged_headers = {
        key: ("Bearer ***" if key.lower() == "authorization" else value)
        for key, value in headers.items()
    }
    _post_log(f"→ POST {POST_URL}\nHeaders: {logged_headers}\nBody: {body}")

    try:
        r = requests.post(POST_URL, headers=headers, data=body.encode("utf-8"), timeout=POST_TIMEOUT)
        text = r.text
        _post_log(f"← {r.status_code}\n{text}\n{'-'*60}")
        ok = 200 <= r.status_code < 300
        return {"ok": ok, "status": r.status_code, "response": text}
    except Exception as e:
        # Best-effort boundary: report the failure to the caller and the log.
        _post_log(f"× ERROR: {e}\n{'-'*60}")
        return {"ok": False, "status": None, "error": str(e)}
|
||||||
|
|
||||||
|
def safe_cell(val):
    """Make *val* storable in a spreadsheet cell.

    Dicts and lists are serialised to JSON text, None becomes "", and every
    other value is returned unchanged.
    """
    if val is None:
        return ""
    if isinstance(val, (dict, list)):
        return json.dumps(val, ensure_ascii=False)
    return val
|
||||||
|
|
||||||
|
def extract_data(url: str) -> dict:
    """Fetch a product page and flatten its hydration JSON into one row.

    The page is requested through the proxy, the element matched by
    ``CSS_SELECTOR`` is located, and its ``data-hydration-props`` attribute
    is unescaped and parsed as JSON. Every block listed in ``BLOCKS`` is
    flattened into the result, several formatted/derived columns are added,
    and the result is finally restricted to ``KEEP_COLUMNS``.

    Returns:
        The filtered row dict (always including ``originalName``). On any
        failure a dict ``{"url": url, "error": <message>}`` is returned
        instead of raising.
    """
    try:
        resp = requests.get(
            url, headers=HEADERS, timeout=REQUEST_TIMEOUT_GET,
            proxies=PROXIES_WEB, allow_redirects=True
        )
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        target = soup.select_one(CSS_SELECTOR)
        if not target:
            return {"url": url, "error": "CSS selector not found"}

        raw = target.get("data-hydration-props")
        if not raw:
            return {"url": url, "error": "data-hydration-props not found"}

        # NOTE(review): the attribute value is HTML-unescaped here before
        # parsing; confirm the parser has not already decoded entities.
        decoded = html.unescape(raw)
        full_json = json.loads(decoded)

        result = {"url": url}
        for block in BLOCKS:
            result.update(flatten_block(block, full_json.get(block, {})))

        kf_json = _parse_json_value(result.get("keyFacts.keyFacts"))
        dim_json = _parse_json_value(result.get("productInformationSection.dimensionProps"))
        det_json = _parse_json_value(result.get("productInformationSection.productDetailsProps"))

        result["keyFacts.keyFacts_formatted"] = format_keyfacts(kf_json)

        # Full dimensions (including packaging) as translated HTML.
        html_trans = format_dimensions(dim_json, with_html=True, translated=True)
        # Restores a missing "<" when the text starts with "b>".
        if isinstance(html_trans, str) and html_trans.startswith("b>"):
            html_trans = "<" + html_trans
        result["productInformationSection.dimensionProps_formatted_html_translated"] = html_trans

        # Only the "Wymiary" section (no packaging) as HTML -> used for originalComposition.
        dims_only_html = format_dimensions_only(dim_json, with_html=True, translated=True)
        result["productInformationSection.dimensionsOnly_formatted_html_translated"] = dims_only_html

        # Plain-text, untranslated version of the dimensions.
        result["productInformationSection.dimensionProps_formatted"] = format_dimensions(dim_json, with_html=False, translated=False)

        # Total packaging weight, stored as a short formatted string.
        total_kg = _collect_packaging_total_kg((dim_json or {}).get("packaging") or {})
        result["total brutto"] = _fmt_float(total_kg)

        summary_desc = result.get("productSummary.description", "") or ""
        result["productInformationSection.productDetailsProps_formatted"] = format_product_details(det_json, add_summary_desc=summary_desc, with_html=False, skip_assembly=True)
        result["productInformationSection.productDetailsProps_formatted_html"] = format_product_details(det_json, add_summary_desc=summary_desc, with_html=True, skip_assembly=True)

        desc = result.get("pipPricePackage.productDescription", "") or ""
        tname = result.get("stockcheckSection.typeName", "") or ""
        meas = result.get("pipPricePackage.measurementText", "") or ""
        # The key spelling ("prduct...") is kept as-is; it is a runtime column name.
        result["prductVariantColorMeasure"] = build_variant_color_measure(desc, tname, meas)

        # Breadcrumb taken from the first ld+json script that holds a BreadcrumbList.
        breadcrumb = None
        for tag in soup.find_all("script", attrs={"type": lambda t: t and "ld+json" in t}):
            try:
                data = json.loads(tag.string)
            except Exception:
                # Unparseable or empty script tags are skipped.
                continue
            if isinstance(data, list):
                data = next((d for d in data if isinstance(d, dict) and d.get("@type") == "BreadcrumbList"), None)
            if isinstance(data, dict) and data.get("@type") == "BreadcrumbList":
                items = data.get("itemListElement", [])
                names = [it.get("name", "") for it in items]
                breadcrumb = "/".join(names)
                break
        if breadcrumb:
            result["categoryBreadcrumb"] = breadcrumb

        # Whitelist the columns, then add originalName = "<productName> <typeName>".
        filtered = {k: result.get(k) for k in KEEP_COLUMNS if k != "originalName"}
        pn = (result.get("buyModule.productName") or "").strip()
        tn = (result.get("stockcheckSection.typeName") or "").strip()
        filtered["originalName"] = (f"{pn} {tn}".strip() or pn or tn)

        return filtered

    except Exception as e:
        # Any failure (network, parsing, formatting) is reported in the row.
        return {"url": url, "error": str(e)}
|
||||||
|
|
||||||
|
# ───────────────────────── ОБЩИЙ PIPELINE ────────────────────────
|
||||||
|
def run_pipeline():
    """Run both phases: collect flat API rows, then scrape every product page.

    Phase 1 (``phase1_collect_flattened``) yields one row per product or
    variant. Phase 2 visits each unique ``pipUrl`` in first-seen order,
    appends the extracted row plus the ``flat.*`` columns to the output
    workbook, and batches the items that pass the filters (price within
    20..2000, total weight not above 30, no excluded materials, status
    HIGH_IN_STOCK in the flattened sheet) for JSON saving and/or POSTing.
    """
    log(f"Запуск {datetime.datetime.now()} pid={os.getpid()}")

    # PHASE 1: collect flattened rows from the API.
    flat_items = phase1_collect_flattened()

    # pipUrl -> flat.* columns, for joining onto the Phase 2 rows; the last
    # row seen for a URL wins. ``links_in_order`` keeps unique URLs in
    # first-seen order.
    flat_by_url = {}
    links_in_order = []
    for row in flat_items:
        url = row.get("pipUrl") or ""
        if not url:
            continue
        # Dict membership instead of scanning the list keeps this linear.
        if url not in flat_by_url:
            links_in_order.append(url)
        flat_by_url[url] = {
            "flat.id": row.get("id", ""),
            "flat.price": row.get("price", ""),
            "flat.availability_0_status": row.get("availability_0_status", ""),
            "flat.availability_1_status": row.get("availability_1_status", ""),
            "flat.availability_1_store": row.get("availability_1_store", ""),
            "flat.category_path": row.get("category_path", ""),
        }

    if not links_in_order:
        log("⚠️ Нет ссылок для Фазы 2.")
        return

    # PHASE 2: product pages -> records workbook (+ JSON batches / POST).
    save_json = ask_bool("SAVE_JSON (сохранять JSON батчи?)", "1")
    send_json = ask_bool("SEND_JSON (отправлять на API?)", "1")

    # Availability statuses are re-read from flattened_products.xlsx.
    flat_file = os.path.join(BASE_DIR, "json_raw", "flattened_products.xlsx")
    avail_map = {}

    if os.path.exists(flat_file):
        wb_flat = load_workbook(flat_file, read_only=True)
        try:
            ws_flat = wb_flat.active

            # Locate the needed columns from the header row; an empty sheet
            # yields no header instead of raising StopIteration.
            header_row = next(ws_flat.iter_rows(min_row=1, max_row=1), None)
            sheet_headers = [c.value for c in header_row] if header_row else []
            url_col = sheet_headers.index("pipUrl") if "pipUrl" in sheet_headers else None
            a0_col = sheet_headers.index("availability_0_status") if "availability_0_status" in sheet_headers else None
            a1_col = sheet_headers.index("availability_1_status") if "availability_1_status" in sheet_headers else None

            if url_col is not None and (a0_col is not None or a1_col is not None):
                for sheet_row in ws_flat.iter_rows(min_row=2, values_only=True):
                    sheet_url = sheet_row[url_col]
                    if not sheet_url:
                        continue
                    a0 = (sheet_row[a0_col] if a0_col is not None else "") or ""
                    a1 = (sheet_row[a1_col] if a1_col is not None else "") or ""
                    avail_map[sheet_url] = {
                        "availability_0_status": str(a0).strip(),
                        "availability_1_status": str(a1).strip(),
                    }
        finally:
            # Read-only workbooks keep the file open until closed explicitly.
            wb_flat.close()

        print(f"📦 Загружено {len(avail_map)} записей из flattened_products.xlsx")
    else:
        print("⚠️ Файл flattened_products.xlsx не найден, фильтр по HIGH_IN_STOCK не будет применён.")

    wb = Workbook()
    ws = wb.active
    ws.title = "IKEA Products"

    # Header = product-page columns (KEEP_COLUMNS) + flat.* columns.
    ws.append(KEEP_COLUMNS + FLAT_EXTRA_COLS)

    batch_items = []
    batch_index = 1

    def _save_json_batch(payload: dict, batch_index: int):
        """Write one batch payload to RECORDS_DIR and return its path."""
        fname = f"ikea_batch_{_now_tag()}_{batch_index:04d}.json"
        fpath = RECORDS_DIR / fname
        fpath.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
        print(f"💾 JSON saved: {fname}")
        return fpath

    def flush_batch():
        """Save and/or send the pending items, then start a new batch."""
        nonlocal batch_items, batch_index
        if not batch_items:
            return
        payload = {"parserName": "ikea", "items": batch_items}
        if save_json:
            _save_json_batch(payload, batch_index)
        if send_json:
            res = post_payload(payload)
            ok = res.get("ok")
            print(f"POST batch {batch_index}: {'OK' if ok else 'FAIL'} (status={res.get('status')})")
        batch_index += 1
        batch_items = []

    log(f"Всего ссылок к обходу: {len(links_in_order)}")
    for idx, link in enumerate(links_in_order, 1):
        print(f"[{idx}/{len(links_in_order)}] {link}")
        row = extract_data(link)

        # Make sure the URL is present even if extraction dropped it.
        row["url"] = link

        # Excel: product-page columns followed by the flat.* columns.
        excel_row = [safe_cell(row.get(col, "")) for col in KEEP_COLUMNS]
        flat_extra = flat_by_url.get(link, {})
        excel_row.extend([flat_extra.get(col, "") for col in FLAT_EXTRA_COLS])
        ws.append(excel_row)

        try:
            price = float(row.get("buyModule.productPrice") or 0)
        except (TypeError, ValueError, OverflowError):
            price = 0.0

        try:
            total_kg = float(row.get("total brutto") or 0)
        except (TypeError, ValueError, OverflowError):
            total_kg = 0.0

        details_json = row.get("productInformationSection.productDetailsProps") or {}

        # HIGH_IN_STOCK check based on the flattened sheet.
        availability = avail_map.get(link)
        avail_0 = availability["availability_0_status"].upper() if availability else ""
        avail_1 = availability["availability_1_status"].upper() if availability else ""
        avail_ok = (avail_0 == "HIGH_IN_STOCK") or (avail_1 == "HIGH_IN_STOCK")

        # Filters, evaluated in order and short-circuited so the material
        # check only runs for items that pass price and weight.
        eligible = (
            (20 <= price <= 2000)
            and not (total_kg > 30)
            and not materials_match_exclusions(details_json, EXCLUSIONS)
            and avail_ok
        )
        if eligible:
            try:
                batch_items.append(build_variant(row))
            except Exception as e:
                _post_log(f"× build_variant error for {link}: {e}")

        # Autosave the workbook every 50 rows.
        if idx % 50 == 0:
            wb.save(OUTPUT_FILE)
            print(f"💾 autosave: {OUTPUT_FILE}")

        # Flush the batch once it reaches the size limit.
        if len(batch_items) >= BATCH_SIZE:
            flush_batch()

    # Final save and last (possibly partial) batch.
    wb.save(OUTPUT_FILE)
    print(f"\n✅ Excel готов: {OUTPUT_FILE}")

    flush_batch()
    print("🎯 Готово.")
|
||||||
|
|
||||||
|
# ───────────────────────── Точка входа ───────────────────────────
|
||||||
|
# Run the whole pipeline only when this file is executed as a script.
if __name__ == "__main__":
    run_pipeline()
|
||||||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user