# MacOS_Parsers/Парсер_IKEA/main.py

import requests
import json
import os
import html
from bs4 import BeautifulSoup
from openpyxl import Workbook
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_FILE = os.path.join(BASE_DIR, "links.txt")
OUTPUT_FILE = os.path.join(BASE_DIR, "result.xlsx")
CSS_SELECTOR = ".pip-product__subgrid.product-pip.js-product-pip"
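
# links.txt should contain one IKEA product URL per line. The path below is
# hypothetical and only illustrates the expected format:
#   https://www.ikea.com/pl/pl/p/<product-name>-<article-number>/
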
BLOCKS = [
    "buyModule",
    "productSummary",
    "pipPricePackage",
    "productInformationSection",
    "keyFacts",
    "stockcheckSection",
    "availabilityGroup",
    "productGallery"
]
# ── which columns to keep ───────────────────────────────────────────
KEEP_COLUMNS = [
    "availabilityGroup.serverOnlineSellable",
    "availabilityGroup.storeHeader",
    "buyModule.onlineSellable",
    "buyModule.productName",
    "buyModule.productPrice",
    "buyModule.productType",
    "keyFacts.ariaLabels",
    "keyFacts.gaLabel",
    "keyFacts.keyFacts",
    "pipPricePackage.measurementText",
    "pipPricePackage.productDescription",
    "productGallery.urls",
    "productInformationSection.dimensionProps",
    "productInformationSection.productDetailsProps",
    "productSummary.description",
    "productSummary.visibleItemNo",
    "stockcheckSection.packagingProps",
    "stockcheckSection.typeName",
    "url",
    "categoryBreadcrumb",
]

def flatten_block(block_name, data):
    """Flatten one hydration block into {"block_name.key": value} pairs."""
    if not isinstance(data, dict):
        return {}
    flat = {}
    for k, v in data.items():
        # === 1. dimensionProps.images (disabled) ===
        # if block_name == "productInformationSection" and k == "dimensionProps":
        #     if isinstance(v, dict):
        #         urls = []
        #         for img in v.get("images", []):
        #             if isinstance(img, dict):
        #                 url = img.get("url")
        #                 if url:
        #                     urls.append(url)
        #         flat[f"{block_name}.{k}.images_urls"] = "\n".join(urls)
        #     continue
        # === 2. mediaList.content.url → productGallery.urls ===
        if block_name == "productGallery" and k == "mediaList":
            if isinstance(v, list):
                urls = []
                for item in v:
                    content = item.get("content", {}) if isinstance(item, dict) else {}
                    if isinstance(content, dict) and "url" in content:
                        urls.append(content["url"])
                flat["productGallery.urls"] = "\n".join(urls)
                return flat  # ⬅ return only the urls, ignore the remaining fields
            continue
        # === Remaining fields: keep as-is ===
        key = f"{block_name}.{k}"
        flat[key] = v
    return flat
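
# A quick illustration of flatten_block on hypothetical input (field names and
# values below are made up purely to show the shape of the output):
#   flatten_block("buyModule", {"productName": "Foo", "productPrice": 99})
#   → {"buyModule.productName": "Foo", "buyModule.productPrice": 99}
#   flatten_block("productGallery",
#                 {"mediaList": [{"content": {"url": "https://example.com/a.jpg"}},
#                                {"content": {"url": "https://example.com/b.jpg"}}]})
#   → {"productGallery.urls": "https://example.com/a.jpg\nhttps://example.com/b.jpg"}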

def extract_data(url):
    """
    Returns a dict with the IKEA product fields we need.
    + NEW: adds a 'categoryBreadcrumb' key of the form
      'Produkty/Tekstylia/Tekstylia do sypialni/Narzuty na łóżko'
      (taken from the JSON-LD BreadcrumbList).
    """
    try:
        response = requests.get(url, timeout=10,
                                headers={"User-Agent": "Mozilla/5.0"})
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # ── main JSON from data-hydration-props ────────────────────
        target = soup.select_one(CSS_SELECTOR)
        if not target:
            return {"url": url, "error": "CSS selector not found"}
        raw = target.get("data-hydration-props")
        if not raw:
            return {"url": url, "error": "data-hydration-props not found"}
        decoded = html.unescape(raw)
        full_json = json.loads(decoded)
        result = {"url": url}
        # pull out the blocks we need
        for block in BLOCKS:
            result.update(flatten_block(block, full_json.get(block, {})))
        # ── NEW: extract BreadcrumbList → categoryBreadcrumb ───────
        breadcrumb = None
        for tag in soup.find_all("script",
                                 attrs={"type": lambda t: t and "ld+json" in t}):
            try:
                data = json.loads(tag.string)
            except Exception:
                continue
            # if this is a JSON-LD array, look for the BreadcrumbList object in it
            if isinstance(data, list):
                data = next((d for d in data
                             if isinstance(d, dict)
                             and d.get("@type") == "BreadcrumbList"), None)
            if isinstance(data, dict) and data.get("@type") == "BreadcrumbList":
                items = data.get("itemListElement", [])
                names = [it.get("name", "") for it in items if isinstance(it, dict)]
                breadcrumb = "/".join(names)
                break  # found the block we need, exit the loop
        if breadcrumb:
            result["categoryBreadcrumb"] = breadcrumb
        return result
    except Exception as e:
        return {"url": url, "error": str(e)}
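
# Sketch of what extract_data returns for a product page (values are
# hypothetical; only the categoryBreadcrumb format comes from the docstring):
#   extract_data("https://www.ikea.com/pl/pl/p/...")
#   → {"url": "...",
#      "buyModule.productName": "...",
#      "productGallery.urls": "...",
#      "categoryBreadcrumb": "Produkty/Tekstylia/Tekstylia do sypialni/Narzuty na łóżko",
#      ...}
# On any failure it returns {"url": url, "error": "<message>"} instead.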

def main():
    # ── read the links ───────────────────────────────────────────
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        links = [line.strip() for line in f if line.strip()]
    rows = []
    # ---- COLUMN MODE -------------------------------------------
    # NEW: fixed column list (see KEEP_COLUMNS above)
    all_columns = KEEP_COLUMNS
    # OLD (restore if needed):
    # all_columns = set()  # ← accumulated every field
    # ------------------------------------------------------------
    print("🔍 Extracting data...")
    for idx, link in enumerate(links, 1):
        print(f"[{idx}/{len(links)}] {link}")
        row = extract_data(link)
        # NEW: keep only the fields listed in KEEP_COLUMNS
        row = {k: v for k, v in row.items() if k in KEEP_COLUMNS}
        # OLD (restore if needed):
        # all_columns.update(row.keys())  # ← collected every key
        rows.append(row)
    # OLD (restore if needed):
    # if isinstance(all_columns, set):
    #     all_columns = sorted(all_columns)  # sorted everything

    def safe(val):
        """Convert dict / list to a JSON string; None → ''."""
        if isinstance(val, (dict, list)):
            return json.dumps(val, ensure_ascii=False)
        return "" if val is None else val

    print("📤 Saving Excel...")
    wb = Workbook()
    ws = wb.active
    ws.title = "IKEA Products"
    ws.append(all_columns)
    for row in rows:
        ws.append([safe(row.get(col, "")) for col in all_columns])
    wb.save(OUTPUT_FILE)
    print(f"\n✅ Done: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()