IKEA: collect product data from the links in a text file.

This commit is contained in:
va1is 2025-08-01 12:05:02 +03:00
parent 20bd54dd3c
commit 790abe8b95
4 changed files with 201 additions and 0 deletions

Парсер_IKEA/links.txt Normal file

@@ -0,0 +1 @@
https://www.ikea.com/pl/pl/p/indira-narzuta-zoltobezowy-20582629/#content

Парсер_IKEA/main.py Normal file

@@ -0,0 +1,200 @@
import requests
import json
import os
import html
from bs4 import BeautifulSoup
from openpyxl import Workbook

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_FILE = os.path.join(BASE_DIR, "links.txt")
OUTPUT_FILE = os.path.join(BASE_DIR, "result.xlsx")

CSS_SELECTOR = ".pip-product__subgrid.product-pip.js-product-pip"
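# The selector above is assumed to match the product-page container whose
# data-hydration-props attribute holds the HTML-escaped JSON payload that
# extract_data() decodes below.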
BLOCKS = [
    "buyModule",
    "productSummary",
    "pipPricePackage",
    "productInformationSection",
    "keyFacts",
    "stockcheckSection",
    "availabilityGroup",
    "productGallery"
]
# ── which columns to keep ──────────────────────────────────────────
KEEP_COLUMNS = [
    "availabilityGroup.serverOnlineSellable",
    "availabilityGroup.storeHeader",
    "buyModule.onlineSellable",
    "buyModule.productName",
    "buyModule.productPrice",
    "buyModule.productType",
    "keyFacts.ariaLabels",
    "keyFacts.gaLabel",
    "keyFacts.keyFacts",
    "pipPricePackage.measurementText",
    "pipPricePackage.productDescription",
    "productGallery.urls",
    "productInformationSection.dimensionProps",
    "productInformationSection.productDetailsProps",
    "productSummary.description",
    "productSummary.visibleItemNo",
    "stockcheckSection.packagingProps",
    "stockcheckSection.typeName",
    "url",
    "categoryBreadcrumb",
]
def flatten_block(block_name, data):
    if not isinstance(data, dict):
        return {}
    flat = {}
    for k, v in data.items():
        '''
        # === 1. dimensionProps.images (disabled) ===
        if block_name == "productInformationSection" and k == "dimensionProps":
            if isinstance(v, dict):
                urls = []
                for img in v.get("images", []):
                    if isinstance(img, dict):
                        url = img.get("url")
                        if url:
                            urls.append(url)
                flat[f"{block_name}.{k}.images_urls"] = "\n".join(urls)
            continue
        '''
        # === 2. mediaList.content.url → productGallery.urls ===
        if block_name == "productGallery" and k == "mediaList":
            if isinstance(v, list):
                urls = []
                for item in v:
                    content = item.get("content", {}) if isinstance(item, dict) else {}
                    if isinstance(content, dict) and "url" in content:
                        urls.append(content["url"])
                flat["productGallery.urls"] = "\n".join(urls)
                return flat  # ⬅ return only the urls; ignore the remaining fields
            continue
        # === All other fields: kept as-is ===
        key = f"{block_name}.{k}"
        flat[key] = v
    return flat
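# For illustration, with an assumed input shape, flatten_block prefixes each
# key with its block name:
#   flatten_block("buyModule", {"productName": "INDIRA", "onlineSellable": True})
#   → {"buyModule.productName": "INDIRA", "buyModule.onlineSellable": True}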
def extract_data(url):
    """
    Returns a dict with the required fields for an IKEA product.
    + NEW: adds a 'categoryBreadcrumb' key of the form
      'Produkty/Tekstylia/Tekstylia do sypialni/Narzuty na łóżko'
      (taken from the JSON-LD BreadcrumbList).
    """
    try:
        response = requests.get(url, timeout=10,
                                headers={"User-Agent": "Mozilla/5.0"})
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # ── main JSON from data-hydration-props ────────────────────
        target = soup.select_one(CSS_SELECTOR)
        if not target:
            return {"url": url, "error": "CSS selector not found"}
        raw = target.get("data-hydration-props")
        if not raw:
            return {"url": url, "error": "data-hydration-props not found"}
        decoded = html.unescape(raw)
        full_json = json.loads(decoded)

        result = {"url": url}
        # pull out the blocks we need
        for block in BLOCKS:
            result.update(flatten_block(block, full_json.get(block, {})))

        # ── NEW: extract BreadcrumbList → categoryBreadcrumb ──────
        breadcrumb = None
        for tag in soup.find_all("script",
                                 attrs={"type": lambda t: t and "ld+json" in t}):
            try:
                data = json.loads(tag.string)
            except Exception:
                continue
            # if this is a JSON-LD array, look for the BreadcrumbList object in it
            if isinstance(data, list):
                data = next((d for d in data
                             if isinstance(d, dict)
                             and d.get("@type") == "BreadcrumbList"), None)
            if isinstance(data, dict) and data.get("@type") == "BreadcrumbList":
                items = data.get("itemListElement", [])
                names = [it.get("name", "") for it in items if isinstance(it, dict)]
                breadcrumb = "/".join(names)
                break  # found the block we need; exit the loop
        if breadcrumb:
            result["categoryBreadcrumb"] = breadcrumb
        return result
    except Exception as e:
        return {"url": url, "error": str(e)}
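# For reference, the JSON-LD shape the breadcrumb step expects (field names
# per schema.org; the values below are illustrative, not from a real page):
#   {"@type": "BreadcrumbList",
#    "itemListElement": [{"name": "Produkty"}, {"name": "Tekstylia"}]}
# would yield result["categoryBreadcrumb"] == "Produkty/Tekstylia".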
def main():
    # ── read the links ───────────────────────────────────────────
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        links = [line.strip() for line in f if line.strip()]
    rows = []

    # ---- COLUMN MODE -------------------------------------------
    # NEW: fixed list of columns (see KEEP_COLUMNS at the top)
    all_columns = KEEP_COLUMNS
    # OLD (restore if needed):
    # all_columns = set()  # ← accumulated every field
    # ------------------------------------------------------------

    print("🔍 Extracting data...")
    for idx, link in enumerate(links, 1):
        print(f"[{idx}/{len(links)}] {link}")
        row = extract_data(link)
        # NEW: keep only the whitelisted fields from KEEP_COLUMNS
        row = {k: v for k, v in row.items() if k in KEEP_COLUMNS}
        # OLD (restore if needed):
        # all_columns.update(row.keys())  # ← collected every key
        rows.append(row)

    # OLD (restore if needed):
    # if isinstance(all_columns, set):
    #     all_columns = sorted(all_columns)  # sorted everything

    def safe(val):
        """Converts dict / list to a JSON string; None → ''."""
        if isinstance(val, (dict, list)):
            return json.dumps(val, ensure_ascii=False)
        return "" if val is None else val

    print("📤 Saving the Excel file...")
    wb = Workbook()
    ws = wb.active
    ws.title = "IKEA Products"
    ws.append(all_columns)
    for row in rows:
        ws.append([safe(row.get(col, "")) for col in all_columns])
    wb.save(OUTPUT_FILE)
    print(f"\n✅ Done: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
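For orientation, a minimal standalone sketch of the payload-decoding step the parser depends on; the data-hydration-props value below is invented for illustration and is not real IKEA markup:

import html
import json

# Toy HTML-escaped JSON, standing in for a data-hydration-props attribute value.
raw = "{&quot;buyModule&quot;: {&quot;productName&quot;: &quot;INDIRA&quot;}}"
payload = json.loads(html.unescape(raw))
assert payload["buyModule"]["productName"] == "INDIRA"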

Binary file not shown.

Binary file not shown.