IKEA: collect product data from product links listed in a text file.
This commit is contained in:
parent 20bd54dd3c
commit 790abe8b95
1
Парсер_IKEA/links.txt
Normal file
@@ -0,0 +1 @@
https://www.ikea.com/pl/pl/p/indira-narzuta-zoltobezowy-20582629/#content
200
Парсер_IKEA/main.py
Normal file
@@ -0,0 +1,200 @@
import requests
import json
import os
import html
from bs4 import BeautifulSoup
from openpyxl import Workbook

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_FILE = os.path.join(BASE_DIR, "links.txt")
OUTPUT_FILE = os.path.join(BASE_DIR, "result.xlsx")

CSS_SELECTOR = ".pip-product__subgrid.product-pip.js-product-pip"
BLOCKS = [
    "buyModule",
    "productSummary",
    "pipPricePackage",
    "productInformationSection",
    "keyFacts",
    "stockcheckSection",
    "availabilityGroup",
    "productGallery"
]
# ── which columns to keep ───────────────────────────────────────────
KEEP_COLUMNS = [
    "availabilityGroup.serverOnlineSellable",
    "availabilityGroup.storeHeader",
    "buyModule.onlineSellable",
    "buyModule.productName",
    "buyModule.productPrice",
    "buyModule.productType",
    "keyFacts.ariaLabels",
    "keyFacts.gaLabel",
    "keyFacts.keyFacts",
    "pipPricePackage.measurementText",
    "pipPricePackage.productDescription",
    "productGallery.urls",
    "productInformationSection.dimensionProps",
    "productInformationSection.productDetailsProps",
    "productSummary.description",
    "productSummary.visibleItemNo",
    "stockcheckSection.packagingProps",
    "stockcheckSection.typeName",
    "url",
    "categoryBreadcrumb",
]

def flatten_block(block_name, data):
    if not isinstance(data, dict):
        return {}

    flat = {}

    for k, v in data.items():

        '''
        # === 1. dimensionProps.images ===
        if block_name == "productInformationSection" and k == "dimensionProps":
            if isinstance(v, dict):
                urls = []
                for img in v.get("images", []):
                    if isinstance(img, dict):
                        url = img.get("url")
                        if url:
                            urls.append(url)
                flat[f"{block_name}.images_urls"] = "\n".join(urls)
            continue
        '''
        # === 2. mediaList.content.url → productGallery.urls ===
        if block_name == "productGallery" and k == "mediaList":
            if isinstance(v, list):
                urls = []
                for item in v:
                    content = item.get("content", {})
                    if isinstance(content, dict) and "url" in content:
                        urls.append(content["url"])
                flat["productGallery.urls"] = "\n".join(urls)
                return flat  # ⬅ return only the urls, ignore the remaining fields

            continue

        # === All other fields: default handling ===
        key = f"{block_name}.{k}"
        flat[key] = v

    return flat

def extract_data(url):
    """
    Returns a dict with the required fields of an IKEA product.
    + NEW: adds a 'categoryBreadcrumb' key of the form
      'Produkty/Tekstylia/Tekstylia do sypialni/Narzuty na łóżko'
      (taken from the JSON-LD BreadcrumbList).
    """
    try:
        response = requests.get(url, timeout=10,
                                headers={"User-Agent": "Mozilla/5.0"})
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # ── main JSON from data-hydration-props ─────────────────────
        target = soup.select_one(CSS_SELECTOR)
        if not target:
            return {"url": url, "error": "CSS selector not found"}

        raw = target.get("data-hydration-props")
        if not raw:
            return {"url": url, "error": "data-hydration-props not found"}

        decoded = html.unescape(raw)
        full_json = json.loads(decoded)
        result = {"url": url}

        # pull out the required blocks
        for block in BLOCKS:
            result.update(flatten_block(block, full_json.get(block, {})))

        # ── NEW: extract BreadcrumbList → categoryBreadcrumb ────────
        breadcrumb = None
        for tag in soup.find_all("script",
                                 attrs={"type": lambda t: t and "ld+json" in t}):
            try:
                data = json.loads(tag.string)
            except Exception:
                continue

            # if this is a JSON-LD array, look for the BreadcrumbList object in it
            if isinstance(data, list):
                data = next((d for d in data
                             if d.get("@type") == "BreadcrumbList"), None)

            if isinstance(data, dict) and data.get("@type") == "BreadcrumbList":
                items = data.get("itemListElement", [])
                names = [it.get("name", "") for it in items]
                breadcrumb = "/".join(names)
                break  # found the block we need, stop searching

        if breadcrumb:
            result["categoryBreadcrumb"] = breadcrumb

        return result

    except Exception as e:
        return {"url": url, "error": str(e)}

def main():
    # ── read the links ───────────────────────────────────────────
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        links = [line.strip() for line in f if line.strip()]

    rows = []

    # ---- COLUMN MODE --------------------------------------------
    # NEW: fixed list of columns (see KEEP_COLUMNS at the top)
    all_columns = KEEP_COLUMNS

    # OLD (restore if needed):
    # all_columns = set()  # ← used to accumulate every field
    # ------------------------------------------------------------

    print("🔍 Extracting data...")
    for idx, link in enumerate(links, 1):
        print(f"[{idx}/{len(links)}] {link}")
        row = extract_data(link)

        # NEW: keep only the columns listed in KEEP_COLUMNS
        row = {k: v for k, v in row.items() if k in KEEP_COLUMNS}

        # OLD (restore if needed):
        # all_columns.update(row.keys())  # ← used to collect every key

        rows.append(row)

    # OLD (restore if needed):
    # if isinstance(all_columns, set):
    #     all_columns = sorted(all_columns)  # used to sort everything

    def safe(val):
        """Converts dict / list to a JSON string; None → ''."""
        if isinstance(val, (dict, list)):
            return json.dumps(val, ensure_ascii=False)
        return "" if val is None else val

    print("📤 Saving Excel...")
    wb = Workbook()
    ws = wb.active
    ws.title = "IKEA Products"
    ws.append(all_columns)

    for row in rows:
        ws.append([safe(row.get(col, "")) for col in all_columns])

    wb.save(OUTPUT_FILE)
    print(f"\n✅ Done: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
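A minimal usage sketch, not part of the commit: assuming the dependencies requests, beautifulsoup4 and openpyxl are installed, the script is run directly and writes one row per URL from links.txt into result.xlsx. A single URL can also be checked from a Python session started inside Парсер_IKEA; the import of the main module below is only an illustration.

# Hypothetical interactive check; importing main does not run main()
# because of the __main__ guard at the bottom of the file.
from main import extract_data, KEEP_COLUMNS

row = extract_data(
    "https://www.ikea.com/pl/pl/p/indira-narzuta-zoltobezowy-20582629/#content"
)
# Mirror what main() does before writing Excel: keep only the fixed columns.
print({k: row[k] for k in KEEP_COLUMNS if k in row})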
BIN
Парсер_IKEA/result.xlsx
Normal file
Binary file not shown.
BIN
Парсер_IKEA/~$result.xlsx
Normal file
Binary file not shown.