NEXT
This commit is contained in:
parent
a2ea206e32
commit
b3c1ee2b69
3
.gitignore
vendored
3
.gitignore
vendored
@ -28,6 +28,7 @@ Temporary Items
|
||||
.apdisk
|
||||
|
||||
__pycache__
|
||||
/Parsing ZARAHOME/src/records_folder
|
||||
records_folder
|
||||
Ignore_Temp
|
||||
/Processing/Files-todo
|
||||
out
|
||||
2
Parser_NEXT/.env.example
Normal file
2
Parser_NEXT/.env.example
Normal file
@ -0,0 +1,2 @@
|
||||
# PROXY=http://user:pass@host:port
|
||||
# RATE_LIMIT=1.0
|
||||
17
Parser_NEXT/README.md
Normal file
17
Parser_NEXT/README.md
Normal file
@ -0,0 +1,17 @@
|
||||
# NEXT.pl Parser (Playwright, Python 3.12)
|
||||
|
||||
## Quick start
|
||||
```bash
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate # Windows: .venv\Scripts\activate
|
||||
pip install -r requirements.txt
|
||||
python -m playwright install chromium
|
||||
python main.py
|
||||
```
|
||||
|
||||
**categories.xlsx** — формат ввода:
|
||||
- Первая колонка (A) — ссылки на категории (без заголовка).
|
||||
- Любые другие колонки (B, C, …) игнорируются (можно писать пометки).
|
||||
- Пустые строки и ячейки не учитываются.
|
||||
|
||||
Outputs land in **records_folder/** as XLSX (+CSV/JSONL). Configure selectors/scroll in **config.yaml**.
|
||||
BIN
Parser_NEXT/categories.xlsx
Normal file
BIN
Parser_NEXT/categories.xlsx
Normal file
Binary file not shown.
53
Parser_NEXT/config.yaml
Normal file
53
Parser_NEXT/config.yaml
Normal file
@ -0,0 +1,53 @@
|
||||
base_url: "https://www.next.pl/en"
|
||||
locale: "en-GB"
|
||||
timezoneId: "Europe/Warsaw"
|
||||
|
||||
# На время отладки удобно видеть браузер:
|
||||
headless: false
|
||||
|
||||
nav_timeout_ms: 60000
|
||||
wait_timeout_ms: 30000
|
||||
retries: 3
|
||||
|
||||
# Рейт-лимит можно настраивать при масштабировании
|
||||
rate_limit_per_host_per_sec: 1.0
|
||||
|
||||
scroll:
|
||||
# Старые параметры (используются в резервном auto_scroll и для пауз)
|
||||
max_scrolls: 80
|
||||
pause_ms_between_scrolls_min: 300
|
||||
pause_ms_between_scrolls_max: 700
|
||||
stop_if_no_new_items_after: 8
|
||||
|
||||
# Новые параметры для auto_scroll_until_total
|
||||
hard_max_scrolls: 2500 # предохранитель на максимум скроллов
|
||||
wait_networkidle_timeout_ms: 8000 # ожидание networkidle после каждого скролла
|
||||
|
||||
selectors:
|
||||
# карточки товаров
|
||||
product_tile: '[data-testid="plp-product-grid-item"], [data-testid="product-tile"], .ProductCard, [data-qa="plp-product"]'
|
||||
product_link: 'a[href*="/style/"], a[href*="/p/"], a[data-testid="productLink"]'
|
||||
product_name: '[data-testid="product-name"], .productName, [itemprop="name"]'
|
||||
product_price: '[data-testid="price"], [itemprop="price"], .price'
|
||||
|
||||
# признак готовности
|
||||
grid_ready: 'script[id^="next-product-summary-script-"], [data-testid="plp-product-grid-item"], [data-testid="product-grid"], .plpGrid, [data-qa="plp-grid"]'
|
||||
|
||||
# счётчик общего количества в шапке (например "(434)")
|
||||
total_count: '#plp-seo-heading .esi-count, .esi-count'
|
||||
|
||||
xhr_patterns:
|
||||
- "/search"
|
||||
- "/api/search"
|
||||
- "/plp"
|
||||
- "/productsummary"
|
||||
|
||||
output:
|
||||
folder: "records_folder"
|
||||
excel_prefix: "next_dump"
|
||||
csv_also: true
|
||||
jsonl_also: true
|
||||
|
||||
debug:
|
||||
dump_always: false # true — чтобы писать дампы на каждом шаге
|
||||
|
||||
636
Parser_NEXT/fetcher.py
Normal file
636
Parser_NEXT/fetcher.py
Normal file
@ -0,0 +1,636 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
|
||||
|
||||
|
||||
# ---- Price parsing helpers ----
|
||||
_PLN_PRICE_RE = re.compile(
|
||||
r'(?<!\d)(\d{1,3}(?:[ \u00A0]?\d{3})*(?:[.,]\d{2})?)(?:\s*(?:zł|PLN))',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
def parse_pln_price_to_float(price_text: str | None) -> float | None:
|
||||
"""
|
||||
'1 299,00 zł' / '1299 zł' / '1 299 zł' -> 1299.00
|
||||
Возвращает None, если распарсить не удалось.
|
||||
"""
|
||||
if not price_text:
|
||||
return None
|
||||
t = (
|
||||
price_text
|
||||
.replace("\u00a0", " ") # NBSP
|
||||
.replace("\u2009", " ") # thin space
|
||||
.strip()
|
||||
)
|
||||
m = _PLN_PRICE_RE.search(t)
|
||||
if not m:
|
||||
return None
|
||||
num = m.group(1)
|
||||
num = num.replace(" ", "").replace("\u00a0", "").replace("\u2009", "")
|
||||
num = num.replace(",", ".")
|
||||
try:
|
||||
return float(num)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
class FetchError(Exception):
    """Raised when a category page cannot be loaded; triggers @retry."""
||||
class Fetcher:
|
||||
"""
|
||||
Browser layer: Playwright Chromium with anti-bot hygiene and robust debug dumps.
|
||||
- Blocks heavy resources (fonts/media/images), keeps stylesheets.
|
||||
- Waits for either SSR summary scripts or window.ssrClientSettings.
|
||||
- Two ways to read product summaries:
|
||||
1) window.ssrClientSettings.productSummary
|
||||
2) inline <script id="next-product-summary-script-..."> content (fallback)
|
||||
- Captures XHR JSON responses by patterns.
|
||||
- Dumps HTML/PNG with timestamps at key checkpoints and on failure.
|
||||
"""
|
||||
|
||||
def __init__(self, cfg: Dict[str, Any]):
|
||||
self.cfg = cfg
|
||||
self.base_url = cfg.get("base_url")
|
||||
self.xhr_patterns = [re.compile(p) for p in cfg.get("xhr_patterns", [])]
|
||||
self.collected_xhr: List[Dict[str, Any]] = []
|
||||
|
||||
    async def __aenter__(self):
        """Start Playwright, launch Chromium, and wire routing/listeners.

        Order matters: driver -> browser -> context -> page, then the
        route interceptor and response/console listeners. Returns self so
        the object can be used as `async with Fetcher(cfg) as f:`.
        """
        self.playwright = await async_playwright().start()
        # Flags commonly needed in containers/CI (small /dev/shm, no sandbox).
        args = ["--disable-dev-shm-usage", "--no-sandbox"]
        self.browser = await self.playwright.chromium.launch(
            headless=self.cfg.get("headless", True),
            args=args,
            # Auto-open devtools when running headed (debugging aid).
            devtools=not self.cfg.get("headless", True),
        )
        self.context = await self.browser.new_context(
            locale=self.cfg.get("locale", "en-GB"),
            timezone_id=self.cfg.get("timezoneId", "Europe/Warsaw"),
            # Fixed desktop-Chrome UA so the session looks like a normal browser.
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1366, "height": 900},
        )
        self.page = await self.context.new_page()

        # Block heavy resources; keep stylesheets.
        await self.context.route("**/*", self._route)

        # Listen to JSON XHRs for optional parsing.
        self.page.on("response", self._on_response)
        self.page.on("console", lambda msg: logging.debug(f"[page.console] {msg.type} {msg.text}"))
        return self
||||
    async def __aexit__(self, exc_type, exc, tb):
        """Tear down context, browser, and the Playwright driver, in order.

        Exceptions from the `async with` body are not suppressed
        (implicitly returns None).
        """
        await self.context.close()
        await self.browser.close()
        await self.playwright.stop()
||||
async def _route(self, route, request):
|
||||
"""
|
||||
Блокируем часть тяжёлых ресурсов.
|
||||
Для отладки с картинками убери 'image' из списка.
|
||||
"""
|
||||
if request.resource_type in ["font", "media", "image"]:
|
||||
return await route.abort()
|
||||
return await route.continue_()
|
||||
|
||||
def _on_response(self, response):
|
||||
try:
|
||||
url = response.url
|
||||
if any(p.search(url) for p in self.xhr_patterns):
|
||||
if "application/json" in (response.headers.get("content-type", "")):
|
||||
self.collected_xhr.append({"url": url, "response": response})
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
async def _dump_debug(self, tag: str):
|
||||
"""Save HTML and screenshot with timestamp; log absolute paths and CWD."""
|
||||
try:
|
||||
raw_dir = Path("out/raw_html").resolve()
|
||||
raw_dir.mkdir(parents=True, exist_ok=True)
|
||||
ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")
|
||||
html_path = raw_dir / f"{ts}_{tag}.html"
|
||||
png_path = raw_dir / f"{ts}_{tag}.png"
|
||||
|
||||
cwd = Path(os.getcwd()).resolve()
|
||||
logging.info(f"[dump_debug] CWD={cwd} → html={html_path} png={png_path}")
|
||||
|
||||
try:
|
||||
html = await self.page.content()
|
||||
html_path.write_text(html, encoding="utf-8")
|
||||
except Exception as e:
|
||||
logging.warning(f"[dump_debug] writing HTML failed: {e}")
|
||||
|
||||
try:
|
||||
await self.page.screenshot(path=str(png_path), full_page=True)
|
||||
except Exception as e:
|
||||
logging.warning(f"[dump_debug] screenshot failed: {e}")
|
||||
|
||||
logging.info(f"[dump_debug] saved OK: {html_path.name}, {png_path.name}")
|
||||
except Exception as e:
|
||||
logging.warning(f"[dump_debug] general fail: {e}")
|
||||
|
||||
async def _accept_cookies_if_any(self):
|
||||
selectors = [
|
||||
"#onetrust-accept-btn-handler",
|
||||
"button#onetrust-accept-btn-handler",
|
||||
'button:has-text("Accept all")',
|
||||
'button:has-text("Accept All")',
|
||||
'button[aria-label*="Accept"]',
|
||||
]
|
||||
for sel in selectors:
|
||||
try:
|
||||
el = self.page.locator(sel)
|
||||
if await el.count() > 0:
|
||||
await el.first.click(timeout=2000)
|
||||
logging.info("Cookie banner accepted.")
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
    async def _log_plp_state(self, stage: str):
        """Log counts of SSR scripts and presence of window.ssrClientSettings."""
        # -1 signals "could not even query the DOM" (e.g. mid-navigation).
        try:
            scripts_count = await self.page.locator('script[id^="next-product-summary-script-"]').count()
        except Exception:
            scripts_count = -1
        try:
            # True only when productSummary.itemNumbers is a non-empty array.
            has_window = await self.page.evaluate("""() => {
                const ps = globalThis?.ssrClientSettings?.productSummary;
                return !!(ps && Array.isArray(ps.itemNumbers) && ps.itemNumbers.length > 0);
            }""")
        except Exception:
            has_window = False
        logging.info(f"[{stage}] scripts: {scripts_count}, window.ps: {has_window}")
||||
    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=8),
        retry=retry_if_exception_type(FetchError),
    )
    async def load_category(self, url: str):
        """
        Navigation + robust readiness:
          1) domcontentloaded
          2) accept cookies
          3) warm-up scroll
          4) wait for <script id^="next-product-summary-script-"> (attached)
          5) attempt window.ssrClientSettings (non-fatal)
        Dumps at key checkpoints and on failure.

        Raises FetchError on any hard failure; the @retry decorator then
        re-attempts up to 3 times with exponential backoff.
        """
        try:
            await self.page.goto(
                url,
                timeout=self.cfg.get("nav_timeout_ms", 60000),
                wait_until="domcontentloaded",
            )
            await self._dump_debug("after_goto")

            await self._accept_cookies_if_any()
            await self._dump_debug("after_cookies")
            await self._log_plp_state("after_accept")

            # Warm-up scroll to trigger lazy scripts / observers.
            for _ in range(3):
                await self.page.mouse.wheel(0, 1600)
                await self.page.wait_for_timeout(300)
            await self._dump_debug("after_warmup")
            await self._log_plp_state("after_warmup")

            # Wait for the SSR product-summary script tags to be attached.
            await self.page.wait_for_selector(
                'script[id^="next-product-summary-script-"]',
                state="attached",
                timeout=self.cfg.get("wait_timeout_ms", 30000),
            )
            await self._dump_debug("after_scripts_present")

            # Optional window readiness — nice-to-have, not required.
            try:
                await self.page.wait_for_function(
                    """
                    () => {
                        const ps = globalThis?.ssrClientSettings?.productSummary;
                        return !!(ps && Array.isArray(ps.itemNumbers) && ps.itemNumbers.length > 0);
                    }
                    """,
                    timeout=5000,
                )
            except Exception:
                # Non-fatal: SSR scripts suffice; read_ssr_product_summaries
                # has a script-parsing fallback for this case.
                logging.info("window.ssrClientSettings not ready (non-fatal).")

            await self._dump_debug("after_window_check")
            return True

        except Exception as e:
            logging.error(f"load_category failed: {e}")
            await self._dump_debug("fail_load_category")
            # Wrap so the @retry decorator (FetchError-only) re-attempts.
            raise FetchError(str(e))
||||
# ---------- NEW: read total count and scroll until target ----------
|
||||
|
||||
async def read_total_from_header(self) -> Optional[int]:
|
||||
"""
|
||||
Tries to read category total from the header count like '(434)'.
|
||||
Looks in '#plp-seo-heading .esi-count' or any '.esi-count' fallback.
|
||||
"""
|
||||
sels = ["#plp-seo-heading .esi-count", ".esi-count"]
|
||||
for sel in sels:
|
||||
try:
|
||||
el = self.page.locator(sel)
|
||||
if await el.count() > 0:
|
||||
txt = await el.first.inner_text(timeout=1500)
|
||||
digits = "".join(ch for ch in txt if ch.isdigit())
|
||||
if digits:
|
||||
total = int(digits)
|
||||
logging.info(f"Total from header: {total}")
|
||||
return total
|
||||
except Exception:
|
||||
continue
|
||||
logging.info("Total from header: not found")
|
||||
return None
|
||||
|
||||
    async def auto_scroll_until_total(self, hard_max_scrolls: Optional[int] = None):
        """
        Scrolls until we reach target total (from header), with a hard cap.
        Uses networkidle + a small jiggle to retrigger lazy loading.

        Args:
            hard_max_scrolls: override for the scroll cap; falls back to
                scroll.hard_max_scrolls in config (default 2000).
        """
        hard_cap = hard_max_scrolls or self.cfg.get("scroll", {}).get("hard_max_scrolls", 2000)
        netidle_ms = self.cfg.get("scroll", {}).get("wait_networkidle_timeout_ms", 8000)
        # Combined product tile selector
        sel_tiles = '[data-testid="plp-product-grid-item"], [data-testid="product-tile"], .ProductCard, [data-qa="plp-product"]'

        target = await self.read_total_from_header()  # None when header has no count
        last = 0
        same_ticks = 0  # consecutive iterations without tile growth
        same_limit = self.cfg.get("scroll", {}).get("stop_if_no_new_items_after", 8)

        for i in range(hard_cap):
            # Scroll to bottom
            try:
                await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            except Exception:
                pass

            # Wait for network idle
            try:
                await self.page.wait_for_load_state("networkidle", timeout=netidle_ms)
            except Exception:
                # not fatal
                await asyncio.sleep(0.25)

            # Jiggle to retrigger observers
            try:
                await self.page.mouse.wheel(0, -200)
                await asyncio.sleep(0.1)
                await self.page.mouse.wheel(0, 1200)
            except Exception:
                pass

            # On locator failure keep the previous count so we don't reset progress.
            try:
                seen = await self.page.locator(sel_tiles).count()
            except Exception:
                seen = last

            if target and seen >= target:
                logging.info(f"Reached target: seen {seen}/{target} (i={i})")
                break

            if seen <= last:
                same_ticks += 1
                if same_ticks >= same_limit:
                    logging.info(f"No growth for a while: seen={seen}, i={i}")
                    break
            else:
                same_ticks = 0
            last = seen

        logging.info(f"Final seen items: {last} (target={target}, cap={hard_cap})")
||||
    # ---------- existing helpers ----------

    async def current_html(self) -> str:
        """Return the page's full current HTML (post-JS DOM serialization)."""
        return await self.page.content()
||||
async def extract_xhr_json(self) -> List[Dict[str, Any]]:
|
||||
results = []
|
||||
for entry in self.collected_xhr:
|
||||
try:
|
||||
body = await entry["response"].json()
|
||||
results.append({"url": entry["url"], "json": body})
|
||||
except Exception:
|
||||
pass
|
||||
return results
|
||||
|
||||
async def read_ssr_product_summaries(self) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Returns simplified product summaries.
|
||||
Path 1: window.ssrClientSettings.productSummary
|
||||
Path 2: parse inline <script id="next-product-summary-script-..."> blocks
|
||||
"""
|
||||
# Path 1 — from window
|
||||
js_window = """
|
||||
() => {
|
||||
const out = [];
|
||||
const ps = globalThis?.ssrClientSettings?.productSummary;
|
||||
if (!ps) return out;
|
||||
const ids = Array.isArray(ps.itemNumbers) ? ps.itemNumbers : [];
|
||||
for (const id of ids) {
|
||||
const obj = ps[id];
|
||||
if (!obj) continue;
|
||||
const sd = obj?._STATE_?.productSummary?.summaryData;
|
||||
if (!sd) continue;
|
||||
const cw = Array.isArray(sd.colourways) && sd.colourways.length ? sd.colourways[0] : null;
|
||||
out.push({
|
||||
id: sd.id || null,
|
||||
title: sd.title || null,
|
||||
baseUrl: sd.baseUrl || null,
|
||||
brand: sd.brand || null,
|
||||
category: sd.category || null,
|
||||
currencyCode: sd.currencyCode || null,
|
||||
colourway: cw ? {
|
||||
id: cw.id ?? null,
|
||||
url: cw.url ?? null,
|
||||
color: cw.c ?? null,
|
||||
title: cw.t ?? null,
|
||||
price: cw.p ?? null,
|
||||
priceMarket: cw.mp ?? null,
|
||||
selected: !!cw.s
|
||||
} : null,
|
||||
imageCdnUrl: sd.imageCdnUrl || null,
|
||||
productImageUrlPart: sd.productImageUrlPart || null,
|
||||
lgImagePath: sd.lgImagePath || null
|
||||
});
|
||||
}
|
||||
return out;
|
||||
}
|
||||
"""
|
||||
try:
|
||||
w = await self.page.evaluate(js_window)
|
||||
if isinstance(w, list) and w:
|
||||
logging.info(f"SSR(window) summaries: {len(w)}")
|
||||
return w
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Path 2 — parse inline scripts
|
||||
js_scripts = """
|
||||
() => {
|
||||
const list = Array.from(document.querySelectorAll('script[id^="next-product-summary-script-"]'));
|
||||
return list.map(s => s.textContent || "");
|
||||
}
|
||||
"""
|
||||
try:
|
||||
texts = await self.page.evaluate(js_scripts)
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
out: List[Dict[str, Any]] = []
|
||||
# productSummary["ID"] = { ... } OR productSummary['ID'] = { ... }
|
||||
assign_re = re.compile(r'productSummary\s*\[\s*([\'"])(.*?)\1\s*\]\s*=\s*\{')
|
||||
for t in texts or []:
|
||||
for m in assign_re.finditer(t):
|
||||
start = m.end() - 1 # at '{'
|
||||
depth = 0
|
||||
end = None
|
||||
for i in range(start, len(t)):
|
||||
ch = t[i]
|
||||
if ch == "{":
|
||||
depth += 1
|
||||
elif ch == "}":
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
end = i + 1
|
||||
break
|
||||
if end is None:
|
||||
continue
|
||||
block = t[start:end]
|
||||
try:
|
||||
data = json.loads(block)
|
||||
sd = (
|
||||
data.get("_STATE_", {})
|
||||
.get("productSummary", {})
|
||||
.get("summaryData", {})
|
||||
)
|
||||
cws = sd.get("colourways") or []
|
||||
cw = cws[0] if cws else None
|
||||
out.append(
|
||||
{
|
||||
"id": sd.get("id"),
|
||||
"title": sd.get("title"),
|
||||
"baseUrl": sd.get("baseUrl"),
|
||||
"brand": sd.get("brand"),
|
||||
"category": sd.get("category"),
|
||||
"currencyCode": sd.get("currencyCode"),
|
||||
"colourway": {
|
||||
"id": cw.get("id"),
|
||||
"url": cw.get("url"),
|
||||
"color": cw.get("c"),
|
||||
"title": cw.get("t"),
|
||||
"price": cw.get("p"),
|
||||
"priceMarket": cw.get("mp"),
|
||||
"selected": bool(cw.get("s")),
|
||||
} if cw else None,
|
||||
"imageCdnUrl": sd.get("imageCdnUrl"),
|
||||
"productImageUrlPart": sd.get("productImageUrlPart"),
|
||||
"lgImagePath": sd.get("lgImagePath"),
|
||||
}
|
||||
)
|
||||
except Exception:
|
||||
continue
|
||||
    async def read_dom_products(self) -> List[Dict[str, Any]]:
        """Parse product cards out of the live DOM (after scrolling).

        Covers several variants of the Next PLP markup (data-testid grid
        items, legacy .ProductCard, data-qa tiles). Returns dicts with
        id/title/url/price_text/currency, de-duplicated by id|url.
        """
        # The JS below runs inside the page. It is a runtime string, so its
        # contents (including its inline comments) are left untouched.
        js = r"""
        () => {
          const out = [];
          const gridItems = document.querySelectorAll('[data-testid="plp-product-grid-item"], .ProductCard, [data-qa="plp-product"]');

          const getPid = (container) => {
            // Вариант 1: data-pid на entrypoint
            const entry = container.querySelector('[id^="plp-product-summary-entrypoint-"]');
            if (entry && entry.getAttribute('data-pid')) return entry.getAttribute('data-pid');
            // Вариант 2: id="plp-product-summary-tile-<ID>"
            const tile = container.closest('[id^="plp-product-summary-tile-"]') || container.querySelector('[id^="plp-product-summary-tile-"]');
            if (tile) {
              const m = (tile.id || '').match(/plp-product-summary-tile-([A-Za-z0-9]+)/);
              if (m) return m[1];
            }
            // Вариант 3: вытащим из href вида .../<ID>#<ID> или .../T43162
            const a = container.querySelector('a[href*="/style/"], a[data-testid^="product_summary_tile_"], a[href*="/p/"]');
            if (a) {
              const href = a.getAttribute('href') || '';
              const m2 = href.match(/([A-Z]\d{4,})/i);
              if (m2) return m2[1].toUpperCase();
            }
            return null;
          };

          const getAbsUrl = (href) => {
            try {
              if (!href) return null;
              if (/^https?:\/\//i.test(href)) return href;
              const a = document.createElement('a');
              a.href = href;
              return a.href;
            } catch { return href || null; }
          };

          const getTitle = (container) => {
            const t1 = container.querySelector('[data-testid="product_summary_title"]');
            if (t1) return (t1.getAttribute('data-label') || t1.textContent || '').trim();
            const t2 = container.querySelector('[data-testid="product-name"], .productName, [itemprop="name"]');
            if (t2) return (t2.textContent || '').trim();
            return null;
          };

          const getPriceText = (container) => {
            // охватим несколько вариантов
            const priceRoots = [
              container.querySelector('[data-testid="price"]'),
              container.querySelector('[data-testid="ProductCard-Price"]'),
              container.querySelector('[itemprop="price"]'),
              container.querySelector('[aria-label*="price" i]'),
              container
            ].filter(Boolean);

            for (const root of priceRoots) {
              const spans = root.querySelectorAll('span, div');
              for (const el of spans) {
                const t = (el.textContent || '').trim();
                if (!t) continue;
                if (/\d/.test(t) && (t.includes('zł') || /PLN/i.test(t))) {
                  return t;
                }
              }
            }
            return null;
          };

          gridItems.forEach(container => {
            // Основная ссылка
            const link = container.querySelector('a[href*="/style/"], a[data-testid^="product_summary_tile_"], a[href*="/p/"]');
            const href = link ? link.getAttribute('href') : null;

            const rec = {
              id: getPid(container),
              title: getTitle(container),
              url: getAbsUrl(href),
              price_text: getPriceText(container),
              currency: null
            };

            if (rec.price_text) {
              if (rec.price_text.includes('zł') || /PLN/i.test(rec.price_text)) rec.currency = 'PLN';
            }

            // фильтруем пустые карточки без ссылки и заголовка
            if (rec.url || rec.title) out.push(rec);
          });

          // Удаляем дубли по id|url
          const seen = new Set();
          const uniq = [];
          for (const d of out) {
            const key = `${d.id || ''}|${d.url || ''}`;
            if (seen.has(key)) continue;
            seen.add(key);
            uniq.push(d);
          }
          return uniq;
        }
        """
        try:
            data = await self.page.evaluate(js)
            logging.info(f"DOM cards parsed: {len(data)}")
            return data
        except Exception as e:
            logging.warning(f"read_dom_products failed: {e}")
            return []
||||
async def collect_products(self) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Унифицированный сбор: SSR (если есть) + DOM.
|
||||
Нормализуем к: id, title, url, price(float|None), currency('PLN'|...).
|
||||
"""
|
||||
ssr = await self.read_ssr_product_summaries() or []
|
||||
dom = await self.read_dom_products() or []
|
||||
|
||||
bykey: Dict[str, Dict[str, Any]] = {}
|
||||
|
||||
def key(d: Dict[str, Any]) -> str:
|
||||
return f"{(d.get('id') or '')}|{(d.get('url') or '')}"
|
||||
|
||||
# 1) Скелет из DOM
|
||||
for d in dom:
|
||||
bykey[key(d)] = {
|
||||
"id": d.get("id"),
|
||||
"title": d.get("title"),
|
||||
"url": d.get("url"),
|
||||
"price_text": d.get("price_text"),
|
||||
"currency": d.get("currency"),
|
||||
}
|
||||
|
||||
# 2) Обогащаем из SSR (если есть)
|
||||
for s in ssr:
|
||||
cw = (s.get("colourway") or {})
|
||||
# собрать абсолютный URL
|
||||
url = None
|
||||
try:
|
||||
base = (s.get("baseUrl") or "").rstrip("/")
|
||||
rel = (cw.get("url") or "").lstrip("/")
|
||||
url = f"{base}/{rel}" if (base and rel) else None
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
cand = {"id": s.get("id"), "url": url}
|
||||
k = key(cand)
|
||||
rec = bykey.get(k)
|
||||
if rec is None:
|
||||
bykey[k] = {
|
||||
"id": s.get("id"),
|
||||
"title": s.get("title"),
|
||||
"url": url,
|
||||
"price_text": cw.get("price"),
|
||||
"currency": s.get("currencyCode"),
|
||||
}
|
||||
else:
|
||||
if not rec.get("title") and s.get("title"):
|
||||
rec["title"] = s["title"]
|
||||
if not rec.get("price_text") and cw.get("price"):
|
||||
rec["price_text"] = cw["price"]
|
||||
if not rec.get("currency") and s.get("currencyCode"):
|
||||
rec["currency"] = s["currencyCode"]
|
||||
|
||||
# 3) Финальная нормализация цены
|
||||
out: List[Dict[str, Any]] = []
|
||||
for v in bykey.values():
|
||||
price_val = parse_pln_price_to_float(v.get("price_text"))
|
||||
currency = v.get("currency")
|
||||
if not currency and (v.get("price_text") or "").lower().find("zł") != -1:
|
||||
currency = "PLN"
|
||||
out.append({
|
||||
"id": v.get("id"),
|
||||
"title": v.get("title"),
|
||||
"url": v.get("url"),
|
||||
"price": price_val, # float или None
|
||||
"currency": currency or "PLN"
|
||||
})
|
||||
|
||||
logging.info(f"Total collected (SSR+DOM): {len(out)}")
|
||||
return out
|
||||
|
||||
193
Parser_NEXT/main.py
Normal file
193
Parser_NEXT/main.py
Normal file
@ -0,0 +1,193 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
from datetime import timedelta
|
||||
import pandas as pd
|
||||
import yaml
|
||||
|
||||
from fetcher import Fetcher, FetchError
|
||||
from sink import write_outputs
|
||||
from models import Product
|
||||
|
||||
|
||||
# ---------- config / logging ----------

def setup_logging():
    """Configure root logging to both out/logs/run.log and the console."""
    Path("out/logs").mkdir(parents=True, exist_ok=True)
    handlers = [
        logging.FileHandler("out/logs/run.log", encoding="utf-8"),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        handlers=handlers,
    )
||||
def load_config() -> dict:
    """Load config.yaml (UTF-8) from the current working directory."""
    with open("config.yaml", "r", encoding="utf-8") as fh:
        return yaml.safe_load(fh)
|
||||
|
||||
# ---------- load category links from column A ----------

def load_categories() -> List[Tuple[str, str]]:
    """Read categories.xlsx (no header row) and return (name, url) pairs.

    Only column A is used; it holds category URLs. Other columns, blank
    cells, and non-URL values are ignored. The category name is derived
    from the last path segment of each URL. When the file does not exist,
    a single demo category is returned.
    """
    from urllib.parse import urlparse
    xlsx = Path("categories.xlsx")
    if not xlsx.exists():
        # Demo fallback while the input file has not been created yet.
        return [
            ("bathroom-accessories", "https://www.next.pl/en/shop/home/bathroom/bathroom-accessories"),
        ]

    frame = pd.read_excel(xlsx, header=None)
    if frame.shape[1] == 0:
        return []

    links: List[str] = []
    for cell in frame.iloc[:, 0].tolist():
        if isinstance(cell, str):
            candidate = cell.strip()
        elif pd.notna(cell):
            candidate = str(cell).strip()
        else:
            continue
        if candidate and candidate.lower().startswith(("http://", "https://")):
            links.append(candidate)

    def name_from_url(link: str) -> str:
        parsed = urlparse(link)
        segments = [part for part in parsed.path.split("/") if part]
        return segments[-1] if segments else parsed.netloc

    return [(name_from_url(link), link) for link in links]
|
||||
|
||||
# ---------- adapter: dict -> Product ----------

def normalize_to_models(collected: List[dict]) -> List[Product]:
    """Convert collected raw dicts into Product models.

    Prices arrive as float|None and are rendered as '123.45' strings;
    currency defaults to 'PLN' and is upper-cased. Images, color, and
    size variants are not populated at this stage.
    """
    products: List[Product] = []
    for item in collected:
        raw_price = item.get("price")  # float | None
        formatted_price = None
        if raw_price is not None:
            try:
                formatted_price = f"{float(raw_price):.2f}"
            except Exception:
                formatted_price = None

        raw_id = item.get("id")
        raw_url = item.get("url")
        products.append(Product(
            product_id=str(raw_id) if raw_id is not None else None,
            url=str(raw_url) if raw_url else None,
            name=item.get("title"),
            price=formatted_price,
            currency=(item.get("currency") or "PLN").upper(),
            image_urls=[],
            color=None,
            size_variants=[]
        ))
    return products
||||
# ---------- main scenario ----------

async def run_category(fetcher: Fetcher, cfg: dict, name: str, url: str):
    """Scrape a single category and write its per-category outputs.

    Returns the list of row dicts that were written (for master-file
    aggregation), or None when the category failed.

    NOTE(review): this duplicates the inline loop in main_async(); keep
    the two in sync (or call this helper from there).
    """
    logging.info(f"Category start: {name} — {url}")
    try:
        await fetcher.load_category(url)
        # Scroll until the full count (read from the "(N)" header) is loaded.
        await fetcher.auto_scroll_until_total()

        # Collect products (SSR + DOM).
        collected = await fetcher.collect_products()
        products = normalize_to_models(collected)

        # Persist as xlsx/csv/jsonl.
        # FIX: write_outputs returns (path, count, rows) — main_async()
        # unpacks three values; the old two-value unpack here raised
        # ValueError as soon as this helper was called.
        path, n, rows = write_outputs(
            category_name=name,
            category_url=url,
            products=products,
            out_folder=cfg["output"]["folder"],
            excel_prefix=cfg["output"]["excel_prefix"],
            csv_also=cfg["output"].get("csv_also", True),
            jsonl_also=cfg["output"].get("jsonl_also", True),
        )
        logging.info(f"✔ {name}: {n} товаров → {path}")
        return rows

    except FetchError as e:
        logging.error(f"Category failed: {name} — {e}")
    except Exception as e:
        logging.exception(f"Category crashed: {name} — {e}")
    return None
||||
|
||||
async def main_async():
    """Entry coroutine: scrape every category, then write one master XLSX."""
    setup_logging()
    cfg = load_config()
    categories = load_categories()
    if not categories:
        logging.warning("categories.xlsx пуст — добавьте ссылки в первую колонку (без заголовков).")
        return

    # Accumulator for the combined master workbook.
    master_rows: List[dict] = []

    # Master file name: all_YYYYMMDD_HHMMSS_UTC+3.xlsx
    # NOTE(review): the UTC+3 offset is hard-coded via timedelta; a zoneinfo
    # timezone would follow DST rules — confirm which zone is intended.
    now_utc = pd.Timestamp.utcnow().to_pydatetime()
    ts_utc_plus3 = (now_utc + timedelta(hours=3)).strftime("%Y%m%d_%H%M%S")
    all_filename = f"all_{ts_utc_plus3}_UTC+3.xlsx"
    all_path = str(Path(cfg["output"]["folder"]) / all_filename)

    async with Fetcher(cfg) as fetcher:
        for name, url in categories:
            # Per-category run (duplicates run_category(); keep them in sync).
            try:
                logging.info(f"Category start: {name} — {url}")
                await fetcher.load_category(url)
                await fetcher.auto_scroll_until_total()

                collected = await fetcher.collect_products()
                products = normalize_to_models(collected)

                # Per-category outputs (xlsx/csv/jsonl).
                path, n, rows = write_outputs(
                    category_name=name,
                    category_url=url,
                    products=products,
                    out_folder=cfg["output"]["folder"],
                    excel_prefix=cfg["output"]["excel_prefix"],
                    csv_also=cfg["output"].get("csv_also", True),
                    jsonl_also=cfg["output"].get("jsonl_also", True),
                )
                logging.info(f"✔ {name}: {n} товаров → {path}")

                # Accumulate rows for the combined file.
                master_rows.extend(rows)

            except FetchError as e:
                logging.error(f"Category failed: {name} — {e}")
            except Exception as e:
                logging.exception(f"Category crashed: {name} — {e}")

    # After all categories complete — write the combined XLSX.
    from sink import write_master_excel
    all_written_path, total = write_master_excel(all_path, master_rows)
    logging.info(f"◎ ALL: {total} товаров → {all_written_path}")
||||
|
||||
def main():
    """Synchronous entry point: drive the async scraper to completion."""
    asyncio.run(main_async())


if __name__ == "__main__":
    main()
23
Parser_NEXT/models.py
Normal file
23
Parser_NEXT/models.py
Normal file
@ -0,0 +1,23 @@
|
||||
from pydantic import BaseModel, Field, HttpUrl
|
||||
from typing import Optional, List
|
||||
|
||||
class Product(BaseModel):
    """A single scraped product in normalized form."""

    # Retailer item code kept as a string (preserves any leading zeros).
    product_id: Optional[str] = Field(default=None)
    url: Optional[HttpUrl] = None
    name: Optional[str] = None
    # Price is a pre-formatted string like '123.45' (see normalize_to_models).
    price: Optional[str] = None
    currency: Optional[str] = None
    # Mutable defaults declared via default_factory — the explicit,
    # documented pydantic idiom (a bare [] relies on pydantic copying it).
    image_urls: List[str] = Field(default_factory=list)
    color: Optional[str] = None
    size_variants: List[str] = Field(default_factory=list)
||||
class RowOut(BaseModel):
    """One flat output row (per product) written to XLSX/CSV/JSONL."""

    category_name: str
    category_url: str
    product_id: Optional[str]
    url: Optional[str]
    name: Optional[str]
    price: Optional[str]
    currency: Optional[str]
    color: Optional[str]
    images_joined: Optional[str]  # image URLs joined with newlines, or None
115
Parser_NEXT/parser.py
Normal file
115
Parser_NEXT/parser.py
Normal file
@ -0,0 +1,115 @@
|
||||
from urllib.parse import urljoin
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import List, Dict, Any
|
||||
from models import Product
|
||||
|
||||
def parse_products_from_ssr(summaries: List[Dict[str, Any]]) -> List[Product]:
    """Build Product objects from NEXT's server-side-rendered summary list.

    Each summary may carry a "colourway" sub-dict with the relative URL,
    price and colour. Non-dict entries are skipped entirely — previously
    only the `colourway` access was guarded with isinstance, so a non-dict
    entry crashed one line later on `s.get("baseUrl")`.
    """
    out: List[Product] = []
    for s in summaries or []:
        if not isinstance(s, dict):
            # FIX: skip malformed entries instead of raising AttributeError
            continue
        cw = s.get("colourway") or {}
        base = s.get("baseUrl") or ""
        rel = cw.get("url") or ""
        url = urljoin(base + "/", rel) if rel else (base or None)

        name = s.get("title") or cw.get("title") or None
        price = cw.get("price") or cw.get("priceMarket") or None
        color = cw.get("color") or None
        currency = s.get("currencyCode") or None

        out.append(Product(
            product_id=s.get("id") or None,
            url=url,
            name=name,
            price=str(price) if price is not None else None,
            currency=currency,
            image_urls=[],    # images are built later from imageCdnUrl + productImageUrlPart
            color=color,
            size_variants=[]  # usually empty for homeware; fashion sizes added later
        ))
    return out
def parse_products_from_dom(html: str, cfg: Dict[str, Any]) -> List[Product]:
    """Fallback extractor: scrape product tiles straight from rendered HTML.

    CSS selector names come from cfg["selectors"]; tiles that fail to parse
    are skipped rather than aborting the whole page.
    """
    selectors = cfg["selectors"]
    soup = BeautifulSoup(html, "lxml")
    products: List[Product] = []

    for tile in soup.select(selectors["product_tile"]):
        try:
            link = tile.select_one(selectors["product_link"])
            href = link.get("href") if link else None
            if href and href.startswith("/"):
                href = cfg.get("base_url", "").rstrip("/") + href

            name_node = tile.select_one(selectors["product_name"])
            price_node = tile.select_one(selectors["product_price"])

            products.append(Product(
                product_id=tile.get("data-style-id") or tile.get("data-product-id") or None,
                url=href,
                name=name_node.get_text(strip=True) if name_node else None,
                price=price_node.get_text(strip=True) if price_node else None,
                currency=None,
                image_urls=[],
                color=None,
                size_variants=[]
            ))
        except Exception:
            # malformed tile: skip it, keep the rest of the page
            continue
    return products
def parse_products_from_xhr(xhrs: List[Dict[str, Any]]) -> List[Product]:
    """Extract products from captured XHR/JSON responses.

    The payload schema varies between endpoints, so this tries the common
    container keys ("products", "items", "results", "hits") and several
    per-item field spellings.
    """
    out: List[Product] = []
    for item in xhrs:
        j = item.get("json") or {}
        candidates: List[Any] = []
        if isinstance(j, dict):
            for key in ["products", "items", "results", "hits"]:
                if isinstance(j.get(key), list):
                    candidates = j[key]
                    break
        if not candidates and isinstance(j, list):
            candidates = j

        for p in candidates:
            pid = str(p.get("id") or p.get("productId") or p.get("styleId") or "") or None
            url = p.get("url") or p.get("link") or None
            name = p.get("name") or p.get("productName") or None

            price = None
            currency = None
            for k in ["price", "currentPrice", "sellingPrice"]:
                v = p.get(k)
                if isinstance(v, (int, float, str)):
                    price = str(v)
                    break
                if isinstance(v, dict):
                    # FIX: previously an empty price dict produced price == ""
                    # and the loop kept running, so a later key could overwrite
                    # a value already extracted from a dict.
                    currency = v.get("currency") or currency
                    extracted = v.get("value") or v.get("amount")
                    if extracted is not None:
                        price = str(extracted)
                        break

            images: List[str] = []
            for k in ["images", "imageList", "media"]:
                v = p.get(k)
                if isinstance(v, list):
                    for it in v:
                        if isinstance(it, str):
                            images.append(it)
                        elif isinstance(it, dict):
                            for kk in ["url", "src", "href"]:
                                if it.get(kk):
                                    images.append(it[kk])

            out.append(Product(
                product_id=pid,
                url=url,
                name=name,
                price=price,
                currency=currency,
                image_urls=images,
                color=p.get("color") or None,
                size_variants=[s for s in p.get("sizes", []) if isinstance(s, str)]
            ))
    return out
8
Parser_NEXT/requirements.txt
Normal file
8
Parser_NEXT/requirements.txt
Normal file
@ -0,0 +1,8 @@
|
||||
playwright==1.46.0
|
||||
pandas==2.2.2
|
||||
openpyxl==3.1.5
|
||||
pydantic==2.8.2
|
||||
pyyaml==6.0.2
|
||||
tenacity==8.3.0
|
||||
beautifulsoup4==4.12.3
|
||||
lxml==5.2.1
|
||||
108
Parser_NEXT/sink.py
Normal file
108
Parser_NEXT/sink.py
Normal file
@ -0,0 +1,108 @@
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any
|
||||
from models import Product, RowOut
|
||||
import hashlib, json, datetime
|
||||
import re
|
||||
|
||||
# ---- Price parsing helpers ----
|
||||
_PLN_PRICE_RE = re.compile(
|
||||
r'(?<!\d)(\d{1,3}(?:[ \u00A0]?\d{3})*(?:[.,]\d{2})?)(?:\s*(?:zł|PLN))',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
def parse_pln_price_to_float(price_text: str | None) -> float | None:
|
||||
"""
|
||||
Из строки вида '1 299,00 zł' / '1299 zł' / '1 299 zł' достаём float 1299.00.
|
||||
Возвращает None, если распарсить не удалось.
|
||||
"""
|
||||
if not price_text:
|
||||
return None
|
||||
t = (
|
||||
price_text.replace("\u00a0", " ") # NBSP
|
||||
.replace("\u2009", " ") # thin space
|
||||
.strip()
|
||||
)
|
||||
m = _PLN_PRICE_RE.search(t)
|
||||
if not m:
|
||||
return None
|
||||
num = m.group(1)
|
||||
num = num.replace(" ", "").replace("\u00a0", "").replace("\u2009", "")
|
||||
num = num.replace(",", ".")
|
||||
try:
|
||||
return float(num)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _as_str(v):
|
||||
return str(v) if v is not None else ""
|
||||
|
||||
def _key_from_fields(product_id: str | None, url: str | None) -> str:
    """Stable dedup key: md5 of 'product_id|url' (None parts become '')."""
    left = "" if product_id is None else str(product_id)
    right = "" if url is None else str(url)
    return hashlib.md5(f"{left}|{right}".encode("utf-8")).hexdigest()
def _key(p: Product) -> str:
    """Dedup key for a Product, derived from its id and url."""
    url_text = _as_str(p.url)
    return _key_from_fields(p.product_id, url_text)
def build_rows(category_name: str, category_url: str, products: List[Product]) -> List[Dict[str, Any]]:
    """Convert products to RowOut dicts, dropping duplicates by (id, url)."""
    seen_keys: set[str] = set()
    result: List[Dict[str, Any]] = []
    for prod in products:
        key = _key(prod)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        row = RowOut(
            category_name=category_name,
            category_url=category_url,
            product_id=_as_str(prod.product_id) or None,
            url=_as_str(prod.url) or None,
            name=prod.name,
            price=prod.price,
            currency=prod.currency,
            color=prod.color,
            images_joined="\n".join(prod.image_urls) if prod.image_urls else None,
        )
        result.append(row.model_dump())
    return result
def write_outputs(category_name: str, category_url: str, products: List[Product], out_folder: str, excel_prefix: str, csv_also: bool, jsonl_also: bool):
    """Write per-category files (xlsx, plus optional csv/jsonl).

    Returns (excel_path, row_count, rows).
    """
    folder = Path(out_folder)
    folder.mkdir(parents=True, exist_ok=True)
    rows = build_rows(category_name, category_url, products)

    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    stem = f"{excel_prefix}_{stamp}"
    excel_path = folder / f"{stem}.xlsx"

    frame = pd.DataFrame(rows)
    with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
        frame.to_excel(writer, sheet_name="Products", index=False)

    if csv_also:
        frame.to_csv(folder / f"{stem}.csv", index=False)

    if jsonl_also:
        with open(folder / f"{stem}.jsonl", "w", encoding="utf-8") as fh:
            fh.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in rows)

    return str(excel_path), len(rows), rows
def write_master_excel(all_path: str, rows: List[Dict[str, Any]]):
    """Write the combined XLSX (single 'AllProducts' sheet), overwriting it once.

    Rows are deduplicated by (product_id, url). Returns (path, row_count);
    when *rows* is empty nothing is written and the count is 0.
    """
    target = Path(all_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    if not rows:
        # nothing collected — leave the file untouched
        return str(all_path), 0

    seen: set[str] = set()
    unique_rows: List[Dict[str, Any]] = []
    for row in rows:
        key = _key_from_fields(row.get("product_id"), row.get("url"))
        if key not in seen:
            seen.add(key)
            unique_rows.append(row)

    with pd.ExcelWriter(all_path, engine="openpyxl") as writer:
        pd.DataFrame(unique_rows).to_excel(writer, sheet_name="AllProducts", index=False)
    return str(all_path), len(unique_rows)
9
Parser_NEXT/tests/fixtures/category_sample.html
vendored
Normal file
9
Parser_NEXT/tests/fixtures/category_sample.html
vendored
Normal file
@ -0,0 +1,9 @@
|
||||
<!doctype html><html><body>
|
||||
<div data-testid="product-grid">
|
||||
<div data-testid="product-tile" data-product-id="123">
|
||||
<a href="/en/style/st123" data-testid="productLink">Open</a>
|
||||
<div data-testid="product-name">Sample Product A</div>
|
||||
<div data-testid="price">PLN 99</div>
|
||||
</div>
|
||||
</div>
|
||||
</body></html>
|
||||
9
Parser_NEXT/tests/test_parser.py
Normal file
9
Parser_NEXT/tests/test_parser.py
Normal file
@ -0,0 +1,9 @@
|
||||
from parser import parse_products_from_dom
|
||||
from pathlib import Path
|
||||
import yaml
|
||||
|
||||
def test_dom_parse_basic():
|
||||
html = Path("tests/fixtures/category_sample.html").read_text(encoding="utf-8")
|
||||
cfg = yaml.safe_load(Path("config.yaml").read_text(encoding="utf-8"))
|
||||
lst = parse_products_from_dom(html, cfg)
|
||||
assert isinstance(lst, list)
|
||||
19
Parser_NEXT/utils.py
Normal file
19
Parser_NEXT/utils.py
Normal file
@ -0,0 +1,19 @@
|
||||
import logging, time, random
|
||||
from pathlib import Path
|
||||
|
||||
def setup_logger():
    """Configure root logging: file log in out/logs plus a console echo.

    Safe to call more than once: the console handler is attached only when
    the root logger has no plain StreamHandler yet (repeated calls previously
    stacked handlers and duplicated every console line; basicConfig itself is
    a no-op after the first call).
    """
    Path("out/logs").mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        filename="out/logs/run.log",
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s"
    )
    root = logging.getLogger()
    has_console = any(
        isinstance(h, logging.StreamHandler) and not isinstance(h, logging.FileHandler)
        for h in root.handlers
    )
    if not has_console:
        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        console.setFormatter(logging.Formatter("%(levelname)s %(message)s"))
        root.addHandler(console)
def jitter(min_ms: int, max_ms: int):
    """Sleep for a random duration between min_ms and max_ms milliseconds."""
    delay_ms = random.randint(min_ms, max_ms)
    time.sleep(delay_ms / 1000.0)
Binary file not shown.
@ -28,7 +28,7 @@ def is_temp_or_hidden(name: str) -> bool:
|
||||
|
||||
# === Пути ===
|
||||
script_dir = get_script_dir()
|
||||
folder_path = os.path.join(script_dir, 'Files-todo')
|
||||
folder_path = os.path.join(script_dir, 'Files-todo/manifest')
|
||||
|
||||
# имя результата: All-todo-YYYYMMDD-HHMM.xlsx
|
||||
timestamp = datetime.now().strftime('%Y%m%d-%H%M')
|
||||
|
||||
0
python3.13
Normal file
0
python3.13
Normal file
12
Парсер_IKEA/dictionary_main.txt
Normal file
12
Парсер_IKEA/dictionary_main.txt
Normal file
@ -0,0 +1,12 @@
|
||||
"Wymiary" : "Размеры",
|
||||
"Szerokość" : "Ширина",
|
||||
"Głębokość" : "Глубина",
|
||||
"Obciążenie półki" : "Максимальная нагрузка на полку",
|
||||
"Opakowanie" : "Упаковка",
|
||||
"Wysokość" : "Высота",
|
||||
"Numer artykułu" : "Артикул",
|
||||
"Długość" : "Длинна",
|
||||
"Waga" : "Вес",
|
||||
"Paczka(i)" : "Упаковок",
|
||||
"Pojemność" : "Объем",
|
||||
"Ilość w opakowaniu" : "Количество в упаковке"
|
||||
1
Парсер_IKEA/exclusion_materials.txt
Normal file
1
Парсер_IKEA/exclusion_materials.txt
Normal file
@ -0,0 +1 @@
|
||||
"Bambus", "szkło"
|
||||
Binary file not shown.
121
Парсер_IKEA/leaf_categories copy.txt
Normal file
121
Парсер_IKEA/leaf_categories copy.txt
Normal file
@ -0,0 +1,121 @@
|
||||
|
||||
https://www.ikea.com/pl/pl/cat/haki-20617/
|
||||
https://www.ikea.com/pl/pl/cat/wieszaki-20618/
|
||||
https://www.ikea.com/pl/pl/cat/torby-i-wozki-na-zakupy-16295/
|
||||
https://www.ikea.com/pl/pl/cat/plecaki-i-torby-na-ramie-27821/
|
||||
https://www.ikea.com/pl/pl/cat/poduszki-podrozne-i-akcesoria-turystyczne-16255/
|
||||
https://www.ikea.com/pl/pl/cat/organizery-do-toreb-47441/
|
||||
https://www.ikea.com/pl/pl/cat/torby-chlodzace-46082/
|
||||
https://www.ikea.com/pl/pl/cat/akcesoria-do-przeprowadzki-46078/
|
||||
https://www.ikea.com/pl/pl/cat/kosze-lazienkowe-48940/
|
||||
https://www.ikea.com/pl/pl/cat/polki-i-akcesoria-prysznicowe-10658/
|
||||
https://www.ikea.com/pl/pl/cat/dozowniki-mydla-i-mydelniczki-10656/
|
||||
https://www.ikea.com/pl/pl/cat/szczotki-toaletowe-48944/
|
||||
https://www.ikea.com/pl/pl/cat/uchwyty-na-szczoteczki-do-zebow-48943/
|
||||
https://www.ikea.com/pl/pl/cat/akcesoria-lazienkowe-bez-wiercenia-700699/
|
||||
https://www.ikea.com/pl/pl/cat/pojemniki-na-zywnosc-20606/
|
||||
https://www.ikea.com/pl/pl/cat/zestawy-pojemnikow-na-zywnosc-700586/
|
||||
https://www.ikea.com/pl/pl/cat/akcesoria-do-przechowywania-zywnosci-i-zamykane-torby-700611/
|
||||
https://www.ikea.com/pl/pl/cat/organizery-do-lodowek-700588/
|
||||
https://www.ikea.com/pl/pl/cat/organizery-do-spizarni-i-na-blaty-robocze-700589/
|
||||
https://www.ikea.com/pl/pl/cat/sloiki-puszki-i-chlebaki-15950/
|
||||
https://www.ikea.com/pl/pl/cat/pojemniki-na-przyprawy-15951/
|
||||
https://www.ikea.com/pl/pl/cat/kubki-termiczne-i-bidony-700352/
|
||||
https://www.ikea.com/pl/pl/cat/torby-chlodzace-46082/
|
||||
https://www.ikea.com/pl/pl/cat/stojaki-na-wino-i-butelki-15952/
|
||||
https://www.ikea.com/pl/pl/cat/mix-match-pojemnikow-i-pokrywek-na-zywnosc-700610/
|
||||
https://www.ikea.com/pl/pl/cat/biurka-do-domu-20651/
|
||||
https://www.ikea.com/pl/pl/cat/produkty-akustyczne-mittzon-700539/
|
||||
https://www.ikea.com/pl/pl/cat/przegrody-do-pokoju-i-biurka-mittzon-700540/
|
||||
https://www.ikea.com/pl/pl/cat/stoly-konferencyjne-mittzon-700541/
|
||||
https://www.ikea.com/pl/pl/cat/biurka-mittzon-700542/
|
||||
https://www.ikea.com/pl/pl/cat/biurka-i-stoly-trotten-55993/
|
||||
https://www.ikea.com/pl/pl/cat/kontenerki-i-przechowywanie-trotten-55992/
|
||||
https://www.ikea.com/pl/pl/cat/blaty-i-podstawy-trotten-55991/
|
||||
https://www.ikea.com/pl/pl/cat/akcesoria-trotten-55990/
|
||||
https://www.ikea.com/pl/pl/cat/stoly-konferencyjne-trotten-700337/
|
||||
https://www.ikea.com/pl/pl/cat/przegrody-na-biurko-700336/
|
||||
https://www.ikea.com/pl/pl/cat/idasen-biurka-47426/
|
||||
https://www.ikea.com/pl/pl/cat/idasen-szafki-i-komody-biurowe-47427/
|
||||
https://www.ikea.com/pl/pl/cat/idasen-blaty-biurek-i-ramy-dolne-47425/
|
||||
https://www.ikea.com/pl/pl/cat/przegrody-na-biurko-700336/
|
||||
https://www.ikea.com/pl/pl/cat/kombinacje-biurek-stolow-18623/
|
||||
https://www.ikea.com/pl/pl/cat/nogi-i-kozly-do-stolow-biurek-11845/
|
||||
https://www.ikea.com/pl/pl/cat/blaty-do-stolow-biurek-11844/
|
||||
https://www.ikea.com/pl/pl/cat/biurko-dzieciece-relatera-700562/
|
||||
https://www.ikea.com/pl/pl/cat/blaty-i-podstawy-biurek-relatera-700563/
|
||||
https://www.ikea.com/pl/pl/cat/akcesoria-do-relatera-700564/
|
||||
https://www.ikea.com/pl/pl/cat/biurka-gamingowe-47070/
|
||||
https://www.ikea.com/pl/pl/cat/biurka-do-pracy-na-stojaco-55008/
|
||||
https://www.ikea.com/pl/pl/cat/biurka-do-biura-47069/
|
||||
https://www.ikea.com/pl/pl/cat/biurka-dla-dzieci-24714/
|
||||
https://www.ikea.com/pl/pl/cat/podstawki-i-stoliki-pod-laptopa-24830/
|
||||
https://www.ikea.com/pl/pl/cat/przegrody-na-biurko-700336/
|
||||
https://www.ikea.com/pl/pl/cat/krzesla-do-biurka-w-domu-20653/
|
||||
https://www.ikea.com/pl/pl/cat/krzesla-biurowe-20654/
|
||||
https://www.ikea.com/pl/pl/cat/krzesla-do-biurek-dla-dzieci-24715/
|
||||
https://www.ikea.com/pl/pl/cat/biurka-gamingowe-47070/
|
||||
https://www.ikea.com/pl/pl/cat/krzesla-i-fotele-gamingowe-47067/
|
||||
https://www.ikea.com/pl/pl/cat/akcesoria-gamingowe-55397/
|
||||
https://www.ikea.com/pl/pl/cat/zestawy-gamingowe-biurek-i-krzesel-56516/
|
||||
https://www.ikea.com/pl/pl/cat/stoly-konferencyjne-bekant-54173/
|
||||
https://www.ikea.com/pl/pl/cat/zestawy-stolow-konferencyjnych-i-krzesel-700424/
|
||||
https://www.ikea.com/pl/pl/cat/zestawy-biurek-i-krzesel-53249/
|
||||
https://www.ikea.com/pl/pl/cat/krzesla-konferencyjne-47068/
|
||||
https://www.ikea.com/pl/pl/cat/zarowki-led-700412/
|
||||
https://www.ikea.com/pl/pl/cat/ozdobne-zarowki-led-700413/
|
||||
https://www.ikea.com/pl/pl/cat/inteligentne-zarowki-36813/
|
||||
https://www.ikea.com/pl/pl/cat/lampy-wiszace-i-zyrandole-18751/
|
||||
https://www.ikea.com/pl/pl/cat/lampy-sufitowe-18752/
|
||||
https://www.ikea.com/pl/pl/cat/reflektory-sufitowe-18753/
|
||||
https://www.ikea.com/pl/pl/cat/zyrandole-59307/
|
||||
https://www.ikea.com/pl/pl/cat/lampy-stolowe-10732/
|
||||
https://www.ikea.com/pl/pl/cat/lampy-podlogowe-10731/
|
||||
https://www.ikea.com/pl/pl/cat/klosze-i-abazury-do-lamp-10804/
|
||||
https://www.ikea.com/pl/pl/cat/podstawy-i-oprawki-do-lamp-10805/
|
||||
https://www.ikea.com/pl/pl/cat/lampki-na-biurko-20502/
|
||||
https://www.ikea.com/pl/pl/cat/reflektory-sufitowe-18753/
|
||||
https://www.ikea.com/pl/pl/cat/reflektory-scienne-20505/
|
||||
https://www.ikea.com/pl/pl/cat/oswietlenie-szynowe-25209/
|
||||
https://www.ikea.com/pl/pl/cat/lampy-scienne-i-kinkiety-20504/
|
||||
https://www.ikea.com/pl/pl/cat/reflektory-scienne-20505/
|
||||
https://www.ikea.com/pl/pl/cat/oswietlenie-pokoju-dzieciecego-18773/
|
||||
https://www.ikea.com/pl/pl/cat/lampy-led-20516/
|
||||
https://www.ikea.com/pl/pl/cat/zarowki-led-700412/
|
||||
https://www.ikea.com/pl/pl/cat/ozdobne-zarowki-led-700413/
|
||||
https://www.ikea.com/pl/pl/cat/inteligentne-zarowki-36813/
|
||||
https://www.ikea.com/pl/pl/cat/paski-led-57542/
|
||||
https://www.ikea.com/pl/pl/cat/inteligentne-lampy-59308/
|
||||
https://www.ikea.com/pl/pl/cat/lampy-przenosne-700512/
|
||||
https://www.ikea.com/pl/pl/cat/inteligentne-zarowki-36813/
|
||||
https://www.ikea.com/pl/pl/cat/inteligentne-oswietlenie-zintegrowane-42248/
|
||||
https://www.ikea.com/pl/pl/cat/urzadzenia-sterujace-i-akcesoria-36814/
|
||||
https://www.ikea.com/pl/pl/cat/zestawy-inteligentnego-oswietlenia-36815/
|
||||
https://www.ikea.com/pl/pl/cat/inteligentne-lampy-59308/
|
||||
https://www.ikea.com/pl/pl/cat/panele-led-sufitowe-36816/
|
||||
https://www.ikea.com/pl/pl/cat/oswietlenie-szafek-lazienkowych-55010/
|
||||
https://www.ikea.com/pl/pl/cat/oswietlenie-mebli-kuchennych-16282/
|
||||
https://www.ikea.com/pl/pl/cat/oswietlenie-regalow-16281/
|
||||
https://www.ikea.com/pl/pl/cat/oswietlenie-szaf-16283/
|
||||
https://www.ikea.com/pl/pl/cat/oswietlenie-szafek-lazienkowych-55010/
|
||||
https://www.ikea.com/pl/pl/cat/oswietlenie-sufitowe-do-lazienki-700215/
|
||||
https://www.ikea.com/pl/pl/cat/oswietlenie-scienne-do-lazienki-700214/
|
||||
https://www.ikea.com/pl/pl/cat/lustra-z-oswietleniem-49138/
|
||||
https://www.ikea.com/pl/pl/cat/zewnetrzne-lampy-podlogowe-700615/
|
||||
https://www.ikea.com/pl/pl/cat/zewnetrzne-lampy-scienne-700616/
|
||||
https://www.ikea.com/pl/pl/cat/lampy-stolowe-zewnetrzne-700617/
|
||||
https://www.ikea.com/pl/pl/cat/lampy-wiszace-zewnetrzne-700618/
|
||||
https://www.ikea.com/pl/pl/cat/zewnetrzne-lancuchy-swietlne-700619/
|
||||
https://www.ikea.com/pl/pl/cat/oswietlenie-sciezek-700620/
|
||||
https://www.ikea.com/pl/pl/cat/lampiony-i-latarenki-do-zewnatrz-54942/
|
||||
https://www.ikea.com/pl/pl/cat/dekoracje-swietlne-stolu-700179/
|
||||
https://www.ikea.com/pl/pl/cat/dekoracyjne-lampy-wiszace-700177/
|
||||
https://www.ikea.com/pl/pl/cat/swiatla-lancuchowe-700180/
|
||||
https://www.ikea.com/pl/pl/cat/swiece-led-39266/
|
||||
https://www.ikea.com/pl/pl/cat/oswietlenie-dekoracyjne-led-54943/
|
||||
https://www.ikea.com/pl/pl/cat/paski-led-57542/
|
||||
https://www.ikea.com/pl/pl/cat/sofy-tapicerowane-2-osobowe-10668/
|
||||
https://www.ikea.com/pl/pl/cat/sofy-tapicerowane-3-osobowe-10670/
|
||||
https://www.ikea.com/pl/pl/cat/sofy-materialowe-z-szezlongami-47388/
|
||||
https://www.ikea.com/pl/pl/cat/narozniki-tapicerowane-10671/
|
||||
https://www.ikea.com/pl/pl/cat/sekcje-sofy-modulowej-31786/
|
||||
@ -1 +1,5 @@
|
||||
https://www.ikea.com/pl/pl/cat/poduszki-ergonomiczne-46083/
|
||||
|
||||
https://www.ikea.com/pl/pl/cat/akcesoria-do-przechowywania-zywnosci-i-zamykane-torby-700611/
|
||||
https://www.ikea.com/pl/pl/cat/organizery-do-lodowek-700588/
|
||||
https://www.ikea.com/pl/pl/cat/organizery-do-spizarni-i-na-blaty-robocze-700589/
|
||||
https://www.ikea.com/pl/pl/cat/sloiki-puszki-i-chlebaki-15950/
|
||||
|
||||
Binary file not shown.
@ -1,57 +1 @@
|
||||
https://www.ikea.com/pl/pl/p/majgull-zaslony-zacieniajace-1-para-bezowozolty-na-tasmie-70586026/
|
||||
https://www.ikea.com/pl/pl/p/majgull-zaslony-zaciemniajace-para-szary-na-tasmie-80417815/
|
||||
https://www.ikea.com/pl/pl/p/majgull-zaslony-zaciemniajace-para-szary-na-tasmie-50417812/
|
||||
https://www.ikea.com/pl/pl/p/majgull-zaslony-zaciemniajace-para-ciemnozielony-na-tasmie-30586033/
|
||||
https://www.ikea.com/pl/pl/p/maesterrot-zaslona-2-szt-bialy-bialy-kratka-na-tasmie-20602496/
|
||||
https://www.ikea.com/pl/pl/p/maesterrot-zaslona-2-szt-bezowy-bialy-wzor-w-kropki-na-tasmie-00602567/
|
||||
https://www.ikea.com/pl/pl/p/loennstaevmal-zaslony-zaciemniajace-para-jasny-czerwono-brazowy-na-tasmie-50556370/
|
||||
https://www.ikea.com/pl/pl/p/loennstaevmal-zaslony-zaciemniajace-para-jasnooliwkowy-na-tasmie-80556335/
|
||||
https://www.ikea.com/pl/pl/p/loennstaevmal-zaslony-zaciemniajace-para-bezowy-na-tasmie-70556374/
|
||||
https://www.ikea.com/pl/pl/p/lillyana-firanki-2-szt-bialy-kwiat-na-tunelu-30386524/
|
||||
https://www.ikea.com/pl/pl/p/lill-firanki-1-para-bialy-na-tunelu-10070262/
|
||||
https://www.ikea.com/pl/pl/p/lenda-zaslona-z-wiazaniem-2-szt-kremowy-na-tasmie-50552881/
|
||||
https://www.ikea.com/pl/pl/p/lenda-zaslona-z-wiazaniem-2-szt-jasny-szarozielony-na-tasmie-90559197/
|
||||
https://www.ikea.com/pl/pl/p/lenda-zaslona-z-wiazaniem-2-szt-ciemnoszary-na-tasmie-60552871/
|
||||
https://www.ikea.com/pl/pl/p/lenda-zaslona-z-wiazaniem-2-szt-brazowoczerwony-na-tasmie-30559195/
|
||||
https://www.ikea.com/pl/pl/p/korgmott-zaslony-zaciemniajace-para-ciemnoszary-na-tasmie-30597159/
|
||||
https://www.ikea.com/pl/pl/p/korgmott-zaslony-zaciemniajace-para-bialy-na-tasmie-40597149/
|
||||
https://www.ikea.com/pl/pl/p/korgmott-zaslony-zaciemniajace-para-bezowy-na-tasmie-50597158/
|
||||
https://www.ikea.com/pl/pl/p/hilja-zaslona-2-szt-szary-na-tasmie-90390735/
|
||||
https://www.ikea.com/pl/pl/p/hilja-zaslona-2-szt-bialy-na-tasmie-50430818/
|
||||
https://www.ikea.com/pl/pl/p/hilja-zaslona-2-szt-bialy-na-tasmie-40430814/
|
||||
https://www.ikea.com/pl/pl/p/hildrun-firanki-2-szt-bialy-w-kropki-na-tunelu-00386549/
|
||||
https://www.ikea.com/pl/pl/p/haellebraecka-firanki-2-szt-jasnobezowy-na-tasmie-00556848/
|
||||
https://www.ikea.com/pl/pl/p/haellebraecka-firanki-2-szt-bialy-na-tasmie-70559674/
|
||||
https://www.ikea.com/pl/pl/p/haeggveckmal-zaslony-zacieniajace-1-para-ciemnozielony-na-tasmie-00569110/
|
||||
https://www.ikea.com/pl/pl/p/haeggveckmal-zaslony-zacieniajace-1-para-ciemnoszary-na-tasmie-00562123/
|
||||
https://www.ikea.com/pl/pl/p/haeggveckmal-zaslony-zacieniajace-1-para-bezowy-na-tasmie-20569029/
|
||||
https://www.ikea.com/pl/pl/p/glesgroee-firanki-2-szt-szary-na-tasmie-30548989/
|
||||
https://www.ikea.com/pl/pl/p/gjertrud-firanki-2-szt-bialy-na-tasmie-30386538/
|
||||
https://www.ikea.com/pl/pl/p/gjertrud-firanka-1-szt-bialy-na-tasmie-60558948/
|
||||
https://www.ikea.com/pl/pl/p/ginstmott-zaslona-2-szt-zolty-na-tasmie-60597186/
|
||||
https://www.ikea.com/pl/pl/p/ginstmott-zaslona-2-szt-rozowy-na-tasmie-50597182/
|
||||
https://www.ikea.com/pl/pl/p/ginstmott-zaslona-2-szt-granatowy-na-tasmie-80597190/
|
||||
https://www.ikea.com/pl/pl/p/ginstmott-zaslona-2-szt-bialy-z-oczkami-30605418/
|
||||
https://www.ikea.com/pl/pl/p/ginstmott-zaslona-2-szt-bialy-na-tasmie-60597167/
|
||||
https://www.ikea.com/pl/pl/p/ginstmott-zaslona-2-szt-bezowy-z-oczkami-70605416/
|
||||
https://www.ikea.com/pl/pl/p/ginstmott-zaslona-2-szt-bezowy-na-tasmie-40597173/
|
||||
https://www.ikea.com/pl/pl/p/foensterbomal-magnetyczna-opaska-do-zaslon-bezowy-70498553/
|
||||
https://www.ikea.com/pl/pl/p/fjaedermott-zaslona-2-szt-bialy-szary-na-tasmie-70504587/
|
||||
https://www.ikea.com/pl/pl/p/dytag-zaslona-2-szt-szarozielony-na-tasmie-80552889/
|
||||
https://www.ikea.com/pl/pl/p/dytag-zaslona-2-szt-len-na-tasmie-80607820/
|
||||
https://www.ikea.com/pl/pl/p/dytag-zaslona-2-szt-granatowy-na-tasmie-90552493/
|
||||
https://www.ikea.com/pl/pl/p/dytag-zaslona-2-szt-bialy-na-tasmie-20466719/
|
||||
https://www.ikea.com/pl/pl/p/bymott-zaslona-2-szt-bialy-jasnoszary-w-paski-na-tasmie-30466686/
|
||||
https://www.ikea.com/pl/pl/p/bymott-zaslona-2-szt-bialy-bezowy-w-paski-na-tasmie-80509971/
|
||||
https://www.ikea.com/pl/pl/p/bruksvara-zaslona-zacieniajaca-1-szt-szary-na-tasmie-10574248/
|
||||
https://www.ikea.com/pl/pl/p/bruksvara-zaslona-zacieniajaca-1-szt-jasnoniebieski-na-tasmie-50574171/
|
||||
https://www.ikea.com/pl/pl/p/bergklematis-magnetyczna-opaska-do-zaslon-szary-90498552/
|
||||
https://www.ikea.com/pl/pl/p/bengta-zaslona-zaciemniajaca-1-szt-zielony-na-tasmie-10602166/
|
||||
https://www.ikea.com/pl/pl/p/bengta-zaslona-zaciemniajaca-1-szt-bezowy-na-tasmie-30602090/
|
||||
https://www.ikea.com/pl/pl/p/annakajsa-zaslony-zacieniajace-1-para-jasnoszary-na-tasmie-60583429/
|
||||
https://www.ikea.com/pl/pl/p/annakajsa-zaslony-zacieniajace-1-para-ciemnozielony-na-tasmie-70586074/
|
||||
https://www.ikea.com/pl/pl/p/annakajsa-zaslony-zacieniajace-1-para-bezowy-na-tasmie-30462792/
|
||||
https://www.ikea.com/pl/pl/p/annakajsa-zaslony-zacieniajace-1-para-antracyt-na-tasmie-40583430/
|
||||
https://www.ikea.com/pl/pl/p/alvine-spets-firanki-1-para-kremowy-na-tunelu-80070763/
|
||||
https://www.ikea.com/pl/pl/p/alvine-spets-firanka-1-szt-bialy-na-tunelu-50559811/
|
||||
https://www.ikea.com/pl/pl/p/aengsfryle-firanka-1-szt-bialy-na-tasmie-70569220/
|
||||
https://www.ikea.com/pl/pl/cat/zestawy-pojemnikow-na-zywnosc-700586
|
||||
File diff suppressed because one or more lines are too long
@ -1,15 +1,38 @@
|
||||
import requests
|
||||
import json
|
||||
import os
|
||||
import html
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os, json, re, math, time, html, requests, datetime
|
||||
from bs4 import BeautifulSoup
|
||||
from openpyxl import Workbook
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
INPUT_FILE = os.path.join(BASE_DIR, "product_links.txt")
|
||||
OUTPUT_FILE = os.path.join(BASE_DIR, "result.xlsx")
|
||||
# ───────────────────────── ПУТИ / ФАЙЛЫ ───────────────────────────
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
RECORDS_DIR = os.path.join(BASE_DIR, "records_folder")
|
||||
os.makedirs(RECORDS_DIR, exist_ok=True)
|
||||
|
||||
INPUT_FILE = os.path.join(BASE_DIR, "product_links.txt")
|
||||
OUTPUT_FILE = os.path.join(RECORDS_DIR, "records.xlsx")
|
||||
DICT_FILE = os.path.join(BASE_DIR, "dictionary_main.txt")
|
||||
EXCL_FILE = os.path.join(BASE_DIR, "exclusion_materials.txt")
|
||||
POST_LOG = os.path.join(RECORDS_DIR, "post_log.txt")
|
||||
|
||||
# ───────────────────────── НАСТРОЙКИ POST ─────────────────────────
|
||||
'''
|
||||
На старте спросим:
|
||||
- сохранять ли JSON батчи на диск
|
||||
- отправлять ли батчи на API
|
||||
|
||||
Ответ: 1 (да) / 0 (нет). Пустой ввод = 1.
|
||||
'''
|
||||
POST_URL = os.getenv("IKEA_POST_URL", "http://localhost:3005/parser/data")
|
||||
POST_API_KEY = os.getenv("IKEA_POST_API_KEY", "")
|
||||
POST_TIMEOUT = 20
|
||||
BATCH_SIZE = 50
|
||||
|
||||
# ───────────────────────── НАСТРОЙКИ САЙТА ────────────────────────
|
||||
HEADERS = {"User-Agent": "Mozilla/5.0"}
|
||||
CSS_SELECTOR = ".pip-product__subgrid.product-pip.js-product-pip"
|
||||
|
||||
BLOCKS = [
|
||||
"buyModule",
|
||||
"productSummary",
|
||||
@ -18,9 +41,12 @@ BLOCKS = [
|
||||
"keyFacts",
|
||||
"stockcheckSection",
|
||||
"availabilityGroup",
|
||||
"productGallery"
|
||||
"productGallery",
|
||||
]
|
||||
# ── какие колонки сохраняем ─────────────────────────────────────────
|
||||
|
||||
'''
|
||||
Whitelist колонок для Excel.
|
||||
'''
|
||||
KEEP_COLUMNS = [
|
||||
"availabilityGroup.serverOnlineSellable",
|
||||
"availabilityGroup.storeHeader",
|
||||
@ -31,41 +57,140 @@ KEEP_COLUMNS = [
|
||||
"keyFacts.ariaLabels",
|
||||
"keyFacts.gaLabel",
|
||||
"keyFacts.keyFacts",
|
||||
"keyFacts.keyFacts_formatted",
|
||||
"pipPricePackage.measurementText",
|
||||
"pipPricePackage.productDescription",
|
||||
"productGallery.urls",
|
||||
"productInformationSection.dimensionProps",
|
||||
"productInformationSection.dimensionProps_formatted",
|
||||
"productInformationSection.dimensionProps_formatted_html_translated",
|
||||
"productInformationSection.productDetailsProps",
|
||||
"productInformationSection.productDetailsProps_formatted",
|
||||
"productInformationSection.productDetailsProps_formatted_html",
|
||||
"productSummary.description",
|
||||
"productSummary.visibleItemNo",
|
||||
"stockcheckSection.packagingProps",
|
||||
"stockcheckSection.typeName",
|
||||
"url",
|
||||
"total brutto",
|
||||
"prductVariantColorMeasure",
|
||||
"categoryBreadcrumb",
|
||||
"originalName", # ### NEW: колонка для Excel
|
||||
"url",
|
||||
]
|
||||
|
||||
# ───────────────────────── УТИЛИТЫ I/O ────────────────────────────
|
||||
def ask_bool(prompt: str, default: str = "1") -> bool:
    """Ask a 1/0 question on stdin; empty input (or EOF) means *default*.

    Returns True only when the effective answer is "1".
    """
    try:
        answer = input(f"{prompt} (1=yes, 0=no) [{default}]: ").strip()
    except EOFError:
        answer = default
    else:
        if not answer:
            answer = default
    return answer == "1"
def _post_log(msg: str):
    """Append one line to post_log.txt; failures are silently ignored."""
    try:
        with open(POST_LOG, "a", encoding="utf-8") as fh:
            fh.write(msg.rstrip() + "\n")
    except Exception:
        # best-effort logging: never let a log failure break the run
        pass
def _now_tag():
|
||||
return datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
def _save_json_batch(payload: dict, batch_index: int):
    """Dump one batch payload as pretty JSON into records_folder; return its path."""
    name = f"ikea_batch_{_now_tag()}_{batch_index:04d}.json"
    path = os.path.join(RECORDS_DIR, name)
    with open(path, "w", encoding="utf-8") as out:
        json.dump(payload, out, ensure_ascii=False, indent=2)
    print(f"💾 JSON saved: {name}")
    return path
# ───────────────────────── СЛОВАРИ / ФИЛЬТРЫ ──────────────────────
|
||||
def load_dictionary(path: str) -> dict:
    """Read the translation dictionary file.

    Expected format, one pair per line (trailing commas are tolerated):
        "Wymiary" : "Размеры",
    Returns an empty dict when the file does not exist.
    """
    if not os.path.isfile(path):
        return {}
    with open(path, "r", encoding="utf-8") as fh:
        text = fh.read()
    return dict(re.findall(r'"([^"]+)"\s*:\s*"([^"]+)"', text))
DICT = load_dictionary(DICT_FILE)
|
||||
|
||||
def translate_token(token: str) -> str:
    """Translate a Polish label via DICT; unknown tokens pass through unchanged."""
    translated = DICT.get(token)
    return token if translated is None else translated
def load_exclusions(path: str) -> set:
    """Load exclusion tokens from exclusion_materials.txt.

    Tokens may appear quoted, or separated by commas/semicolons/newlines.
    Matching elsewhere is case-insensitive, so tokens are lower-cased here.
    Returns an empty set when the file does not exist.
    """
    if not os.path.isfile(path):
        return set()
    with open(path, "r", encoding="utf-8") as fh:
        text = fh.read()
    # quoted tokens take priority when present
    quoted = re.findall(r'"([^"]+)"', text, flags=re.S)
    raw_tokens = quoted if quoted else re.split(r"[,;\n\r]+", text)
    return {tok.strip().lower() for tok in raw_tokens if tok.strip()}
EXCLUSIONS = load_exclusions(EXCL_FILE)
|
||||
|
||||
def materials_from_details_json(details: dict) -> list[str]:
    """Collect every string stored under a "material" key, at any depth.

    productDetailsProps comes in several shapes, so the structure is walked
    recursively instead of assuming a fixed schema.
    """
    found: list[str] = []

    def _walk(node):
        if isinstance(node, dict):
            for key, value in node.items():
                if key == "material" and isinstance(value, str):
                    found.append(value)
                else:
                    _walk(value)
        elif isinstance(node, list):
            for element in node:
                _walk(element)

    _walk(details or {})
    return found
def materials_match_exclusions(details: dict, exclusion_tokens: set) -> bool:
    """Return True when any exclusion token occurs in any material string
    (substring match, case-insensitive)."""
    if not exclusion_tokens:
        return False
    haystack = "\n".join(materials_from_details_json(details)).lower()
    return any(token in haystack for token in exclusion_tokens)
|
||||
|
||||
# ───────────────────────── ФОРМАТТЕРЫ ─────────────────────────────
|
||||
def _parse_json_value(val):
|
||||
if isinstance(val, (dict, list)) or val is None:
|
||||
return val
|
||||
if isinstance(val, str):
|
||||
s = val.strip()
|
||||
if not s:
|
||||
return val
|
||||
try:
|
||||
return json.loads(s)
|
||||
except Exception:
|
||||
return val
|
||||
return val
|
||||
|
||||
def flatten_block(block_name, data):
|
||||
if not isinstance(data, dict):
|
||||
return {}
|
||||
|
||||
flat = {}
|
||||
|
||||
for k, v in data.items():
|
||||
|
||||
'''
|
||||
# === 1. dimensionProps.images ===
|
||||
if block_name == "productInformationSection" and k == "dimensionProps":
|
||||
if isinstance(v, dict):
|
||||
urls = []
|
||||
for img in v.get("images", []):
|
||||
if isinstance(img, dict):
|
||||
url = img.get("url")
|
||||
if url:
|
||||
urls.append(url)
|
||||
flat[f"{key_name}.images_urls"] = "\n".join(urls)
|
||||
continue
|
||||
'''
|
||||
# === 2. mediaList.content.url → productGallery.urls
|
||||
if block_name == "productGallery" and k == "mediaList":
|
||||
if isinstance(v, list):
|
||||
urls = []
|
||||
@ -74,33 +199,242 @@ def flatten_block(block_name, data):
|
||||
if isinstance(content, dict) and "url" in content:
|
||||
urls.append(content["url"])
|
||||
flat["productGallery.urls"] = "\n".join(urls)
|
||||
return flat # ⬅ возвращаем только urls, остальные поля игнорируем
|
||||
|
||||
continue
|
||||
|
||||
# === Остальные поля — по умолчанию ===
|
||||
return flat
|
||||
key = f"{block_name}.{k}"
|
||||
flat[key] = v
|
||||
|
||||
return flat
|
||||
|
||||
def format_keyfacts(raw_keyfacts):
    """Render the keyFacts list as plain text.

    The first element's ``name`` (default "Właściwości") becomes a single
    header line; every truthy ``label`` follows on its own line.
    Returns "" for non-list input or an empty list.
    """
    if not isinstance(raw_keyfacts, list):
        return ""
    parts = []
    for entry in raw_keyfacts:
        entry = entry or {}
        if not parts:
            # Header comes from the first element only.
            parts.append(entry.get("name", "Właściwości"))
        label = entry.get("label")
        if label:
            parts.append(label)
    return "\n".join(parts)
|
||||
|
||||
|
||||
|
||||
def extract_data(url):
|
||||
"""
|
||||
Возвращает словарь с нужными полями товара IKEA.
|
||||
+ NEW: добавляет ключ 'categoryBreadcrumb' вида
|
||||
'Produkty/Tekstylia/Tekstylia do sypialni/Narzuty na łóżko'
|
||||
(берётся из JSON-LD BreadcrumbList).
|
||||
"""
|
||||
def _fmt_float(x):
|
||||
try:
|
||||
response = requests.get(url, timeout=10,
|
||||
headers={"User-Agent": "Mozilla/5.0"})
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
return f"{float(x):.2f}".rstrip("0").rstrip(".")
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
def _collect_packaging_total_kg(packaging):
|
||||
total = 0.0
|
||||
if not isinstance(packaging, dict):
|
||||
return total
|
||||
content = (packaging.get("contentProps") or {}).get("packages") or []
|
||||
for pkg in content:
|
||||
qty = ((pkg.get("quantity") or {}).get("value")) or 1
|
||||
ms = pkg.get("measurements") or []
|
||||
for block in ms:
|
||||
if not isinstance(block, list):
|
||||
continue
|
||||
weight_lbl = next((m for m in block if (m.get("type") == "weight" or m.get("label") == "Waga")), None)
|
||||
if weight_lbl and isinstance(weight_lbl.get("value"), (int, float)):
|
||||
total += float(weight_lbl["value"]) * (qty or 1)
|
||||
return total
|
||||
|
||||
def format_dimensions(raw_dim_props, with_html=False, translated=False):
    """Render dimensionProps as a "Wymiary" + "Opakowanie" text block.

    raw_dim_props: the ``dimensionProps`` JSON object from the product page.
    with_html:     emit ``<strong>`` headers and ``<br/>`` separators instead
                   of plain newlines.
    translated:    pass labels through translate_token() (dictionary lookup).

    Returns a single string; "" when raw_dim_props is not a dict.
    """
    if not isinstance(raw_dim_props, dict):
        return ""
    lines = []
    br = "<br/>" if with_html else "\n"

    # Section header: product dimensions.
    title = translate_token("Wymiary") if translated else "Wymiary"
    lines.append(f"<strong>{title}</strong>" if with_html else title)

    # One "name: measure" line per dimension entry; skip fully empty ones.
    for d in raw_dim_props.get("dimensions", []):
        name = d.get("name", "")
        meas = d.get("measure", "")
        if not name and not meas:
            continue
        if translated:
            name_t = translate_token(name)
            line = f"{name_t}: {meas}".strip()
        else:
            line = f"{name}: {meas}".strip()
        lines.append(line)

    # Section header: packaging ("Opakowanie"), preceded by a blank separator.
    pack = (raw_dim_props.get("packaging") or {})
    pack_title = translate_token("Opakowanie") if translated else "Opakowanie"
    lines.append(br if with_html else "")
    lines.append(f"<strong>{pack_title}</strong>" if with_html else pack_title)

    content = (pack.get("contentProps") or {}).get("packages") or []
    for pkg in content:
        name = pkg.get("name") or ""
        if name:
            lines.append(name)

        # Article number, label and value on separate lines.
        art = (pkg.get("articleNumber") or {}).get("value")
        if art:
            art_lbl = "Numer artykułu"
            if translated:
                art_lbl = translate_token(art_lbl)
            lines.append(art_lbl)
            lines.append(f"{art}")

        # Measurement blocks: "label: text" per entry.
        ms = pkg.get("measurements") or []
        for block in ms:
            if not isinstance(block, list):
                continue
            for m in block:
                lbl = m.get("label", "")
                txt = m.get("text", "")
                if translated:
                    lbl = translate_token(lbl) if lbl else lbl
                if lbl or txt:
                    lines.append(f"{lbl}: {txt}".strip(": "))

        # Package count ("Paczka(i): N").
        q_val = ((pkg.get("quantity") or {}).get("value"))
        if q_val:
            q_lbl = "Paczka(i)"
            if translated:
                q_lbl = translate_token(q_lbl)
            lines.append(f"{q_lbl}: {q_val}")

    if with_html:
        s = br.join([x for x in lines if x is not None])
        # Collapse runs of 3+ <br/> down to exactly two.
        s = re.sub(r"(" + re.escape(br) + r"){2,}", br*2, s)
        s = s.strip(br)
        # ### NEW: safety net — the leading "<" sometimes gets lost in the
        # Excel preview, restore it so the <strong> tag stays valid.
        if s.startswith("strong>"):
            s = "<" + s
        return s
    return "\n".join([x for x in lines if x is not None]).strip()
|
||||
|
||||
def format_product_details(raw_details, add_summary_desc="", with_html=False, skip_assembly=True):
    """Render productDetailsProps as a multi-section product description.

    Sections (Polish headers, in order): product info, "Dobrze wiedzieć"
    (good to know), "Materiały i pielęgnacja" (materials & care), and
    "Bezpieczeństwo i zgodność z przepisami" (safety & compliance).

    raw_details:      the ``productDetailsProps`` JSON object.
    add_summary_desc: optional summary paragraph prepended to the output.
    with_html:        emit ``<strong>`` headers / ``<br/>`` separators.
    skip_assembly:    kept for interface compatibility; the assembly section
                      is currently always omitted (see disabled block below).

    Returns a single string; for non-dict input, add_summary_desc as-is.
    """
    if not isinstance(raw_details, dict):
        # NOTE(review): both branches are identical — the conditional is redundant.
        return add_summary_desc if with_html else add_summary_desc

    br = "<br/>" if with_html else "\n"
    out = []

    if add_summary_desc:
        out.append(add_summary_desc)
        out.append(br if with_html else "")

    # Section 1: product information + description paragraphs.
    t1 = "Informacje o produkcie"
    out.append(f"<strong>{t1}</strong>" if with_html else t1)
    pd = (raw_details.get("productDescriptionProps") or {})
    paragraphs = pd.get("paragraphs") or []
    for p in paragraphs:
        out.append(p)

    # Designer credit — only when both label and name are present.
    dlabel = pd.get("designerLabel")
    dname = pd.get("designerName")
    if dlabel and dname:
        out.append(dlabel)
        out.append(dname)

    if raw_details.get("productId"):
        out.append("Numer artykułu")
        out.append(raw_details["productId"])

    # Section 2: "good to know" accordion entries.
    acc = (raw_details.get("accordionObject") or {})
    gk = ((acc.get("goodToKnow") or {}).get("contentProps") or {}).get("goodToKnow") or []
    if gk:
        out.append(br if with_html else "")
        t2 = "Dobrze wiedzieć"
        out.append(f"<strong>{t2}</strong>" if with_html else t2)
        for item in gk:
            txt = item.get("text")
            if txt:
                out.append(txt)

    # Section 3: materials & care instructions.
    mac = (acc.get("materialsAndCare") or {}).get("contentProps") or {}
    mats = mac.get("materials") or []
    care = mac.get("careInstructions") or []

    t3 = "Materiały i pielęgnacja"
    if mats or care:
        out.append(br if with_html else "")
        out.append(f"<strong>{t3}</strong>" if with_html else t3)

    if mats:
        out.append("Materiały")
        for m in mats:
            ptype = m.get("productType", "")
            for mat in (m.get("materials") or []):
                material = mat.get("material", "")
                if ptype:
                    out.append(ptype)
                if material:
                    out.append(material)

    if care:
        detailsCareText = mac.get("detailsCareText", "Pielęgnacja")
        out.append(detailsCareText)
        for c in care:
            ptype = c.get("productType", "")
            texts = c.get("texts") or []
            if ptype:
                out.append(ptype)
            for t in texts:
                out.append(t)

    # Section 4: safety & compliance notes.
    safety = (raw_details.get("safetyAndCompliance") or {}).get("contentProps") or {}
    sc = safety.get("safetyAndCompliance") or []
    if sc:
        out.append(br if with_html else "")
        t4 = "Bezpieczeństwo i zgodność z przepisami"
        out.append(f"<strong>{t4}</strong>" if with_html else t4)
        for s in sc:
            txt = s.get("text")
            if txt:
                out.append(txt)

    '''
    ### Был блок сборки "Instrukcja montażu" — по вашему запросу отключён.
    if not skip_assembly:
        ...
    '''

    if with_html:
        s = br.join([x for x in out if x is not None])
        # Collapse runs of 3+ <br/> down to exactly two.
        s = re.sub(r"(" + re.escape(br) + r"){2,}", br*2, s)
        return s.strip(br)
    return "\n".join([x for x in out if x is not None]).strip()
|
||||
|
||||
def build_variant_color_measure(desc: str, type_name: str, measurement: str) -> str:
    """Combine a variant description and a measurement into "Desc, measurement".

    The product-type prefix is stripped from the front of the description
    (case-insensitive, including trailing punctuation); a description left
    without any alphanumeric content collapses to just the measurement.
    """
    text = desc or ""
    prefix = (type_name or "").strip()
    if prefix:
        # Drop the leading type name plus any separator punctuation after it.
        text = re.sub(
            r"^\s*" + re.escape(prefix) + r"[\s,;:\-–—/]*",
            "",
            text,
            flags=re.IGNORECASE,
        )

    # Nothing meaningful left (only punctuation/whitespace)? Treat as empty.
    if not re.search(r"[0-9A-Za-zА-Яа-яЁёÀ-ž]", text or ""):
        text = ""

    text = text.strip()
    measure = (measurement or "").strip()

    if not text:
        return measure if measure else ""

    # Capitalise only the first character, keep the rest untouched.
    text = text[:1].upper() + text[1:]
    return f"{text}, {measure}" if measure else text
|
||||
|
||||
# ───────────────────── СКРАПИНГ КАРТОЧКИ ──────────────────────────
|
||||
def extract_data(url: str) -> dict:
|
||||
'''
|
||||
Возвращает плоский dict с полями KEEP_COLUMNS.
|
||||
Форматтеры/подсчёты: keyFacts_formatted, dimensionProps_formatted,
|
||||
dimensionProps_formatted_html_translated, productDetailsProps_formatted,
|
||||
productDetailsProps_formatted_html, total brutto, prductVariantColorMeasure, categoryBreadcrumb.
|
||||
'''
|
||||
try:
|
||||
resp = requests.get(url, headers=HEADERS, timeout=15)
|
||||
resp.raise_for_status()
|
||||
soup = BeautifulSoup(resp.text, "html.parser")
|
||||
|
||||
# ── основной JSON из data-hydration-props ──────────────────
|
||||
target = soup.select_one(CSS_SELECTOR)
|
||||
if not target:
|
||||
return {"url": url, "error": "CSS selector not found"}
|
||||
@ -109,92 +443,271 @@ def extract_data(url):
|
||||
if not raw:
|
||||
return {"url": url, "error": "data-hydration-props not found"}
|
||||
|
||||
decoded = html.unescape(raw)
|
||||
full_json = json.loads(decoded)
|
||||
result = {"url": url}
|
||||
decoded = html.unescape(raw)
|
||||
full_json = json.loads(decoded)
|
||||
|
||||
# вытаскиваем нужные блоки
|
||||
result = {"url": url}
|
||||
for block in BLOCKS:
|
||||
result.update(flatten_block(block, full_json.get(block, {})))
|
||||
|
||||
# ── NEW: извлекаем BreadcrumbList → categoryBreadcrumb ────
|
||||
kf_json = _parse_json_value(result.get("keyFacts.keyFacts"))
|
||||
dim_json = _parse_json_value(result.get("productInformationSection.dimensionProps"))
|
||||
det_json = _parse_json_value(result.get("productInformationSection.productDetailsProps"))
|
||||
|
||||
result["keyFacts.keyFacts_formatted"] = format_keyfacts(kf_json)
|
||||
result["productInformationSection.dimensionProps_formatted"] = format_dimensions(dim_json, with_html=False, translated=False)
|
||||
html_trans = format_dimensions(dim_json, with_html=True, translated=True)
|
||||
|
||||
# ### NEW: дополнительная страховка — если вдруг нет '<' в начале:
|
||||
if isinstance(html_trans, str) and html_trans.startswith("strong>"):
|
||||
html_trans = "<" + html_trans
|
||||
|
||||
result["productInformationSection.dimensionProps_formatted_html_translated"] = html_trans
|
||||
|
||||
total_kg = _collect_packaging_total_kg((dim_json or {}).get("packaging") or {})
|
||||
result["total brutto"] = _fmt_float(total_kg)
|
||||
|
||||
summary_desc = result.get("productSummary.description", "") or ""
|
||||
result["productInformationSection.productDetailsProps_formatted"] = format_product_details(det_json, add_summary_desc=summary_desc, with_html=False, skip_assembly=True)
|
||||
result["productInformationSection.productDetailsProps_formatted_html"] = format_product_details(det_json, add_summary_desc=summary_desc, with_html=True, skip_assembly=True)
|
||||
|
||||
desc = result.get("pipPricePackage.productDescription", "") or ""
|
||||
tname = result.get("stockcheckSection.typeName", "") or ""
|
||||
meas = result.get("pipPricePackage.measurementText", "") or ""
|
||||
result["prductVariantColorMeasure"] = build_variant_color_measure(desc, tname, meas)
|
||||
|
||||
# breadcrumb
|
||||
breadcrumb = None
|
||||
for tag in soup.find_all("script",
|
||||
attrs={"type": lambda t: t and "ld+json" in t}):
|
||||
for tag in soup.find_all("script", attrs={"type": lambda t: t and "ld+json" in t}):
|
||||
try:
|
||||
data = json.loads(tag.string)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# если это массив JSON-LD, ищем в нём объект Product / Breadcrumb
|
||||
if isinstance(data, list):
|
||||
data = next((d for d in data
|
||||
if d.get("@type") == "BreadcrumbList"), None)
|
||||
|
||||
data = next((d for d in data if isinstance(d, dict) and d.get("@type") == "BreadcrumbList"), None)
|
||||
if isinstance(data, dict) and data.get("@type") == "BreadcrumbList":
|
||||
items = data.get("itemListElement", [])
|
||||
names = [it.get("name", "") for it in items]
|
||||
breadcrumb = "/".join(names)
|
||||
break # нашли нужный блок – выходим из цикла
|
||||
|
||||
break
|
||||
if breadcrumb:
|
||||
result["categoryBreadcrumb"] = breadcrumb
|
||||
|
||||
return result
|
||||
# применяем whitelist
|
||||
filtered = {k: result.get(k) for k in KEEP_COLUMNS if k != "originalName"}
|
||||
|
||||
'''
|
||||
### NEW: originalName = productName + " " + typeName (без двойных пробелов)
|
||||
'''
|
||||
pn = (result.get("buyModule.productName") or "").strip()
|
||||
tn = (result.get("stockcheckSection.typeName") or "").strip()
|
||||
if pn and tn:
|
||||
orig_name = f"{pn} {tn}"
|
||||
else:
|
||||
orig_name = pn or tn
|
||||
filtered["originalName"] = orig_name
|
||||
|
||||
return filtered
|
||||
|
||||
except Exception as e:
|
||||
return {"url": url, "error": str(e)}
|
||||
|
||||
# ───────────────────── ПОСТРОЕНИЕ ВАРИАНТА / POST ─────────────────
|
||||
def _split_color_size(text: str):
|
||||
if not text:
|
||||
return "", ""
|
||||
parts = [p.strip() for p in text.split(",", 1)]
|
||||
if len(parts) == 2:
|
||||
return parts[0], parts[1]
|
||||
return "", parts[0]
|
||||
|
||||
def _ceil_price(v):
|
||||
try:
|
||||
return int(math.ceil(float(v)))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _ceil_int(v):
|
||||
try:
|
||||
return int(math.ceil(float(v)))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def build_variant(row: dict) -> dict:
    """Map one scraped row onto the API payload item.

    Returns ``{"category": ..., "brand": ..., "variant": ...}`` ready to be
    batched and POSTed.
    """
    sku = (row.get("productSummary.visibleItemNo") or "").replace(" ", "")

    # Color/size come from the prebuilt "Color, measure" string; if both are
    # missing, fall back to the raw measurement text as the size.
    combined = (row.get("prductVariantColorMeasure") or "").strip()
    color, size = _split_color_size(combined)
    if not color and not size:
        size = (row.get("pipPricePackage.measurementText") or "").strip()

    price = _ceil_price(row.get("buyModule.productPrice"))
    product_url = row.get("url") or ""

    # originalName is prebuilt in extract_data; fall back to the bare product name.
    name = row.get("originalName") or row.get("buyModule.productName") or ""

    description_html = row.get("productInformationSection.productDetailsProps_formatted_html") or ""
    # originalComposition = translated HTML dimensions block.
    composition_html = row.get("productInformationSection.dimensionProps_formatted_html_translated") or ""

    gallery = row.get("productGallery.urls") or ""
    images = [u for u in gallery.split("\n") if u.strip()] if isinstance(gallery, str) else []

    # Either availability flag marks the item as sellable online.
    available = bool(row.get("availabilityGroup.serverOnlineSellable")) or \
        bool(row.get("buyModule.onlineSellable"))

    weight = _ceil_int(row.get("total brutto"))

    variant = {
        "status_id": 1,
        "color": color.capitalize() if color else "none",
        "sku": sku,
        "size": size,
        "cost": price,
        "originalUrl": product_url,
        "originalName": name,
        "originalDescription": description_html,
        "originalComposition": composition_html,
        "images": images,
        "inStock": available,
        "weight": weight if weight is not None else 0,
    }

    return {
        # NOTE(review): category is hard-wired to "TEST/IKEA"; the real value
        # would be row.get("categoryBreadcrumb") — confirm before production.
        #"category": {"name": category_name},
        "category": {"name": "TEST/IKEA"},
        "brand": {"name": "ikea"},
        "variant": variant,
    }
|
||||
|
||||
def post_payload(payload: dict) -> dict:
    """POST one batch payload to the API, logging request and response.

    Never raises: returns {"ok", "status", "response"} on any HTTP reply
    and {"ok": False, "status": None, "error": ...} on transport errors.
    """
    headers = {"Content-Type": "application/json"}
    if POST_API_KEY:
        headers["Authorization"] = f"Bearer {POST_API_KEY}"

    body = json.dumps(payload, ensure_ascii=False)
    _post_log(f"→ POST {POST_URL}\nHeaders: {headers}\nBody: {body}")

    try:
        reply = requests.post(
            POST_URL,
            headers=headers,
            data=body.encode("utf-8"),
            timeout=POST_TIMEOUT,
        )
        reply_text = reply.text
        _post_log(f"← {reply.status_code}\n{reply_text}\n{'-'*60}")
        return {
            "ok": 200 <= reply.status_code < 300,
            "status": reply.status_code,
            "response": reply_text,
        }
    except Exception as exc:
        _post_log(f"× ERROR: {exc}\n{'-'*60}")
        return {"ok": False, "status": None, "error": str(exc)}
|
||||
|
||||
# ───────────────────────── СЕРДЦЕ СКРИПТА ─────────────────────────
|
||||
def safe_cell(val):
    """Make a value Excel-writable: dict/list → JSON string, None → ''."""
    if val is None:
        return ""
    if isinstance(val, (dict, list)):
        return json.dumps(val, ensure_ascii=False)
    return val
|
||||
|
||||
def main():
|
||||
# ── читаем ссылки ────────────────────────────────────────────
|
||||
SAVE_JSON = ask_bool("SAVE_JSON (сохранять JSON на диск?)", "1")
|
||||
SEND_JSON = ask_bool("SEND_JSON (отправлять на API?)", "1")
|
||||
|
||||
# читаем ссылки
|
||||
with open(INPUT_FILE, "r", encoding="utf-8") as f:
|
||||
links = [line.strip() for line in f if line.strip()]
|
||||
print(f"Всего ссылок: {len(links)}")
|
||||
|
||||
rows = []
|
||||
# готовим Excel
|
||||
wb = Workbook()
|
||||
ws = wb.active
|
||||
ws.title = "IKEA Products"
|
||||
ws.append(KEEP_COLUMNS)
|
||||
|
||||
# ---- РЕЖИМ КОЛОНОК -----------------------------------------
|
||||
# NEW: фиксированный список колонок (см. KEEP_COLUMNS вверху)
|
||||
all_columns = KEEP_COLUMNS
|
||||
# батч для JSON/API
|
||||
batch_items = []
|
||||
batch_index = 1
|
||||
|
||||
# OLD (восстановить-если-нужно):
|
||||
# all_columns = set() # ← копил все поля
|
||||
# ------------------------------------------------------------
|
||||
def flush_batch():
|
||||
nonlocal batch_items, batch_index
|
||||
if not batch_items:
|
||||
return
|
||||
payload = {"parserName": "ikea", "items": batch_items}
|
||||
if SAVE_JSON:
|
||||
_save_json_batch(payload, batch_index)
|
||||
if SEND_JSON:
|
||||
res = post_payload(payload)
|
||||
ok = res.get("ok")
|
||||
print(f"POST batch {batch_index}: {'OK' if ok else 'FAIL'} (status={res.get('status')})")
|
||||
batch_index += 1
|
||||
batch_items = []
|
||||
|
||||
print("🔍 Извлечение данных...")
|
||||
for idx, link in enumerate(links, 1):
|
||||
print(f"[{idx}/{len(links)}] {link}")
|
||||
row = extract_data(link)
|
||||
|
||||
# NEW: оставляем только нужные 17 полей
|
||||
row = {k: v for k, v in row.items() if k in KEEP_COLUMNS}
|
||||
'''
|
||||
### NEW: originalName уже сформирован в extract_data и попал в row
|
||||
'''
|
||||
|
||||
# OLD (восстановить-если-нужно):
|
||||
# all_columns.update(row.keys()) # ← собирал все ключи
|
||||
# пишем в Excel ВСЁ (без фильтров)
|
||||
ws.append([safe_cell(row.get(col, "")) for col in KEEP_COLUMNS])
|
||||
|
||||
rows.append(row)
|
||||
# ФИЛЬТРЫ для JSON/API
|
||||
try:
|
||||
price = float(row.get("buyModule.productPrice") or 0)
|
||||
except Exception:
|
||||
price = 0.0
|
||||
|
||||
# OLD (восстановить-если-нужно):
|
||||
# if isinstance(all_columns, set):
|
||||
# all_columns = sorted(all_columns) # упорядочивал всё
|
||||
try:
|
||||
total_kg = float(row.get("total brutto") or 0)
|
||||
except Exception:
|
||||
total_kg = 0.0
|
||||
|
||||
def safe(val):
|
||||
"""Преобразует dict / list в JSON-строку, None → ''."""
|
||||
if isinstance(val, (dict, list)):
|
||||
return json.dumps(val, ensure_ascii=False)
|
||||
return "" if val is None else val
|
||||
details_json = row.get("productInformationSection.productDetailsProps") or {}
|
||||
|
||||
print("📤 Сохраняем Excel...")
|
||||
wb = Workbook()
|
||||
ws = wb.active
|
||||
ws.title = "IKEA Products"
|
||||
ws.append(all_columns)
|
||||
# 1) фильтр цены
|
||||
if not (20 <= price <= 1500):
|
||||
pass
|
||||
# 2) фильтр веса
|
||||
elif total_kg > 30:
|
||||
pass
|
||||
# 3) фильтр материалов
|
||||
elif materials_match_exclusions(details_json, EXCLUSIONS):
|
||||
pass
|
||||
else:
|
||||
# прошёл фильтры → добавляем в батч
|
||||
try:
|
||||
item = build_variant(row)
|
||||
batch_items.append(item)
|
||||
except Exception as e:
|
||||
_post_log(f"× build_variant error for {link}: {e}")
|
||||
|
||||
for row in rows:
|
||||
ws.append([safe(row.get(col, "")) for col in all_columns])
|
||||
# авто-сейв Excel каждые 50 строк
|
||||
if idx % 50 == 0:
|
||||
wb.save(OUTPUT_FILE)
|
||||
print(f"💾 autosave: {OUTPUT_FILE}")
|
||||
|
||||
# флаш батча при достижении лимита
|
||||
if len(batch_items) >= BATCH_SIZE:
|
||||
flush_batch()
|
||||
|
||||
# финал: дописать Excel и отправить/сохранить остаток батча
|
||||
wb.save(OUTPUT_FILE)
|
||||
print(f"\n✅ Готово: {OUTPUT_FILE}")
|
||||
print(f"\n✅ Excel готов: {OUTPUT_FILE}")
|
||||
|
||||
flush_batch()
|
||||
print("🎯 Готово.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
|
||||
@ -1,8 +1,21 @@
|
||||
https://www.ikea.com/pl/pl/p/klubbsporre-poduszka-ergonomiczna-sen-bok-plecy-00446096/
|
||||
https://www.ikea.com/pl/pl/p/rosenskaerm-poduszka-ergonomiczna-sen-bok-plecy-90444366/
|
||||
https://www.ikea.com/pl/pl/p/styltmal-poduszka-ergonomiczna-multi-bialy-90518084/
|
||||
https://www.ikea.com/pl/pl/p/isranunkel-poduszka-ergonomiczna-multi-00576733/
|
||||
https://www.ikea.com/pl/pl/p/kvarnven-poduszka-ergonomiczna-sen-bok-plecy-70507350/
|
||||
https://www.ikea.com/pl/pl/p/loekstamfly-poduszka-ergonomiczna-sen-bok-plecy-50596192/
|
||||
https://www.ikea.com/pl/pl/p/papegojbuske-poduszka-ergonomiczna-sen-bok-plecy-00552845/
|
||||
https://www.ikea.com/pl/pl/p/nordstaloert-poduszka-ergonomiczna-sen-bok-plecy-20596240/
|
||||
https://www.ikea.com/pl/pl/p/oevermaett-oslona-zywnosci-kpl-3-szt-silikon-wielobarwny-80417311/
|
||||
https://www.ikea.com/pl/pl/p/bevara-klips-do-torebek-antracyt-ciemnozolty-90524179/
|
||||
https://www.ikea.com/pl/pl/p/istad-torebka-strunowa-wzor-czarny-zolty-50525642/
|
||||
https://www.ikea.com/pl/pl/p/koessebaer-torebka-strunowa-brazowy-70599260/
|
||||
https://www.ikea.com/pl/pl/p/istad-torebka-strunowa-wzor-zielony-40525685/
|
||||
https://www.ikea.com/pl/pl/p/koessebaer-kosz-na-warzywa-i-owoce-topola-60599270/
|
||||
https://www.ikea.com/pl/pl/p/istad-torebka-strunowa-wzor-czerwony-rozowy-80525674/
|
||||
https://www.ikea.com/pl/pl/p/koessebaer-torebka-strunowa-rozne-wzory-50599261/
|
||||
https://www.ikea.com/pl/pl/p/framtung-torba-na-lunch-czarny-40498922/
|
||||
https://www.ikea.com/pl/pl/p/ikea-365-etykieta-20438547/
|
||||
https://www.ikea.com/pl/pl/p/bevara-klips-do-torebek-zestaw-26-szt-rozne-kolory-00524174/
|
||||
https://www.ikea.com/pl/pl/p/fladdrig-torba-na-lunch-wzor-szary-10497212/
|
||||
https://www.ikea.com/pl/pl/p/istad-torebka-strunowa-wzor-niebieski-00525654/
|
||||
https://www.ikea.com/pl/pl/p/kustfyr-torebka-strunowa-wzor-w-koty-szary-90599607/
|
||||
https://www.ikea.com/pl/pl/p/oevermaett-oslona-na-zywnosc-zest-2-szt-silikon-30497923/
|
||||
https://www.ikea.com/pl/pl/p/filfisk-3szt-woreczki-strunowe-wielobarwny-silikon-70514628/
|
||||
https://www.ikea.com/pl/pl/p/oevermaett-przykrycie-jedzenia-silikon-40497932/
|
||||
https://www.ikea.com/pl/pl/p/koessebaer-etykiety-zestaw-25-sztuk-bialy-90599264/
|
||||
https://www.ikea.com/pl/pl/p/koessebaer-stojak-do-suszenia-zywn-z-2-tacami-bambus-80599274/
|
||||
https://www.ikea.com/pl/pl/p/gullrismott-torebka-do-przechowywania-zywnosci-na-ziemniaki-00581796/
|
||||
https://www.ikea.com/pl/pl/p/hejne-3-sekcje-polki-miekkie-dr-s99031408/
|
||||
|
||||
Binary file not shown.
Binary file not shown.
BIN
Парсер_IKEA/~$leaf_categories.xlsx
Normal file
BIN
Парсер_IKEA/~$leaf_categories.xlsx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user