ZH

2025-10-13 10:27:04 +03:00 · 2025-10-13 10:27:04 +03:00 · 944844aa4c
commit 944844aa4c
parent 7d33842690
4 changed files with 71 additions and 0 deletions
--- a/ZARAHOME\src_2024-09-05categories.xlsx
+++ b/ZARAHOME\src_2024-09-05categories.xlsx
--- a/ZARAHOME/src/categories.xlsx
+++ b/ZARAHOME/src/categories.xlsx
--- a/ZARAHOME/src/extractor
+++ b/ZARAHOME/src/extractor
@@ -0,0 +1,71 @@
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
 from bs4 import BeautifulSoup
 import pandas as pd
 from urllib.parse import urljoin
 # Site root, the landing page the scrape starts from, and the explicit-wait
 # timeout (seconds) handed to WebDriverWait below.
 BASE_URL = "https://www.zarahome.com"
 START_URL = BASE_URL + "/pl/en/"
 TIMEOUT = 30

 # Chrome launch configuration. "--headless=new" is not passed, so a visible
 # browser window opens. The AutomationControlled / enable-automation settings
 # presumably make the session look less like an automated one (assumption,
 # confirm against the site's behaviour).
 opt = Options()
 for chrome_flag in (
    "--window-size=1920,1080",
    "--disable-gpu",
    "--disable-blink-features=AutomationControlled",
 ):
    opt.add_argument(chrome_flag)
 for option_name, option_value in (
    ("excludeSwitches", ["enable-automation"]),
    ("useAutomationExtension", False),
 ):
    opt.add_experimental_option(option_name, option_value)

 # One browser session and one shared explicit wait for the whole script.
 driver = webdriver.Chrome(options=opt)
 wait = WebDriverWait(driver, TIMEOUT)
 def _click_if_present(locator, label):
    """Click the element matched by *locator* once it becomes clickable.

    The element is optional, so this is best-effort: any failure (e.g. the
    wait timing out because the element never became clickable) is reported
    on stdout and the scrape continues, instead of being swallowed silently.

    locator -- (By strategy, selector) tuple passed to element_to_be_clickable.
    label   -- short human-readable name of the step, used in the skip notice.
    """
    try:
        wait.until(EC.element_to_be_clickable(locator)).click()
    except Exception as exc:  # deliberate best-effort: never abort the scrape here
        print(f"[skip] {label}: {type(exc).__name__}")


 try:
    driver.get(START_URL)
    # Dismiss the cookie banner if one is shown (English or Polish button text).
    _click_if_present(
        (By.XPATH, "//button[contains(.,'Accept') or contains(.,'Akcept')]"),
        "cookie banner",
    )
    # Open the burger menu (if there is one).
    _click_if_present(
        (By.CSS_SELECTOR,
         "button[aria-label='Menu'], button[data-testid='menu-button']"),
        "burger menu",
    )
    # Wait for menu items: at least one navigation link with non-empty text
    # must be present in the DOM before the page source is captured.
    wait.until(EC.presence_of_element_located((
        By.XPATH, "//nav//ul//a[normalize-space(text())!='']")))
    html = driver.page_source
 finally:
    # Always close the browser, even when loading or waiting fails.
    driver.quit()
 # ── parsing
 # Destination workbook. NOTE(review): this is an absolute path on one user's
 # machine; adjust it before running the script anywhere else.
 OUTPUT_PATH = r"/Users/va1is/src_2024-09-05categories.xlsx"

 soup = BeautifulSoup(html, "lxml")
 links = soup.select("nav ul a[href]")  # any href, not only https ones
 print("Всего найдено ссылок в DOM:", len(links))

 # (absolute URL, link text) pairs; a set so repeated menu entries collapse.
 records = set()
 for a in links:
    name = a.get_text(strip=True)
    href = a["href"]
    # Skip links without visible text and javascript: pseudo-links.
    if not name or href.startswith("javascript"):
        continue
    # /pl/en/... → https://www.zarahome.com/pl/en/...
    records.add((urljoin(BASE_URL, href), name))
 print("После фильтрации уникальных:", len(records))

 # Sorted for a stable row order in the exported sheet.
 df = pd.DataFrame(sorted(records), columns=["URL", "Category"])
 df.to_excel(OUTPUT_PATH, index=False)
 # Report the real destination; the old message named a different file.
 print(f"✔ Собрано {len(df)} ссылок → {OUTPUT_PATH}")
--- a/ZARAHOME/src/extractor
+++ b/ZARAHOME/src/extractor