ZH

2025-10-13 10:27:04 +03:00 · 2025-10-13 10:27:04 +03:00 · 944844aa4c
commit 944844aa4c
parent 7d33842690
4 changed files with 71 additions and 0 deletions
--- a/ZARAHOME\src_2024-09-05categories.xlsx
+++ b/ZARAHOME\src_2024-09-05categories.xlsx
--- a/ZARAHOME/src/categories.xlsx
+++ b/ZARAHOME/src/categories.xlsx
--- a/ZARAHOME/src/extractor
+++ b/ZARAHOME/src/extractor
@ -0,0 +1,71 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+from bs4 import BeautifulSoup
+import pandas as pd
+from urllib.parse import urljoin
+# Script: load the Zara Home start page in Chrome, collect the navigation-menu links and save them to an Excel file.
+BASE_URL = "https://www.zarahome.com"   # base used to resolve relative hrefs into absolute URLs
+START_URL = f"{BASE_URL}/pl/en/"        # page opened in the browser
+TIMEOUT   = 30                          # passed to WebDriverWait as its timeout (seconds)
+
+opt = Options()
+#opt.add_argument("--headless=new")  # headless mode is left disabled
+opt.add_argument("--window-size=1920,1080")
+opt.add_argument("--disable-gpu")
+opt.add_argument("--disable-blink-features=AutomationControlled")  # NOTE(review): this and the two options below presumably hide automation markers from the site — confirm
+opt.add_experimental_option("excludeSwitches", ["enable-automation"])
+opt.add_experimental_option("useAutomationExtension", False)
+
+driver = webdriver.Chrome(options=opt)
+wait   = WebDriverWait(driver, TIMEOUT)  # one shared waiter, reused for every wait below
+
+try:  # the finally clause below closes the browser whether or not the steps succeed
+    driver.get(START_URL)
+
+    # cookie banner: click the accept button; any failure (including a timeout) is ignored
+    try:
+        wait.until(EC.element_to_be_clickable((
+            By.XPATH, "//button[contains(.,'Accept') or contains(.,'Akcept')]"))
+        ).click()
+    except Exception:  # NOTE(review): if the button never appears this step blocks for the full TIMEOUT first
+        pass
+
+    # open the burger menu (if present); failures are ignored in the same way
+    try:
+        wait.until(EC.element_to_be_clickable((
+            By.CSS_SELECTOR,
+            "button[aria-label='Menu'], button[data-testid='menu-button']"))
+        ).click()
+    except Exception:
+        pass
+
+    # wait for the menu items; unlike the two steps above, a timeout here is not caught and aborts the script
+    wait.until(EC.presence_of_element_located((
+        By.XPATH, "//nav//ul//a[normalize-space(text())!='']")))
+
+    html = driver.page_source  # page source captured now and parsed after the browser is closed
+finally:
+    driver.quit()
+
+# ── parsing
+soup  = BeautifulSoup(html, "lxml")
+links = soup.select("nav ul a[href]")          # any href, not only https
+print("Всего найдено ссылок в DOM:", len(links))
+
+records = set()  # set of (url, name) tuples, so identical pairs are stored once
+for a in links:
+    name = a.get_text(strip=True)
+    href = a["href"]
+    if not name or href.startswith("javascript"):  # skip links with no text and javascript: pseudo-links
+        continue
+    full_url = urljoin(BASE_URL, href)        # resolve relative hrefs: /pl/en/... → https://www.zarahome.com/pl/en/...
+    records.add((full_url, name))
+
+print("После фильтрации уникальных:", len(records))
+
+df = pd.DataFrame(sorted(records), columns=["URL", "Category"])  # sorted tuples: ordered by URL, then by name
+df.to_excel(r"/Users/va1is/src_2024-09-05categories.xlsx", index=False)  # NOTE(review): hard-coded absolute path — confirm before running on another machine
+print(f"✔ Собрано {len(df)} ссылок → categories.xlsx")  # NOTE(review): message names categories.xlsx, not the file name written above
--- a/ZARAHOME/src/extractor
+++ b/ZARAHOME/src/extractor