This commit is contained in:
va1is 2025-10-13 10:27:04 +03:00
parent 7d33842690
commit 944844aa4c
4 changed files with 71 additions and 0 deletions

Binary file not shown.

View File

@ -0,0 +1,71 @@
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Site root, and the landing page the browser opens first.
BASE_URL = "https://www.zarahome.com"
START_URL = BASE_URL + "/pl/en/"

# Upper bound, in seconds, handed to WebDriverWait for every explicit wait.
TIMEOUT = 30

# Chrome launch configuration.
# NOTE: "--headless=new" is not among the flags below, so a visible browser
# window is opened when the script runs.
opt = Options()
for _flag in (
    "--window-size=1920,1080",
    "--disable-gpu",
    # NOTE(review): this flag and the two experimental options below look
    # like the usual settings for hiding automation markers — confirm.
    "--disable-blink-features=AutomationControlled",
):
    opt.add_argument(_flag)
for _option_name, _option_value in (
    ("excludeSwitches", ["enable-automation"]),
    ("useAutomationExtension", False),
):
    opt.add_experimental_option(_option_name, _option_value)

# One Chrome session and one shared explicit-wait helper for the whole run.
driver = webdriver.Chrome(options=opt)
wait = WebDriverWait(driver, TIMEOUT)
def _click_if_present(locator):
    """Click the element matched by *locator* once it becomes clickable.

    Best-effort helper for page elements that may or may not be shown
    (cookie banner, burger-menu button): a missing or unclickable element is
    not an error for the script.

    Args:
        locator: ``(By.<strategy>, selector)`` tuple handed to
            ``EC.element_to_be_clickable``.

    Returns:
        True if the element was clicked; False if the shared ``wait`` timed
        out or Selenium reported a failure while clicking.
    """
    try:
        wait.until(EC.element_to_be_clickable(locator)).click()
    except WebDriverException:
        # Only Selenium failures are tolerated (TimeoutException is a
        # WebDriverException subclass); programming errors such as a
        # NameError or TypeError still propagate instead of being hidden.
        return False
    return True


try:
    driver.get(START_URL)

    # Cookie-consent banner: accept it if one is displayed.
    _click_if_present(
        (By.XPATH, "//button[contains(.,'Accept') or contains(.,'Akcept')]")
    )

    # Expand the burger menu (if the page has one).
    _click_if_present(
        (
            By.CSS_SELECTOR,
            "button[aria-label='Menu'], button[data-testid='menu-button']",
        )
    )

    # Wait until at least one navigation link with non-empty text is in the
    # DOM. Unlike the optional clicks above, a timeout here is fatal: the
    # exception propagates and no HTML is captured.
    wait.until(
        EC.presence_of_element_located(
            (By.XPATH, "//nav//ul//a[normalize-space(text())!='']")
        )
    )
    html = driver.page_source
finally:
    # Always shut the browser down, whether or not the scrape succeeded.
    driver.quit()
# Destination workbook. Kept in one place so that the file actually written
# and the path reported in the final message cannot drift apart.
OUTPUT_PATH = r"/Users/va1is/src_2024-09-05categories.xlsx"

# ── Parsing
soup = BeautifulSoup(html, "lxml")
links = soup.select("nav ul a[href]")  # any href, not only absolute https ones
print("Всего найдено ссылок в DOM:", len(links))

# A set of (absolute URL, link text) pairs removes duplicate menu entries.
records = set()
for a in links:
    name = a.get_text(strip=True)
    href = a["href"].strip()
    # Skip links without visible text, blank hrefs, and javascript:
    # pseudo-links (matched case-insensitively).
    if not name or not href or href.lower().startswith("javascript"):
        continue
    # Resolve site-relative hrefs against the site root,
    # e.g. /pl/en/... -> https://www.zarahome.com/pl/en/...
    full_url = urljoin(BASE_URL, href)
    records.add((full_url, name))
print("После фильтрации уникальных:", len(records))

# Sorted so the rows come out in a stable order (by URL, then by name).
df = pd.DataFrame(sorted(records), columns=["URL", "Category"])
df.to_excel(OUTPUT_PATH, index=False)
print(f"✔ Собрано {len(df)} ссылок → {OUTPUT_PATH}")