ZH
This commit is contained in:
parent
7d33842690
commit
944844aa4c
Binary file not shown.
Binary file not shown.
@ -0,0 +1,71 @@
|
||||
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
|
||||
|
||||
BASE_URL = "https://www.zarahome.com"
START_URL = f"{BASE_URL}/pl/en/"
TIMEOUT = 30  # passed to WebDriverWait as its timeout


def _build_chrome_options():
    """Return the Chrome options used for the scraping session.

    Sets a fixed 1920x1080 window, disables the GPU, and applies the
    automation-related switches (presumably so the session looks less
    like an automated browser to the site -- confirm if this matters).
    """
    options = Options()
    # Headless mode is intentionally off; enable the next line to run
    # without a visible browser window.
    # options.add_argument("--headless=new")
    for flag in (
        "--window-size=1920,1080",
        "--disable-gpu",
        "--disable-blink-features=AutomationControlled",
    ):
        options.add_argument(flag)
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    return options


opt = _build_chrome_options()
driver = webdriver.Chrome(options=opt)
wait = WebDriverWait(driver, TIMEOUT)
|
||||
|
||||
def _click_if_present(waiter, locator, label):
    """Click the element at *locator* once it becomes clickable.

    The element is treated as optional: if it never becomes clickable
    within the waiter's timeout, or the click itself fails, the step is
    reported and skipped instead of aborting the run.

    Args:
        waiter: the WebDriverWait used to poll for the element.
        locator: a ``(By.<strategy>, selector)`` tuple.
        label: short human-readable name of the step, used in the message.

    Returns:
        True if the element was clicked, False if the step was skipped.
    """
    try:
        waiter.until(EC.element_to_be_clickable(locator)).click()
    except WebDriverException as exc:
        # TimeoutException and the click failures (intercepted, stale, ...)
        # all derive from WebDriverException, so the step stays best-effort
        # while unrelated errors (e.g. NameError) are no longer hidden.
        print(f"Необязательный шаг «{label}» пропущен: {type(exc).__name__}")
        return False
    return True


try:
    driver.get(START_URL)

    # Accept the cookie banner (English or Polish button text).
    _click_if_present(
        wait,
        (By.XPATH, "//button[contains(.,'Accept') or contains(.,'Akcept')]"),
        "cookie-баннер",
    )

    # Open the burger menu (if there is one).
    _click_if_present(
        wait,
        (
            By.CSS_SELECTOR,
            "button[aria-label='Menu'], button[data-testid='menu-button']",
        ),
        "меню",
    )

    # Wait for the menu items; unlike the steps above this one is
    # mandatory, so a timeout here propagates.
    wait.until(EC.presence_of_element_located((
        By.XPATH, "//nav//ul//a[normalize-space(text())!='']")))

    html = driver.page_source
finally:
    # Always shut the browser down, even if navigation failed.
    driver.quit()
|
||||
|
||||
# ── parsing
soup = BeautifulSoup(html, "lxml")
links = soup.select("nav ul a[href]")  # any href, not only https
print("Всего найдено ссылок в DOM:", len(links))

# (href, visible text) for every anchor found inside the navigation lists.
_href_and_label = (
    (anchor["href"], anchor.get_text(strip=True)) for anchor in links
)

# Keep only anchors with visible text and a non-"javascript" href; relative
# hrefs are resolved against BASE_URL
# (/pl/en/... → https://www.zarahome.com/pl/en/...).
# A set is used so repeated (URL, name) pairs collapse into one record.
records = {
    (urljoin(BASE_URL, target), label)
    for target, label in _href_and_label
    if label and not target.startswith("javascript")
}

print("После фильтрации уникальных:", len(records))
|
||||
|
||||
# Destination of the exported spreadsheet; named once so the file that is
# written and the file that is reported cannot drift apart.
OUTPUT_PATH = r"/Users/va1is/src_2024-09-05categories.xlsx"

# One row per unique (URL, Category) pair, sorted by URL and then by name.
df = pd.DataFrame(sorted(records), columns=["URL", "Category"])
df.to_excel(OUTPUT_PATH, index=False)
# Report the real destination (the old message always said "categories.xlsx").
print(f"✔ Собрано {len(df)} ссылок → {OUTPUT_PATH}")
|
||||
Loading…
Reference in New Issue
Block a user