ZH
This commit is contained in:
parent
7d33842690
commit
944844aa4c
Binary file not shown.
Binary file not shown.
@ -0,0 +1,71 @@
|
|||||||
|
"""Scrape the top-level navigation category links from zarahome.com.

Drives a Chrome browser to the Zara Home PL/EN landing page, dismisses the
cookie banner, opens the burger menu if present, then parses the rendered
navigation links with BeautifulSoup and writes (URL, Category) pairs to an
Excel file.
"""

from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_URL = "https://www.zarahome.com"
START_URL = f"{BASE_URL}/pl/en/"
TIMEOUT = 30  # seconds for explicit WebDriverWait waits
OUT_PATH = r"/Users/va1is/src_2024-09-05categories.xlsx"  # Excel output file

opt = Options()
# opt.add_argument("--headless=new")  # uncomment to run without a visible window
opt.add_argument("--window-size=1920,1080")
opt.add_argument("--disable-gpu")
# Reduce the automation fingerprint so the site serves the normal page.
opt.add_argument("--disable-blink-features=AutomationControlled")
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option("useAutomationExtension", False)

driver = webdriver.Chrome(options=opt)
wait = WebDriverWait(driver, TIMEOUT)

try:
    driver.get(START_URL)

    # Cookie banner: accept it if one appears (English or Polish label).
    # Best-effort — if the banner is absent or the click fails, continue.
    try:
        wait.until(EC.element_to_be_clickable((
            By.XPATH, "//button[contains(.,'Accept') or contains(.,'Akcept')]"))
        ).click()
    except Exception:
        pass

    # Burger menu: open it if the layout uses one. Best-effort as above.
    try:
        wait.until(EC.element_to_be_clickable((
            By.CSS_SELECTOR,
            "button[aria-label='Menu'], button[data-testid='menu-button']"))
        ).click()
    except Exception:
        pass

    # Wait until menu entries with a non-empty label are present in the DOM.
    wait.until(EC.presence_of_element_located((
        By.XPATH, "//nav//ul//a[normalize-space(text())!='']")))

    html = driver.page_source
finally:
    driver.quit()  # always release the browser, even if a wait timed out

# ── parsing
soup = BeautifulSoup(html, "lxml")
links = soup.select("nav ul a[href]")  # any href, not only https
print("Всего найдено ссылок в DOM:", len(links))

# Deduplicate via a set of (url, name) tuples.
records = set()
for a in links:
    name = a.get_text(strip=True)
    href = a["href"]
    # Skip unlabeled entries and javascript: pseudo-links.
    if not name or href.startswith("javascript"):
        continue
    # /pl/en/... -> https://www.zarahome.com/pl/en/...
    full_url = urljoin(BASE_URL, href)
    records.add((full_url, name))

print("После фильтрации уникальных:", len(records))

df = pd.DataFrame(sorted(records), columns=["URL", "Category"])
df.to_excel(OUT_PATH, index=False)
# Report the actual output path (the original message claimed "categories.xlsx").
print(f"✔ Собрано {len(df)} ссылок → {OUT_PATH}")
|
||||||
Loading…
Reference in New Issue
Block a user