"""Standalone category-link extractor for zarahome.com (macOS variant).

Opens the ZARA HOME landing page with Selenium, dismisses the cookie
banner and expands the burger menu when present, then parses the
navigation links out of the rendered DOM with BeautifulSoup and saves
the unique (URL, Category) pairs to an Excel file.
"""
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_URL = "https://www.zarahome.com"
START_URL = f"{BASE_URL}/pl/en/"
TIMEOUT = 30  # seconds WebDriverWait polls before giving up
# Previously hard-coded inline at the to_excel() call; note the final
# success message still says "categories.xlsx" — kept byte-identical.
OUTPUT_PATH = r"/Users/va1is/src_2024-09-05categories.xlsx"


def _make_driver() -> webdriver.Chrome:
    """Build a Chrome driver with automation-detection countermeasures."""
    opt = Options()
    # opt.add_argument("--headless=new")  # enable for headless runs
    opt.add_argument("--window-size=1920,1080")
    opt.add_argument("--disable-gpu")
    opt.add_argument("--disable-blink-features=AutomationControlled")
    opt.add_experimental_option("excludeSwitches", ["enable-automation"])
    opt.add_experimental_option("useAutomationExtension", False)
    return webdriver.Chrome(options=opt)


def _fetch_menu_html() -> str:
    """Load the start page, reveal the nav menu, and return the page HTML.

    The driver is always quit, even when a wait times out.
    """
    driver = _make_driver()
    wait = WebDriverWait(driver, TIMEOUT)
    try:
        driver.get(START_URL)

        # Cookie banner: best-effort — the site may not show one.
        try:
            wait.until(EC.element_to_be_clickable((
                By.XPATH, "//button[contains(.,'Accept') or contains(.,'Akcept')]"))
            ).click()
        except Exception:
            pass

        # Burger menu: best-effort — some layouts expose the nav directly.
        try:
            wait.until(EC.element_to_be_clickable((
                By.CSS_SELECTOR,
                "button[aria-label='Menu'], button[data-testid='menu-button']"))
            ).click()
        except Exception:
            pass

        # Block until at least one non-empty menu link is in the DOM.
        wait.until(EC.presence_of_element_located((
            By.XPATH, "//nav//ul//a[normalize-space(text())!='']")))

        return driver.page_source
    finally:
        driver.quit()


def _extract_records(html: str) -> set:
    """Parse nav links from *html* into unique (full_url, name) pairs.

    Empty-text links and javascript: pseudo-links are skipped; relative
    hrefs are resolved against BASE_URL.
    """
    soup = BeautifulSoup(html, "lxml")
    links = soup.select("nav ul a[href]")  # any href, not only https
    print("Всего найдено ссылок в DOM:", len(links))

    records = set()
    for a in links:
        name = a.get_text(strip=True)
        href = a["href"]
        if not name or href.startswith("javascript"):
            continue
        # /pl/en/... → https://www.zarahome.com/pl/en/...
        records.add((urljoin(BASE_URL, href), name))

    print("После фильтрации уникальных:", len(records))
    return records


def main() -> None:
    """Scrape the category links and write them to OUTPUT_PATH."""
    records = _extract_records(_fetch_menu_html())
    df = pd.DataFrame(sorted(records), columns=["URL", "Category"])
    df.to_excel(OUTPUT_PATH, index=False)
    print(f"✔ Собрано {len(df)} ссылок → categories.xlsx")


if __name__ == "__main__":
    main()