# MacOS_Parsers/Парсер_IKEA/ikea_collect_product_links Grok.py

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import os
import time
from lxml import etree

# Directory that contains this script; both spreadsheets are resolved against it.
script_dir = os.path.split(os.path.abspath(__file__))[0]

# Input spreadsheet (category URLs) and output spreadsheet (collected product links).
input_file, output_file = (
    os.path.join(script_dir, file_name)
    for file_name in ('leaf_categories.xlsx', 'all_liks.xlsx')
)

# Read the category URLs from the 'url' column of the input spreadsheet.
# Any failure (missing file, unreadable workbook, missing column) is reported
# and the script terminates with exit status 1.
try:
    df_categories = pd.read_excel(input_file)
    category_urls = df_categories['url'].tolist()
    print(f"Найдено {len(category_urls)} категорий.")
except Exception as e:
    print(f"Ошибка при чтении файла {input_file}: {e}")
    # Raise SystemExit directly: the bare `exit()` helper is injected by the
    # `site` module for interactive use and is not guaranteed to exist
    # (e.g. under `python -S` or in frozen builds).
    raise SystemExit(1) from e

# Accumulates every product link found across all categories (duplicates included).
all_product_links = []

# Configure Selenium: headless Chrome with a fixed desktop user-agent string.
chrome_options = Options()
for chrome_flag in (
    "--headless",  # No graphical interface
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
):
    chrome_options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=chrome_options)

# Process every category: request page after page of the listing, collect the
# product links, and stop once the listing's <progress> element reports that
# all products are shown. The try/finally guarantees the browser is closed
# even when an unexpected exception escapes the loop.
try:
    for category_url in category_urls:
        # Skip nan or empty values
        if not isinstance(category_url, str) or not category_url:
            print(f"Пропущена некорректная категория: {category_url}")
            continue

        print(f"Обрабатываем категорию: {category_url}")
        page = 1
        while True:
            # Build the URL with the page parameter and the anchor
            url = f"{category_url.rstrip('/')}?page={page}#products-page-{page}"

            # Load the page
            try:
                driver.get(url)
                # NOTE: despite the label, the value printed here is
                # document.readyState, not an HTTP status code.
                print(f"HTTP статус: {driver.execute_script('return document.readyState')}")
                time.sleep(2)  # Wait for JavaScript to load
            except Exception as e:
                print(f"Ошибка загрузки {url}: {e}")
                break

            # Take one snapshot of the DOM so that both parsers below work on
            # exactly the same markup (and the driver is queried only once).
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            # Collect the product links
            product_links = [
                a['href']
                for a in soup.select('div.plp-fragment-wrapper a[href^="https://www.ikea.com/pl/pl/p/"]')
            ]
            all_product_links.extend(product_links)
            print(f"Обработка страницы {url}: найдено {len(product_links)} ссылок.")

            # Locate the <progress> element by XPath
            tree = etree.fromstring(page_source, etree.HTMLParser())
            progress_elements = tree.xpath('//*[@id="product-list"]/div[3]/progress')

            if not progress_elements:
                print("Элемент <progress> по XPath //*[@id=\"product-list\"]/div[3]/progress не найден.")
                print("HTML фрагмент контейнера:", soup.find('div', class_='plp-catalog-bottom-container'))
                print("Завершаем категорию.")
                break

            progress = progress_elements[0]
            progress_html = etree.tostring(progress, encoding='unicode', method='html')
            print(f"Элемент <progress>: {progress_html}")

            try:
                # lxml's .get() returns None for a missing attribute, so
                # int() raises TypeError in that case; ValueError covers
                # non-numeric attribute text.
                value = int(progress.get('value'))
                max_value = int(progress.get('max'))
            except (KeyError, TypeError, ValueError) as e:
                print(f"Ошибка при извлечении value/max: {e}. HTML фрагмент:", progress_html)
                break
            print(f"Прогресс: {value} из {max_value}")

            # Check the stop condition
            if value >= max_value:
                print("Все товары загружены. Завершаем категорию.")
                break

            # Pause after processing the page
            time.sleep(1)

            page += 1

        print("Завершена обработка категории.")
finally:
    # Close the browser
    driver.quit()

# Remove duplicates and save the result.
# dict.fromkeys keeps the first-seen order of the links, so the output file is
# reproducible between runs (set() ordering of strings varies per process).
unique_links = list(dict.fromkeys(all_product_links))
df = pd.DataFrame({'url': unique_links})
df.to_excel(output_file, index=False)
print(f"Собрано уникальных ссылок: {len(unique_links)}")