from json import load
from time import sleep
from os.path import abspath

import cloudscraper


# Parser class that uses cloudscraper to bypass Cloudflare protection
class Parser:
    def __init__(self, json_data):
        # Use the same proxy for HTTP and HTTPS; disable proxying when the field is empty
        self.proxies = {
            "http": json_data["proxy"],
            "https": json_data["proxy"]
        } if json_data["proxy"] != "" else None
        self.request_delay = json_data["request_delay"]
        self.request_repeats = json_data["request_repeats"]
        self.request_repeat_delay = json_data["request_repeat_delay"]

        # Initialize the scraper with Cloudflare bypass
        self.scraper = cloudscraper.create_scraper()
        if self.proxies:
            self.scraper.proxies.update(self.proxies)

        # Browser-like headers to reduce the chance of being blocked
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "pl-PL,pl;q=0.9,en-US;q=0.8,en;q=0.7",
            "Connection": "keep-alive",
            "Referer": "https://www.google.com/",
            "DNT": "1"
        }

    # Fetch a URL with retries; returns the response body as text or JSON,
    # or None once all attempts are exhausted
    def parse(self, url, method="GET", return_type="text"):
        sleep(self.request_delay)
        for _ in range(self.request_repeats):
            try:
                if method == "GET":
                    response = self.scraper.get(url, headers=self.headers)
                else:
                    response = self.scraper.post(url, headers=self.headers)
            except Exception as error:
                print(f"Request Error: {error} - {url}")
                sleep(self.request_repeat_delay)  # wait before retrying after a network error
                continue

            if response.status_code == 200:
                if return_type == "text":
                    return response.text
                else:
                    return response.json()

            print(f"bad response, status code -> {response.status_code} - {url}")
            if response.status_code == 404:
                break  # the page does not exist; retrying will not help
            sleep(self.request_repeat_delay)
        return None


# Build a Parser object configured from request_settings.json
def get_parser():
    with open(abspath("request_settings.json"), "r", encoding="utf-8") as file:
        return Parser(load(file))
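

# --- Usage sketch ---
# A minimal example, assuming a request_settings.json shaped like the keys
# read in __init__ above. The field values and the target URL below are
# hypothetical placeholders, not part of the original module.
#
# request_settings.json:
# {
#     "proxy": "",
#     "request_delay": 1,
#     "request_repeats": 3,
#     "request_repeat_delay": 5
# }

if __name__ == "__main__":
    parser = get_parser()
    # Fetch a page as text; parse() returns None after exhausting retries
    html = parser.parse("https://example.com/", method="GET", return_type="text")
    if html is not None:
        print(html[:200])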