70 lines
2.7 KiB
Python
70 lines
2.7 KiB
Python
from json import load
|
||
from time import sleep
|
||
import cloudscraper
|
||
from os.path import abspath
|
||
|
||
# Parser class that bypasses Cloudflare protection
|
||
class Parser:
    """HTTP fetcher built on a cloudscraper session with throttling and retries.

    The session comes from ``cloudscraper.create_scraper()`` (used to get past
    Cloudflare protection).  Every call to :meth:`parse` waits a fixed delay,
    then tries the request up to a configured number of times.
    """

    def __init__(self, json_data):
        """Configure the parser from a settings mapping.

        Args:
            json_data: Mapping with the keys ``"proxy"`` (proxy URL; an empty
                or null value disables the proxy), ``"request_delay"``
                (seconds slept before every request), ``"request_repeats"``
                (maximum number of attempts per URL) and
                ``"request_repeat_delay"`` (seconds slept between attempts).

        Raises:
            KeyError: If one of the keys above is missing.
        """
        proxy = json_data["proxy"]
        # Any falsy value (""/None) means "no proxy".  Comparing against ""
        # only would turn a JSON null into the bogus proxy URL "None".
        self.proxies = {"http": str(proxy), "https": str(proxy)} if proxy else None

        self.request_delay = json_data["request_delay"]
        self.request_repeats = json_data["request_repeats"]
        self.request_repeat_delay = json_data["request_repeat_delay"]

        # Scraper session that bypasses Cloudflare protection.
        self.scraper = cloudscraper.create_scraper()
        if self.proxies:
            self.scraper.proxies.update(self.proxies)

        # Browser-like headers sent with every request made by parse().
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "pl-PL,pl;q=0.9,en-US;q=0.8,en;q=0.7",
            "Connection": "keep-alive",
            "Referer": "https://www.google.com/",
            "DNT": "1",
        }

    def parse(self, url, method="GET", return_type="text"):
        """Fetch ``url`` and return its body, retrying on failure.

        Sleeps ``request_delay`` seconds first, then makes up to
        ``request_repeats`` attempts, sleeping ``request_repeat_delay``
        seconds between consecutive attempts.

        Args:
            url: Address to request.
            method: ``"GET"`` (case-insensitive) issues a GET request; any
                other value issues a POST request.
            return_type: ``"text"`` returns ``response.text``; any other
                value returns ``response.json()``.

        Returns:
            The response body on HTTP 200, or ``None`` when every attempt
            failed or the server answered 404 (which is not retried).

        Raises:
            Whatever ``response.json()`` raises for an undecodable body; that
            call is outside the request ``try`` and is not caught here.
        """
        sleep(self.request_delay)

        if str(method).upper() == "GET":
            send = self.scraper.get
        else:
            send = self.scraper.post

        for attempt in range(1, self.request_repeats + 1):
            try:
                response = send(url, headers=self.headers)
            except Exception as error:
                # Best-effort fetch: report the failure and fall through to
                # the shared back-off below instead of retrying immediately.
                print(f"Request Error: {error} - {url}")
            else:
                if response.status_code == 200:
                    if return_type == "text":
                        return response.text
                    return response.json()

                print(f"bad response, status code -> {response.status_code} - {url}")
                if response.status_code == 404:
                    # The page does not exist; retrying cannot help.
                    break

            # Back off only when another attempt will actually follow.
            if attempt < self.request_repeats:
                sleep(self.request_repeat_delay)

        return None
|
||
|
||
# Build a Parser object configured from request_settings.json
|
||
def get_parser():
    """Create a ``Parser`` from the settings stored in request_settings.json.

    The file name is resolved with ``abspath``, i.e. relative to the current
    working directory, and is read as UTF-8 JSON.

    Returns:
        A ``Parser`` initialised with the decoded settings.
    """
    settings_path = abspath("request_settings.json")
    with open(settings_path, "r", encoding="utf-8") as settings_file:
        settings = load(settings_file)
    return Parser(settings)
|