# -*- coding: utf-8 -*-
# Source path: drogon/core/scrapping_utils.py
import requests
2020-04-26 15:05:40 +02:00
import random
2020-11-02 12:02:56 +01:00
import pickle
2020-04-26 15:06:04 +02:00
2020-04-26 15:05:40 +02:00
def headers_random_generator(host="www.idealista.com"):
    """Build browser-like HTTP request headers with a random User-Agent.

    A fresh dictionary is created on every call, so callers may mutate the
    result without affecting later calls.

    Args:
        host: Value placed in the ``Host`` header. Defaults to
            ``"www.idealista.com"``.

    Returns:
        dict: Header names mapped to header values. The ``"user-agent"``
        entry is picked at random from a fixed pool of browser User-Agent
        strings; every other entry is constant.
    """
    potential_user_agents = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0",
        "Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
    )

    # The dict literal is rebuilt on each call, so no state is shared between
    # the dictionaries handed out to different callers.
    return {
        "Upgrade-Insecure-Requests": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Host": host,
        "DNT": "1",
        "TE": "Trailers",
        "user-agent": random.choice(potential_user_agents),
    }
class UrlAttack:
    """Fetch one URL using headers and an HTTP session shared by all instances.

    The ``headers`` and ``session`` class attributes form the current
    "identity". They are shared by every instance and are replaced together
    by :meth:`change_identity`.

    Instance attributes:
        url: The URL to request.
        has_been_attacked: True once :meth:`attack` has been called.
        response: The response of the latest attack, or None when no attack
            has run yet or the latest request raised.
        success: True when the latest response was OK, False when the latest
            request raised, None when no attack has run yet or the response
            arrived with a non-OK status.
    """

    headers = headers_random_generator()
    session = requests.Session()
    # Passed as ``timeout`` to every request (seconds, per the requests API).
    timeout = 20

    def __init__(self, url):
        """Store the target URL; no request is made until :meth:`attack`."""
        self.url = url
        self.success = None
        self.has_been_attacked = False
        # Defined up front so the accessors work before the first attack.
        self.response = None

    def attack(self):
        """Request ``self.url`` once and record the outcome on the instance.

        Exceptions raised by the request are not propagated; they are
        reported by setting ``success`` to False. After each call there is a
        2-in-100 chance that the shared identity is replaced.
        """
        self.has_been_attacked = True
        # Reset first so a failed re-attack never reports the state left
        # behind by a previous attempt.
        self.response = None
        self.success = None
        try:
            self.response = UrlAttack.session.get(
                self.url, headers=self.headers, timeout=self.timeout
            )
            if self.response.ok:
                self.success = True
        except Exception:
            # Deliberately broad: any failure while requesting is reported
            # through ``success`` instead of interrupting the caller.
            self.success = False
        else:
            self._dump_request()

        if random.randrange(0, 100) < 2:
            self.change_identity()

    def _dump_request(self):
        """Pickle the request behind the latest response to ``request.pickle``.

        The file is opened relative to the current working directory and is
        overwritten on every call. This is best effort (presumably a
        debugging aid): a failure to write or pickle is ignored so that it
        cannot turn a completed request into a failed attack.
        """
        try:
            with open("request.pickle", "wb") as output_file:
                pickle.dump(self.response.request, output_file)
        except (OSError, pickle.PicklingError, TypeError, AttributeError):
            # Intentionally ignored; see the docstring.
            pass

    def change_identity(self):
        """Replace the shared headers and session used by every instance."""
        previous_session = UrlAttack.session
        UrlAttack.headers = headers_random_generator()
        UrlAttack.session = requests.Session()
        # Close the replaced session so its pooled connections are released.
        previous_session.close()

    def get_response(self):
        """Return the latest response, or None if there is none."""
        return self.response

    def get_text(self):
        """Return the body text of the latest response.

        Raises:
            AttributeError: If there is no response to read from.
        """
        return self.response.text

    def get_status_code(self):
        """Return the status code of the latest response, or None without one."""
        response = getattr(self, "response", None)
        return getattr(response, "status_code", None)