drogon/core/scrapping_utils.py

130 lines
5.3 KiB
Python
Raw Normal View History

# -*- coding: utf-8 -*-
import requests
2020-04-26 15:05:40 +02:00
import random
from typing import Union
2020-04-26 15:06:04 +02:00
def headers_random_generator(host: str = "www.idealista.com") -> dict:
    """
    Generates a random set of headers for requests.

    All headers are fixed except for the user agent, which is drawn at random
    from a list of browser user agent strings on every call. A new dict is
    built on each call, so callers are free to modify the result.

    :param host: value for the "Host" header. Defaults to the site the headers
        were originally written for.
    :return: a dict with the selected headers.
    """
    potential_user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0",
        "Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
    ]

    # The user agent is the only header that varies between calls, so the
    # dict is built in one go with the randomly chosen value already in place.
    return {
        "Upgrade-Insecure-Requests": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Host": host,
        "DNT": "1",
        "TE": "Trailers",
        "user-agent": random.choice(potential_user_agents),
    }
class UrlAttack:
    """
    Stores the flow of attempting an HTTP GET request to a certain URL.

    Request headers and an HTTP session are assigned at the class level on
    runtime initialization and shared across instances. Refreshing of these
    attributes takes place with the probability specified below.
    """

    # Headers sent with every request; shared by all instances.
    headers = headers_random_generator()
    # HTTP session (cookies, connection pool) shared by all instances.
    session = requests.Session()
    # Timeout passed to the session's get() call.
    timeout = 20
    # Chance, out of 100, of changing identity after a successful request.
    identity_change_probability = 2

    def __init__(self, url: str) -> None:
        """
        Initialize with required data.

        :param url: URL that will be requested.
        """
        self.url = url
        self.success = None
        self.has_been_attacked = False
        self.response = None

    def attack(self) -> None:
        """
        Execute the request and record the response status. Randomly changes
        identity with a predefined probability, and always changes it when the
        request was not successful.

        After the call, ``success`` is True when a response was received and
        its ``ok`` attribute was truthy, and False otherwise (including when
        the request raised).

        :return: None
        """
        self.has_been_attacked = True
        # Drop the result of any previous attempt so that a failure on this
        # call cannot be mistaken for the outcome of an older one.
        self.response = None
        try:
            self.response = UrlAttack.session.get(
                self.url, headers=self.headers, timeout=self.timeout
            )
            self.success = bool(self.response.ok)
        except Exception:
            # Deliberate best-effort: any failure while requesting is recorded
            # as an unsuccessful attack instead of being propagated.
            self.success = False

        if (
            not self.success
            or random.randrange(0, 100) < UrlAttack.identity_change_probability
        ):
            self._change_identity()

    def _change_identity(self) -> None:
        """
        Changes headers and initializes a new session, dropping old cookies and
        acquiring new ones. Effectively results in a change of identity to the
        target server, from the same IP.

        :return: None
        """
        previous_session = UrlAttack.session
        UrlAttack.headers = headers_random_generator()
        UrlAttack.session = requests.Session()
        # Release the connections held by the session that was just replaced.
        previous_session.close()

    def get_text(self) -> str:
        """
        Return the text of the request response.

        :return: the text of the response.
        :raises AttributeError: if there is no response, i.e. ``response`` is
            still None.
        """
        return self.response.text

    def get_status_code(self) -> Union[int, None]:
        """
        Returns the status code of the response, if there is one.

        :return: the status code, if there is one; None otherwise.
        """
        try:
            return self.response.status_code
        except AttributeError:
            # No response object is available (``response`` is None).
            return None