# -*- coding: utf-8 -*-
import random
from typing import Union

import requests

# Headers sent with every request; only "user-agent" is randomised per identity.
_BASE_HEADERS = {
    "Upgrade-Insecure-Requests": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "Host": "www.idealista.com",
    "DNT": "1",
    "TE": "Trailers",
    "user-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0",
}

# Pool of user-agent strings from which one is drawn for each new identity.
_USER_AGENTS = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
)


def headers_random_generator() -> dict:
    """
    Generates a random set of headers for requests.

    The fixed base headers are copied into a new dict on every call and the
    "user-agent" entry is replaced with one drawn uniformly from the pool.
    :return: a dict with the selected headers.
    """
    return {**_BASE_HEADERS, "user-agent": random.choice(_USER_AGENTS)}


class UrlAttack:
    """
    Stores the flow of attempting an HTTP GET request to a certain URL.

    Request headers and an HTTP session are assigned at the class level on
    runtime initialization and shared across instances. Refreshing of these
    attributes takes place with the probability specified below.
    """

    # Identity (headers + session) shared by every instance.
    headers = headers_random_generator()
    session = requests.Session()
    # Passed as the ``timeout`` argument of every GET request.
    timeout = 20
    # Percentage (compared against a draw from 0..99) of attacks that trigger
    # an identity change even when the request succeeded.
    identity_change_probability = 2

    def __init__(self, url: str) -> None:
        """
        Initialize with required data.

        :param url: URL that will be requested.
        """
        self.url = url
        self.success = None  # None until attack() has been called
        self.has_been_attacked = False
        self.response = None

    def attack(self) -> None:
        """
        Execute the request and record the response status.

        ``success`` is set to True when the response status is OK, and to
        False when the status is an error or the request raised. Changes
        identity after every failure, and randomly with a predefined
        probability after a success.
        :return: None
        """
        self.has_been_attacked = True
        # Forget any response from a previous call so that a failed retry
        # cannot expose stale text or status code.
        self.response = None
        try:
            self.response = UrlAttack.session.get(
                self.url, headers=self.headers, timeout=self.timeout
            )
        except Exception:
            # Deliberately broad: any failure to fetch counts as an
            # unsuccessful attack instead of propagating to the caller.
            self.success = False
        else:
            # Record error statuses explicitly as False rather than leaving
            # the previous value (None or a stale True) in place.
            self.success = bool(self.response.ok)

        random_refresh = (
            random.randrange(0, 100) < UrlAttack.identity_change_probability
        )
        if not self.success or random_refresh:
            self._change_identity()

    def _change_identity(self) -> None:
        """
        Changes headers and initializes a new session, dropping old cookies
        and acquiring new ones. Effectively results in a change of identity to
        the target server, from the same IP.

        The replaced session is closed so its pooled connections are released.
        :return: None
        """
        stale_session = UrlAttack.session
        UrlAttack.headers = headers_random_generator()
        UrlAttack.session = requests.Session()
        stale_session.close()

    def get_text(self) -> str:
        """
        Return the text of the request response.

        :return: the text of the response
        :raises AttributeError: if there is no response (no attack was made
            yet, or the last request raised).
        """
        return self.response.text

    def get_status_code(self) -> Union[int, None]:
        """
        Returns the status code of the response, if there is one.

        :return: the status code, or None if there is no response.
        """
        return getattr(self.response, "status_code", None)