diff --git a/core/scrapping_utils.py b/core/scrapping_utils.py index 69a20c8..d0754f4 100644 --- a/core/scrapping_utils.py +++ b/core/scrapping_utils.py @@ -1,39 +1,74 @@ # -*- coding: utf-8 -*- import requests +import random + +def headers_random_generator(): + + base_headers = { + "Upgrade-Insecure-Requests": "1", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Encoding": "gzip, deflate, br", + "Accept-Language": "en-US,en;q=0.5", + "Connection": "keep-alive", + "Host": "www.idealista.com", + "DNT": "1", + "TE": "Trailers", + "user-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0", + } + + potential_user_agents = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0", + "Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"] + + random_index = random.randint(0, len(potential_user_agents) - 1) + + random_headers = base_headers + random_headers["user-agent"] = potential_user_agents[random_index] + + return random_headers -class UrlAttack(): +class UrlAttack: - headers = {'Upgrade-Insecure-Requests': "1", - 'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - 'Accept-Encoding': "gzip, deflate, br", - 'Accept-Language': "en-US,en;q=0.5", - 'Connection': 'keep-alive', - 'Host': 'www.idealista.com', - "DNT": "1", - "TE": "Trailers", - 'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0'} + headers = headers_random_generator() timeout = 20 - + def __init__(self, url): self.url = url self.success = None self.has_been_attacked = False - + def attack(self): self.has_been_attacked = True try: - self.response = requests.get(self.url, headers = self.headers, - timeout = self.timeout) + self.response = requests.get( + self.url, headers=self.headers, timeout=self.timeout + ) if self.response.ok: self.success = True except Exception: self.success = False - + def get_response(self): return self.response - + def get_text(self): return self.response.text @@ -41,4 +76,4 @@ class UrlAttack(): try: return self.response.status_code except AttributeError: - return None \ No newline at end of file + return None