Typing, docstrings, formatting for scrapping_utils.py

This commit is contained in:
pablo 2020-11-03 07:43:21 +01:00
parent a79fc533ee
commit e9ee23f852

View file

@ -1,10 +1,14 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import requests import requests
import random import random
import pickle from typing import Union
def headers_random_generator(): def headers_random_generator() -> dict:
"""
Generates a random set of headers for requests.
:return: a dict with the selected headers.
"""
base_headers = { base_headers = {
"Upgrade-Insecure-Requests": "1", "Upgrade-Insecure-Requests": "1",
@ -49,6 +53,12 @@ def headers_random_generator():
class UrlAttack: class UrlAttack:
"""
Stores the flow of attempting an HTTP GET request to a certain URL.
Request headers and an HTTP session are assigned at the class level on
runtime initialization and shared across instances. Refreshing of these
attributes takes place with the probability specified below.
"""
headers = headers_random_generator() headers = headers_random_generator()
@ -56,39 +66,60 @@ class UrlAttack:
timeout = 20 timeout = 20
def __init__(self, url): identity_change_probability = 2
def __init__(self, url: str) -> None:
"""
Initialize with required data.
:param url: URL that will be requested.
"""
self.url = url self.url = url
self.success = None self.success = None
self.has_been_attacked = False self.has_been_attacked = False
self.response = None
def attack(self): def attack(self) -> None:
"""
Execute the request and record the response status. Randomly changes
identity with a predefined probability.
:return: None
"""
self.has_been_attacked = True self.has_been_attacked = True
try: try:
self.response = UrlAttack.session.get( self.response = UrlAttack.session.get(
self.url, headers=self.headers, timeout=self.timeout self.url, headers=self.headers, timeout=self.timeout
) )
with open(f"request.pickle", "wb") as output_file:
pickle.dump(self.response.request, output_file)
if self.response.ok: if self.response.ok:
self.success = True self.success = True
except Exception as e: except Exception as e:
self.success = False self.success = False
if random.randrange(0, 100) < 2: if random.randrange(0, 100) < UrlAttack.identity_change_probability:
self.change_identity() self._change_identity()
def change_identity(self): def _change_identity(self) -> None:
"""
Changes headers and initializes a new session, dropping old cookies and
acquiring new ones. Effectively results in a change of identity to the
target server, from the same IP.
:return: None
"""
UrlAttack.headers = headers_random_generator() UrlAttack.headers = headers_random_generator()
UrlAttack.session = requests.Session() UrlAttack.session = requests.Session()
def get_response(self): def get_text(self) -> str:
return self.response """
Return the text of the request response.
def get_text(self): :return: the text of the response
"""
return self.response.text return self.response.text
def get_status_code(self): def get_status_code(self) -> Union[int, None]:
"""
Returns the status code of the response, if there is one.
:return: the status code, if there is one.
"""
try: try:
return self.response.status_code return self.response.status_code
except AttributeError: except AttributeError: