Typing, docstrings, formatting for scrapping_utils.py

This commit is contained in:
pablo 2020-11-03 07:43:21 +01:00
parent a79fc533ee
commit e9ee23f852

View file

@@ -1,10 +1,14 @@
# -*- coding: utf-8 -*-
import requests
import random
import pickle
from typing import Union
def headers_random_generator():
def headers_random_generator() -> dict:
"""
Generates a random set of headers for requests.
:return: a dict with the selected headers.
"""
base_headers = {
"Upgrade-Insecure-Requests": "1",
@@ -49,6 +53,12 @@ def headers_random_generator():
class UrlAttack:
"""
Stores the flow of attempting an HTTP GET request to a certain URL.
Request headers and an HTTP session are assigned at the class level on
runtime initialization and shared across instances. Refreshing of these
attributes takes place with the probability specified below.
"""
headers = headers_random_generator()
@@ -56,39 +66,60 @@ class UrlAttack:
timeout = 20
def __init__(self, url):
identity_change_probability = 2
def __init__(self, url: str) -> None:
"""
Initialize with required data.

Stores the target URL and resets the per-instance request state: no
outcome recorded, not yet attacked, no response held.

:param url: URL that will be requested.
"""
self.url = url
# Stays None until attack() records an outcome (True or False).
self.success = None
self.has_been_attacked = False
# Replaced by the object returned from the session's get() in attack().
self.response = None
def attack(self):
def attack(self) -> None:
"""
Execute the request and record the response status. Randomly changes
identity with a predefined probability.

Marks the instance as attacked, performs a GET on ``self.url`` through
``UrlAttack.session`` using ``self.headers`` and ``self.timeout``, and
pickles ``self.response.request`` into ``request.pickle``. ``success``
becomes True when the response is ok and False when any exception is
raised inside the try block.

:return: None
"""
self.has_been_attacked = True
try:
self.response = UrlAttack.session.get(
self.url, headers=self.headers, timeout=self.timeout
)
# NOTE(review): every call overwrites the same relative path, and the
# f-string has no placeholders; this looks like a debugging leftover.
# Confirm it is still needed.
with open(f"request.pickle", "wb") as output_file:
pickle.dump(self.response.request, output_file)
# NOTE(review): there is no else branch, so a non-ok response leaves
# success at its previous value (None right after __init__).
if self.response.ok:
self.success = True
# Any failure in the block above, including writing the pickle file, is
# recorded as an unsuccessful attempt; the exception object is not used.
except Exception as e:
self.success = False
# NOTE(review): the two checks below appear to be the pre-commit and
# post-commit forms of the same statement (diff markers seem to have been
# stripped from this view); confirm which pair is current.
if random.randrange(0, 100) < 2:
self.change_identity()
# Draws an integer in [0, 100) and compares it with the class-level
# threshold, so the threshold acts as a percentage.
if random.randrange(0, 100) < UrlAttack.identity_change_probability:
self._change_identity()
def change_identity(self):
def _change_identity(self) -> None:
"""
Changes headers and initializes a new session, dropping old cookies and
acquiring new ones. Effectively results in a change of identity to the
target server, from the same IP.

:return: None
"""
# Both attributes are assigned on the class rather than on the instance,
# so the new values are what later lookups through UrlAttack will see.
UrlAttack.headers = headers_random_generator()
UrlAttack.session = requests.Session()
def get_response(self):
"""
Return the stored response object.

:return: the value of ``self.response``, which is None until attack()
stores one.
"""
return self.response
def get_text(self):
def get_text(self) -> str:
"""
Return the text of the request response.

:return: the text of the response
"""
# NOTE(review): self.response is None until attack() stores a response,
# in which case this attribute access raises AttributeError; confirm
# callers only use this after a successful attack.
return self.response.text
def get_status_code(self):
def get_status_code(self) -> Union[int, None]:
"""
Returns the status code of the response, if there is one.
:return: the status code, if the there is one.
"""
try:
return self.response.status_code
except AttributeError: