-
Notifications
You must be signed in to change notification settings - Fork 172
/
Copy pathgen.py
195 lines (145 loc) · 6 KB
/
gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/usr/bin/python
#
# written by @eric_capuano
# https://github.com/ecapuano/web-traffic-generator
#
# published under MIT license :) do what you want.
#
# 20170714 shyft ADDED python 2.7 and 3.x compatibility and generic config
# 20200225 rarawls ADDED recursive, depth-first browsing, color stdout
from __future__ import print_function
import requests
import re
import time
import random
try:
    import config
except ImportError:
    # No config.py could be imported: fall back to a minimal built-in
    # configuration that exposes the same attribute names the script reads.
    class ConfigClass:
        # Bounds for how many links deep each root URL is followed.
        MIN_DEPTH = 3  # dive at least this deep into each root URL
        MAX_DEPTH = 10  # dive no deeper than this for each root URL

        # Bounds for the pause between HTTP requests.
        MIN_WAIT = 5  # minimum amount of time allowed between HTTP requests
        MAX_WAIT = 10  # maximum amount of time to wait between HTTP requests

        # Set to True to enable useful console output.
        DEBUG = False

        # To test how a single site responds to this crawler, enable the
        # one-item list below and comment out the ROOT_URLS list after it.
        #ROOT_URLS = ["https://digg.com/"]
        ROOT_URLS = ["https://www.reddit.com"]

        # Links containing any of these substrings are skipped. Items can be
        # a URL ("https://t.co") or a simple string to check for ("amazon").
        blacklist = ['facebook.com', 'pinterest.com']

        # Must be a valid user agent or sites will hate you.
        USER_AGENT = (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
        )

    config = ConfigClass
class Colors:
    """ Terminal escape sequences used to colorize debug output """
    # Appended after every debug message; also the default message color.
    NONE = '\033[0m'
    # Used for non-200 response statuses.
    RED = '\033[91m'
    # Used when a browse stops and the URL is blacklisted.
    YELLOW = '\033[93m'
    # Used when a new root URL is selected.
    PURPLE = '\033[95m'
def debug_print(message, color=Colors.NONE):
    """ Print message in the given color, but only when config.DEBUG is set """
    if not config.DEBUG:
        return
    # Always finish with Colors.NONE so the color does not carry over
    # into whatever is printed next.
    print(color + message + Colors.NONE)
def hr_bytes(bytes_, suffix='B', si=False):
    """ Format a byte count as a short string such as "1.5MB"

    The value is divided by the step size until its magnitude drops below
    one step, then rendered with one decimal place, the matching unit
    prefix, and ``suffix``.

    Note: the step is 1024 when ``si`` is true and 1000 otherwise (the
    default), exactly as the flag is wired here.
    """
    step = 1000.0
    if si:
        step = 1024.0

    for prefix in ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'):
        if abs(bytes_) >= step:
            # Still too large for this prefix: scale down and try the next.
            bytes_ /= step
            continue
        return "{:.1f}{}{}".format(bytes_, prefix, suffix)

    # Ran out of prefixes; whatever is left is reported with 'Y'.
    return "{:.1f}{}{}".format(bytes_, 'Y', suffix)
def do_request(url):
    """ Load a page and update the traffic counters

    Sends a GET request for ``url`` using config.USER_AGENT and a 5 second
    timeout. The size of the response body is added to the global
    ``data_meter``; ``good_requests`` is incremented for a 200 status and
    ``bad_requests`` for any other status. These globals are expected to
    exist already (the ``__main__`` section initializes them). On a 429
    status, config.MIN_WAIT and config.MAX_WAIT are both raised by 10.

    Returns the response object, or False if the request raised an error
    (after pausing for 30 seconds).
    """
    global data_meter
    global good_requests
    global bad_requests

    debug_print(" Requesting page...")

    headers = {'user-agent': config.USER_AGENT}

    try:
        r = requests.get(url, headers=headers, timeout=5)
    except Exception as err:
        # Catch Exception rather than using a bare except, so that Ctrl+C
        # (KeyboardInterrupt) and SystemExit still stop the script instead
        # of being swallowed and turned into a 30 second sleep.
        debug_print(
            " Request failed: {}".format(type(err).__name__), Colors.RED)
        # Prevent 100% CPU loop in a net down situation
        time.sleep(30)
        return False

    page_size = len(r.content)
    data_meter += page_size

    debug_print(" Page size: {}".format(hr_bytes(page_size)))
    debug_print(" Data meter: {}".format(hr_bytes(data_meter)))

    status = r.status_code

    if status != 200:
        bad_requests += 1
        debug_print(" Response status: {}".format(status), Colors.RED)
        if status == 429:
            # The server is rate limiting us: widen the pause window used
            # for every later request.
            debug_print(
                " We're making requests too frequently... sleeping longer...")
            config.MIN_WAIT += 10
            config.MAX_WAIT += 10
    else:
        good_requests += 1

    debug_print(" Good requests: {}".format(good_requests))
    debug_print(" Bad requests: {}".format(bad_requests))

    return r
def get_links(page):
    """ Return all absolute http(s) links from page, less blacklisted links

    ``page`` is a response object whose ``content`` attribute holds the raw
    body. Only double-quoted ``href="http://..."`` / ``href="https://..."``
    values are extracted. A link is dropped when any entry of
    config.blacklist occurs in it as a substring.
    """
    raw = page.content or ''
    if isinstance(raw, str):
        # Python 2: the body is already a native str; use it as-is.
        html = raw
    else:
        # Python 3: the body is bytes. Decode it instead of calling str()
        # on it, which would scan the bytes *repr* and return links mangled
        # by escape sequences (\', \\, \xNN). Undecodable bytes are replaced
        # rather than raising.
        html = raw.decode('utf-8', 'replace')

    links = re.findall(r'href="(https?://[^"]+)"', html)
    return [link for link in links
            if not any(b in link for b in config.blacklist)]
def recursive_browse(url, depth):
    """ Recursively browse URLs, following one random link per level

    Base case (``depth`` is zero or below): load ``url`` and return.
    Recursive case: load ``url``, scrape its non-blacklisted links, pause
    for a random whole number of seconds between config.MIN_WAIT and
    config.MAX_WAIT (inclusive), then browse one randomly chosen link with
    ``depth - 1``.

    If the page fails to load or yields no usable links, ``url`` is added
    to config.blacklist and browsing stops. Returns None in every case.
    """
    debug_print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    debug_print(
        "Recursively browsing [{}] ~~~ [depth = {}]".format(url, depth))

    # Base case: nothing left to dive into, just load the page.
    # A negative depth is treated the same way so it can never count down
    # forever.
    if not depth or depth < 0:
        do_request(url)
        return

    page = do_request(url)  # load current page

    # Give up if there was an error loading the page. do_request returns
    # False when the request raised.
    # NOTE(review): a requests response with an error status is presumably
    # falsy as well -- confirm against the requests documentation.
    if not page:
        debug_print(
            " Stopping and blacklisting: page error", Colors.YELLOW)
        config.blacklist.append(url)
        return

    # Scrape page for links not in blacklist
    debug_print(" Scraping page for links")
    valid_links = get_links(page)
    debug_print(" Found {} valid links".format(len(valid_links)))

    # Give up if there are no links to browse
    if not valid_links:
        debug_print(" Stopping and blacklisting: no links", Colors.YELLOW)
        config.blacklist.append(url)
        return

    # Sleep and then recursively browse. randint is inclusive on both ends,
    # so MIN_WAIT == MAX_WAIT is valid (randrange raised ValueError on an
    # empty range) and MAX_WAIT itself can be chosen. Sorting guards
    # against the bounds being configured in the wrong order.
    low, high = sorted((config.MIN_WAIT, config.MAX_WAIT))
    sleep_time = random.randint(low, high)
    debug_print(" Pausing for {} seconds...".format(sleep_time))
    time.sleep(sleep_time)

    recursive_browse(random.choice(valid_links), depth - 1)
if __name__ == "__main__":
    # Script entry point: print a banner, then browse forever. Each pass
    # picks a random root URL and a random depth and hands both to
    # recursive_browse.

    # Initialize global variables (updated by do_request)
    data_meter = 0
    good_requests = 0
    bad_requests = 0

    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("Traffic generator started")
    print("https://github.com/ecapuano/web-traffic-generator")
    # Report the configured minimum depth rather than a hard-coded 3.
    print("Diving between {} and {} links deep into {} root URLs,".format(
        config.MIN_DEPTH, config.MAX_DEPTH, len(config.ROOT_URLS)))
    print("Waiting between {} and {} seconds between requests. ".format(
        config.MIN_WAIT, config.MAX_WAIT))
    print("This script will run indefinitely. Ctrl+C to stop.")

    while True:
        debug_print("Randomly selecting one of {} Root URLs".format(
            len(config.ROOT_URLS)), Colors.PURPLE)

        random_url = random.choice(config.ROOT_URLS)
        # randint is inclusive on both ends: MAX_DEPTH can be reached, and
        # MIN_DEPTH == MAX_DEPTH no longer raises IndexError from choosing
        # out of an empty range.
        depth = random.randint(config.MIN_DEPTH, config.MAX_DEPTH)

        recursive_browse(random_url, depth)