-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleanoutput.py
35 lines (28 loc) · 1.29 KB
/
cleanoutput.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import re
# Compiled once at import time so that per-line validation does not have to
# rebuild (or re-look-up) the pattern on every call.
_DOMAIN_URL_RE = re.compile(
    r'^(?:http|ftp)s?://'  # scheme: http://, https://, ftp:// or ftps://
    r'(?:(?![0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)'  # reject hosts that begin with a dotted-quad IP
    r'(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?))'  # dotted labels + final label
    r'(?:/?|[/?]\S+)'  # nothing, a lone "/", or a whitespace-free path/query
    r'\Z',  # true end of string; "$" would also accept a trailing newline
    re.IGNORECASE,
)


def is_valid_domain(url: str) -> bool:
    """Return True if *url* is an http(s)/ftp(s) URL whose host is a domain name.

    The host must contain at least one dot and must not start with a
    dotted-quad IPv4 address. After the host only end-of-string, a single
    ``/``, or a ``/``- or ``?``-introduced run of non-whitespace characters
    is accepted, so URLs with an explicit port (``example.com:8080``) or a
    dot-less host (``localhost``) are rejected. Matching is case-insensitive.

    The whole string must match: leading or trailing whitespace, including a
    trailing newline, makes the URL invalid.

    Args:
        url: The URL string to validate.

    Returns:
        True when the URL matches the pattern, False otherwise.
    """
    return _DOMAIN_URL_RE.match(url) is not None
def filter_links(input_file, output_file):
    """Copy redirect records whose target is a valid domain URL.

    Each input line is expected to have the form
    ``<original> -> <redirected>``. Lines that do not split into exactly two
    parts on ``" -> "`` are skipped. A line is written to *output_file*
    unchanged (both URLs and the original line ending) when its redirected
    URL, stripped of surrounding whitespace, passes ``is_valid_domain``.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to write; it is created, or
            truncated if it already exists.
    """
    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        # Iterate the file object directly so lines are streamed instead of
        # loading the whole input into memory first.
        for line in infile:
            parts = line.split(" -> ")
            if len(parts) != 2:
                continue
            # Only the redirect target is validated; the source URL is ignored.
            if is_valid_domain(parts[1].strip()):
                outfile.write(line)
# Script entry: read redirect records from "output.txt" and write the ones
# accepted by filter_links to "filtered_output.txt".
# NOTE: these statements are at module level, so they also run on import.
input_file, output_file = "output.txt", "filtered_output.txt"
filter_links(input_file, output_file)