#!/usr/bin/env bash

# Recursively crawl a website and save its HTML locally.
#
# Example usage:
#
# ./crawl.sh [-d depth] https://www.consumerfinance.gov
#
# Optionally specify -d depth to limit the crawl depth.

# If a command fails, stop executing this script and return its error code.
set -e
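
# Default crawl depth. A value of 0 is passed straight through to wget's
# --level option and is treated by this script as "no depth limit".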
depth=0
while getopts ":d:" opt; do
  case $opt in
    d )
      depth="$OPTARG"
      number_regex='^[0-9]+$'
      if ! [[ $depth =~ $number_regex ]] ; then
        echo "Crawl depth must be a number." 1>&2
        exit 1
      fi
      ;;
    \? )
      echo "Invalid option: -$OPTARG." 1>&2
      exit 1
      ;;
    : )
      echo "Invalid option: -$OPTARG requires an argument." 1>&2
      exit 1
      ;;
  esac
done
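
# Discard the parsed options so that the crawl URL is left as the first
# positional argument.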
shift $((OPTIND -1))
url=$1
if [ -z "$url" ]; then
  echo "Must specify URL to crawl." 1>&2
  exit 1
fi

echo "Starting crawl at $url."
domain=$url
domain="${domain#http://}"
domain="${domain#https://}"
domain="${domain%%:*}"
domain="${domain%%\?*}"
domain="${domain%%/*}"
echo "Limiting crawl to domain $domain."
if [ "$depth" -ne 0 ]; then
  echo "Limiting crawl to depth $depth."
fi

# Crawl into a temporary directory to avoid potential unexpected overwriting
# due to use of --trust-server-names.
# See https://nvd.nist.gov/vuln/detail/CVE-2010-2252.
tmp_dir=$(mktemp -d -t wget-$(date +%Y-%m-%d-%H-%M-%S)-XXXXXXXX)
echo "Saving HTML to $tmp_dir."
pushd "$tmp_dir" > /dev/null
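
# Crawl with wget: stay on the target domain, ignore robots.txt, follow only
# <a> links, throttle to 1 MB/s, skip common asset/document extensions and
# filter/search query strings, and log both output and rejected URLs.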
time wget \
  --domains="$domain" \
  --execute robots=off \
  --follow-tags=a \
  --limit-rate=1m \
  --reject '*.css,*.csv,*.CSV,*.do,*.doc,*.docx,*.epub,*.gif,*.ico,*.jpg,*.js,*.mp3,*.pdf,*.PDF,*.png,*.pptx,*.py,*.R,*.sas,*.sps,*.tmp,*.txt,*.wav,*.woff,*.woff2,*.xls,*.xlsx,*.xml,*.zip' \
  --reject-regex "topics=|authors=|categories=|filter_blog_category=|ext_url=|search_field=|issuer_name=|filter1_topics=|q=|topic=|grade_level=|regs=|title=|from_date=|to_date=|utm_source=|iped=" \
  --recursive \
  --level="$depth" \
  --trust-server-names \
  --verbose \
  --no-clobber \
  --rejected-log=rejected.log \
  "$url" 2>&1 | tee wget.log
popd > /dev/null
# Copy back logs and HTML from temporary directory.
cp -a "$tmp_dir"/{wget,rejected}.log .
cp -a "$tmp_dir/$domain/" "./$domain/"
# Clean up temporary directory.
rm -rf "$tmp_dir"