-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #10 from cfpb/script-improvements
Shell usability improvements, plus logging
- Loading branch information
Showing 6 changed files with 109 additions and 33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
/CHANGELOG.md merge=union | ||
*.log linguist-generated=true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -75,3 +75,8 @@ bower_components/ | ||
.grunt/ | ||
src/vendor/ | ||
dist/ | ||
|
||
# Project specific # | ||
#################### | ||
commit.txt | ||
!*.log.gz |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,73 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Recursively crawl a website and save its HTML locally. | ||
# | ||
# Example usage: | ||
# | ||
# ./crawl.sh [-d depth] https://www.consumerfinance.gov | ||
# | ||
# Optionally specify -d depth to limit the crawl depth. | ||
|
||
# If a command fails, stop executing this script and return its error code. | ||
set -e | ||
|
||
depth=0 | ||
|
||
while getopts ":d:" opt; do | ||
case $opt in | ||
d ) | ||
depth="$OPTARG"; | ||
number_regex='^[0-9]+$' | ||
if ! [[ $depth =~ $number_regex ]] ; then | ||
echo "Crawl depth must be a number." 1>&2 | ||
exit 1 | ||
fi | ||
;; | ||
\? ) | ||
echo "Invalid option: -$OPTARG." 1>&2 | ||
exit 1 | ||
;; | ||
: ) | ||
echo "Invalid option: -$OPTARG requires an argument." 1>&2 | ||
exit 1 | ||
;; | ||
esac | ||
done | ||
|
||
shift $((OPTIND -1)) | ||
|
||
url=$1 | ||
|
||
if [ -z "$url" ]; then | ||
echo "Must specify URL to crawl." | ||
exit 1 | ||
fi | ||
|
||
echo "Starting crawl at $url." | ||
|
||
if [ $depth -ne 0 ]; then | ||
echo "Limiting crawl to depth $depth." | ||
fi | ||
|
||
domain=$url | ||
domain="${domain#http://}" | ||
domain="${domain#https://}" | ||
domain="${domain%%:*}" | ||
domain="${domain%%\?*}" | ||
domain="${domain%%/*}" | ||
echo "Limiting crawl to domain $domain." | ||
|
||
time wget \ | ||
--domains=www.consumerfinance.gov \ | ||
--exclude-domains=files.consumerfinance.gov \ | ||
--domains="$domain" \ | ||
--execute robots=off \ | ||
--follow-tags=a \ | ||
--limit-rate=200k \ | ||
--random-wait \ | ||
--limit-rate=1m \ | ||
--accept html \ | ||
--reject-regex "topics=|authors=|categories=|filter_blog_category=|ext_url=|search_field=|issuer_name=" \ | ||
--recursive \ | ||
--level=4 \ | ||
--level="$depth" \ | ||
--trust-server-names \ | ||
--no-verbose \ | ||
--verbose \ | ||
--no-clobber \ | ||
https://www.consumerfinance.gov/ | ||
--rejected-log=rejected.log \ | ||
"$url" 2>&1 | tee wget.log |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,13 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Generate a summary of the crawl results into message.txt | ||
git add -A www.consumerfinance.gov | ||
git diff --staged --compact-summary --no-color > message.txt | ||
git reset . | ||
# If a command fails, stop executing this script and return its error code. | ||
set -e | ||
|
||
# Follow these instructions to set the value of the COMMIT_MESSAGE | ||
# environment variable and save it to the GitHub Actions environment: | ||
# https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-commands-for-github-actions#setting-an-environment-variable | ||
echo 'COMMIT_MESSAGE<<EOF' >> $GITHUB_ENV | ||
echo $(cat message.txt) >> $GITHUB_ENV | ||
echo 'EOF' >> $GITHUB_ENV | ||
# Generate a summary of the crawl results into commit.txt | ||
date > commit.txt | ||
cat >> commit.txt <<EOL | ||
lines |lines | | ||
added |deleted|filename | ||
-------|-------|-------- | ||
EOL | ||
git diff --numstat --no-color -- '*.html' >> commit.txt |