From c77fa9788d28ade6408948641b666904b9af4edb Mon Sep 17 00:00:00 2001
From: Andy Chosak <andy.chosak@cfpb.gov>
Date: Mon, 2 Nov 2020 15:45:52 -0500
Subject: [PATCH] Reject non-HTML instead of accepting only HTML

Trying to accept only files that end in .html causes problems when:

1. Links on a page don't end in a trailing slash (e.g. /foo/bar), and
wget interprets the link as being of type "bar", and thus rejects it.
2. Long URLs get truncated when saved as files and thus don't end in
.html. These get deleted by wget.

This change restores the old behavior of using an explicit reject list
instead of only accepting HTML. This is a little suboptimal; it would be
nice not to have to list out a potentially ever-growing list of file
extensions, but I'm not sure of a better way to accomplish what we want.
---
 crawl.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawl.sh b/crawl.sh
index 2a36be3e77fc..6bfb76b93cc2 100755
--- a/crawl.sh
+++ b/crawl.sh
@@ -62,7 +62,7 @@ time wget \
     --execute robots=off \
     --follow-tags=a \
     --limit-rate=1m \
-    --accept html \
+    --reject '*.css,*.csv,*.CSV,*.doc,*.docx,*.epub,*.gif,*.ico,*.jpg,*.js,*.mp3,*.pdf,*.PDF,*.png,*.pptx,*.tmp,*.txt,*.wav,*.woff,*.woff2,*.xls,*xlsx,*.xml,*.zip' \
     --reject-regex "topics=|authors=|categories=|filter_blog_category=|ext_url=|search_field=|issuer_name=" \
     --recursive \
     --level="$depth" \