Added info to download punkt from NLTK #1

Open · wants to merge 7 commits into `master`
README.md: 9 additions & 1 deletion
@@ -29,8 +29,16 @@ Sono stati costruiti anche alcuni harmonium con due manuali.

At the time of writing there were 1,000,400 documents in the ptwiki-dump. =]

Before going to step 3, make sure you have downloaded the `punkt` tokenizer models from NLTK. To do so, open a Python shell and type:

```python
import nltk
nltk.download()
```

An installation window will appear. Go to the 'Models' tab and select 'punkt' under the 'Identifier' column. Click 'Download' to install the necessary files.
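If you prefer to skip the GUI, the models can also be fetched directly; a minimal sketch (this is the same call that appears commented out in `scripts/preprocess.py`):

```python
import nltk

# Download the Punkt sentence-tokenizer models without the GUI.
nltk.download('punkt')
```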

3. Now that we have the Wikipedia texts, we can start pre-processing the files.

```sh
python scripts/preprocess.py ./data/ptwiki-articles-text/ -o ./data/ptwiki-articles-text-cleaned
```
scripts/preprocess.py: 14 additions & 11 deletions
@@ -38,7 +38,7 @@


# nltk.download('punkt')
sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
sent_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')

# ##### #
# Regex #
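For context, the Punkt model loaded above splits raw text into sentences. A minimal usage sketch (assuming the `punkt` models are installed):

```python
import nltk

# Load the language-specific Punkt model and split text into sentences.
sent_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
print(sent_tokenizer.tokenize('Das ist der erste Satz. Und hier der zweite.'))
# ['Das ist der erste Satz.', 'Und hier der zweite.']
```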
@@ -76,7 +76,7 @@ def clean_single_sentence(text):
    text = re_remove_brackets.sub('', text)
    text = re_changehyphen.sub('-', text)
    text = re_remove_html.sub(' ', text)
    text = re_transform_numbers.sub('0', text)
    # text = re_transform_numbers.sub('0', text)
    text = re_transform_url.sub('URL', text)
    text = re_transform_emails.sub('EMAIL', text)
    text = re_quotes_1.sub(r'\1"', text)
@@ -102,14 +102,16 @@ def clean_document(document):
    - Remove any sentences with fewer than 4 words.
    '''
    for line in document.split('\n'):
        for sent in sent_tokenizer.tokenize(line):
            sent = clean_single_sentence(sent)
            if sent.count(' ') >= 3 and sent[-1] in ['.', '!', '?', ';']:
                if sent[0:2] == '- ':
                    sent = sent[2:]
                elif sent[0] == ' ' or sent[0] == '-':
                    sent = sent[1:]
                yield sent
        sent = line
        sent = clean_single_sentence(sent)
        if sent.count(' ') >= 3 and sent[-1] in ['.', '!', '?', ';']:
            if sent[0:2] == '- ':
                sent = sent[2:]
            elif sent[0] == ' ' or sent[0] == '-':
                sent = sent[1:]
            yield sent
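As a quick illustration of the new per-line behaviour, consider a hypothetical two-line document (the exact output also depends on the substitutions in `clean_single_sentence`):

```python
doc = '- Dies ist ein kurzer Beispielsatz.\nzu kurz.'
print(list(clean_document(doc)))
# Expected: ['Dies ist ein kurzer Beispielsatz.']
# The first line has at least 4 words, ends with '.', and its leading '- '
# is stripped; the second line has too few words and is dropped.
```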



def read_wiki_documents_compressed(dirname):
    '''
@@ -138,6 +140,7 @@ def main():
parser.add_argument("input", help="ptwiki-compressed-text-folder")
parser.add_argument("-o", "--output", default="./data/cleaned/",
help="directory for extracted files")

args = parser.parse_args()
input_dirname = args.input
output_dirname = args.output
@@ -169,4 +172,4 @@ def main():
    print('Vocabulary: ', len(vocab))

if __name__ == '__main__':
    main()
    main()