Merge branch 'master' into #73-fix-reproducibility-issue
Jordi Planas authored May 26, 2021
2 parents f1e2ba7 + 741e218 commit 649d9d6
Showing 218 changed files with 3,718 additions and 5,353 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,5 +1,5 @@
# Custom
-# input/
+input/
Omdena_key_S3.json
Omdena_key.json
tasks/preprocess_text/notebooks/client_secret.json
12 changes: 10 additions & 2 deletions README.md
@@ -14,6 +14,8 @@ In the long term, we are building a tool that can be extended to any use case re
- [Incentive Detection](#incentive-detection)
- [Incentive Instrument Classification](#incentive-instrument-classification)
- [Development](#development)
+- [Getting Started](#getting-started)
+- [Main Components](#main-components)
- [Contribution Guidelines](#contribution-guidelines)
- [Project Organization](#project-organization)
- [Background, Motivation and Impact](#background-motivation-and-impact)
@@ -45,6 +47,14 @@ The modeling side has yielded promising results, and we will be presenting this

## Development

+### Getting Started
+
+**Requirements**
+
+- Python >= 3.6
+- Miniconda or `virtualenv` (or any type of virtual environment tool)
+- pip
+
### Contribution Guidelines

#### Steps to contribute to the master branch
@@ -104,8 +114,6 @@ The modeling side has yielded promising results, and we will be presenting this
```
- _If I'm working with someone on the same issue, can I contribute/push to their branch?_
  - Technically yes, but it would be safer to work on your own branch first (maybe splitting the issue into smaller issues) and then open a PR to theirs once you feel ready to merge code. Alternatively, you could pair program and not worry about overwriting someone else's code :)
-- _Can I push directly to master?_
-  - Please don't :(
## Project Organization
13 changes: 7 additions & 6 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -2,13 +2,14 @@
-e .

# external requirements
-click
+click~=7.1.2
Sphinx
coverage
awscli
flake8
python-dotenv>=0.5.1
jupyterlab==2.2.9
+wandb==0.10.28
pypdf2==1.26.0
pikepdf==1.19.3
pillow==7.0.0
@@ -19,14 +20,14 @@ pyspellchecker==0.5.5
nltk==3.5
pdf2image==1.14.0
boto3==1.16.25
-jamspell==0.0.12
sentence_transformers==0.4.0
scikit-learn==0.23.2
torch==1.7.1
pandas==1.1.3
-spacy==2.3.5
+spacy==3.0.5
scprep==1.0.11
phate==1.0.4
# gensim==3.8.3
# rake-nltk==1.0.4
# wordcloud==1.8.0
tqdm
numpy
unidecode
+# jamspell==0.0.12
51 changes: 32 additions & 19 deletions tasks/Scrapy/scrapy_official_newspapers/spiders/mexico.py
@@ -19,23 +19,29 @@ class MexicoDOF(BaseSpider):
    spider_builder = "Jordi Planas"
    scrapable = "True"
    allowed_domains = ["sidofqa.segob.gob.mx"]
-    start_date = "2021-02-25"
+    start_date = "2010-01-01"
    # This is a category that appears in the database which yields a lot of documents that announce job posts. We exclude them from the search.
    authorship_to_exclude = 'CONVOCATORIAS PARA CONCURSOS DE PLAZAS VACANTES DEL SERVICIO PROFESIONAL DE CARRERA EN LA ADMINISTRACION PUBLICA FEDERAL'
-    folder_to_save = "wri.-testing/dof/"
+    folder_to_save = "spanish_documents/text_files/new/"
+    # folder_to_save = "wri.-testing/dof/"

    def __init__(self):
        self.keyword_dict = self.import_json('./keywords_and_dictionaries/keywords_knowledge_domain_ES.json')
        self.negative_keyword_dict = self.import_json('./keywords_and_dictionaries/negative_keywords_knowledge_domain_ES.json')
        self.from_date, self.today = self.create_date_span(self.start_date)

-        #folder = '/home/propietari/Documents/claus/' # TODO: change to your local path
-        folder = 'C:/Users/jordi/Documents/claus/' # TODO: change to your local path
-        file_name = 'AWS_S3_keys_JordiPlanas_Made_in_game.json' # TODO: Change to your filename
-        self.bucket = "wri-testing" # TODO: Change to the final bucket
-        bucket_region = "eu-central-1" # TODO: Change to fit to the final bucket
-        file = folder + file_name
-        self.s3 = self.open_S3_session(file, self.bucket, bucket_region)
+        # folder = 'C:/Users/jordi/Documents/claus/' # TODO: change to your local path
+        # file_name = 'AWS_S3_keys_JordiPlanas_Made_in_game.json' # TODO: Change to your filename
+        folder = '/home/propietari/Documents/claus/' # TODO: change to your local path
+        file_name = "AWS_S3_keys_wri.json"
+
+        # self.bucket = "wri-testing" # TODO: Change to the final bucket
+        # bucket_region = "eu-central-1" # TODO: Change to fit to the final bucket
+        self.bucket = "wri-nlp-policy"
+        bucket_region = "us-east-1"
+
+        keys_file = folder + file_name
+        self.s3 = self.open_S3_session(keys_file, self.bucket, bucket_region)

    def start_requests(self):
        for day in self.create_date_list(self.from_date, self.today, 1, "days", self.country_code):
@@ -44,7 +50,7 @@ def start_requests(self):
            day = day.strftime('%d-%m-%Y')
            #self.debug(day)
            self.start_url = f'https://sidofqa.segob.gob.mx/dof/sidof/notas/{day}'
-            #print(start_urls)
+            #print(f"\n *************** \n {self.start_url}\n ********************")
            yield scrapy.Request(self.start_url, dont_filter=True, callback=self.parse)

    def parse(self, response):
@@ -78,24 +84,31 @@ def parse(self, response):
                doc_url = f'https://www.dof.gob.mx/nota_detalle.php?codigo={codigo_nota}&fecha={self.day_doc_url}&print=true'
                doc_name = self.HSA1_encoding(doc_url) + ".txt"
                item['doc_name'] = doc_name
-                #self.debug("\n ################# \n")
+                #self.debug(f"\n ################# \n {doc_url} \n ###############")
+                #self.debug(doc_name)
                yield item
-                yield scrapy.Request(doc_url, dont_filter=True, callback=self.parse_other, cb_kwargs=dict(document = doc_name))
+                yield scrapy.Request(doc_url, dont_filter=True, callback=self.parse_other, cb_kwargs=dict(document = doc_name, url = doc_url))
            else:
                pass

-    def parse_other(self, response, document):
-        # self.debug("\n**** in the nota detalle ****\n")
+    def parse_other(self, response, document, url):
        soup = BeautifulSoup(response.css('div#DivDetalleNota').get(), features = "lxml")
        paragraphs = soup.find_all("p")
-        tables = soup.find_all("td")
        text = ""
-        for line in paragraphs[1:]:
-            text = text + line.text + "\n"
-        for cell in tables:
-            text = text + cell.text + "\n"
+        if len(paragraphs) == 0:
+            text = text + soup.text
+        else:
+            tables = soup.find_all("td")
+            for line in paragraphs[1:]:
+                text = text + line.text + "\n"
+            text = text + "<table>" + "\n"
+            for cell in tables:
+                if "En el documento que usted está visualizando" not in cell.text:
+                    text = text + cell.text + "\n"
+            text = text + "<\\table>" + "\n"
        file = self.folder_to_save + document
+        #self.debug(url)
        #self.debug(text)
        #self.debug("\n **************** \n")
        #self.debug(file)
        self.save_to_s3(self.s3, self.bucket, file, text.replace("\t", ""))
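The reworked `parse_other` above falls back to the page's full text when no `<p>` tags are present, and otherwise wraps table-cell text in `<table>` markers while skipping the DOF viewer notice. A minimal self-contained sketch of that extraction pattern, mirroring the diff (the sample HTML is illustrative, not from the repository):

```python
from bs4 import BeautifulSoup

def extract_text(html):
    """Sketch of the extraction logic in parse_other, for illustration only."""
    soup = BeautifulSoup(html, features="lxml")
    paragraphs = soup.find_all("p")
    text = ""
    if len(paragraphs) == 0:
        # No paragraph tags: fall back to the whole document text
        text += soup.text
    else:
        # Skip the first paragraph, then append the remaining paragraph text
        for line in paragraphs[1:]:
            text += line.text + "\n"
        # Append table cells between <table> markers, dropping the viewer notice
        text += "<table>\n"
        for cell in soup.find_all("td"):
            if "En el documento que usted está visualizando" not in cell.text:
                text += cell.text + "\n"
        text += "<\\table>\n"
    return text.replace("\t", "")

print(extract_text("<p>header</p><p>Artículo 1.</p><td>celda</td>"))
```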

This file was deleted.

2 changes: 2 additions & 0 deletions tasks/data_loading/__init__.py
@@ -0,0 +1,2 @@
+from .src.s3_client import *
+from .src.utils import *
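With these re-exports in place, downstream code can import the S3 helpers straight from the package. A small usage sketch, assuming the repository root is on `PYTHONPATH` (the credentials path is a placeholder):

```python
# Hypothetical usage of the re-exported client; the key file path is a placeholder
from tasks.data_loading import S3Client

client = S3Client("/path/to/aws_keys.json", "wri-nlp-policy", language="es")
```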
12 changes: 6 additions & 6 deletions tasks/data_loading/src/s3_client.py
@@ -5,8 +5,8 @@


class S3Client:
-    def __init__(self, creds_filepath, creds_filename, bucket_name, language=None):
-        self.aws_id, self.aws_secret = aws_credentials_from_file(creds_filepath, creds_filename)
+    def __init__(self, creds_filepath, bucket_name, language=None):
+        self.aws_id, self.aws_secret = aws_credentials_from_file(creds_filepath)
        self.s3 = get_s3(self.aws_id, self.aws_secret)
        self.bucket_name = bucket_name
        self.metadata_folder = f"metadata/"
@@ -51,8 +51,9 @@ def move_object(self, obj_name, obj_old_folder, obj_new_folder):
        then deleting it from the old one
        """
        try:
-            self.s3.Object(self.bucket_name, f"{obj_old_folder}/{obj_name}") \
-                .copy_from(CopySource=f"{self.bucket_name}/{obj_new_folder}/{obj_name}")
+            self.s3.meta.client.copy_object(Bucket=self.bucket_name,
+                                            CopySource=f"{self.bucket_name}/{obj_old_folder}/{obj_name}",
+                                            Key=f"{obj_new_folder}/{obj_name}")
            _ = self.s3.Object(self.bucket_name, f"{obj_old_folder}/{obj_name}").delete()
        except Exception as e:
            print(f"Error while moving {obj_name} from {obj_old_folder} to {obj_new_folder}.")
@@ -70,14 +71,13 @@ def load_text_files(self, language):
            text = obj.get()['Body'].read().decode('utf-8')
            yield file_id, text

-    def store_sentences(self, sents, file_name, file_uuid, language):
+    def store_sentences(self, sents, file_uuid, language):
        """
        Store a JSON file containing the metadata and sentences for a given text file in the S3 bucket
        """
        self._update_folder_names(language)
        sents_json = {file_uuid: {"metadata":
                                      {"n_sentences": len(sents),
-                                       "file_name": file_name,
                                       "language": language},
                                  "sentences": sents}}

8 changes: 2 additions & 6 deletions tasks/data_loading/src/s3_client_utils.py
@@ -2,12 +2,12 @@
import json


-def aws_credentials_from_file(path, filename):
+def aws_credentials_from_file(path):
    """
    Get the AWS S3 Id and Secret credentials, from a json file in the format:
    {"AWS_ID": "AWS_SECRET"}
    """
-    with open(f"{path}/{filename}", 'r') as f:
+    with open(path, 'r') as f:
        key_dict = json.load(f)
    for key in key_dict:
        aws_id = key
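The helper now takes a single full path instead of a directory/filename pair. A small sketch of preparing and reading a credentials file in the documented `{"AWS_ID": "AWS_SECRET"}` format, assuming the function above is in scope (path and values are placeholders):

```python
import json

# Write a placeholder credentials file in the expected single-entry format
with open("/tmp/aws_keys.json", "w") as f:
    json.dump({"AKIA-EXAMPLE-ID": "example-secret"}, f)

aws_id, aws_secret = aws_credentials_from_file("/tmp/aws_keys.json")
```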
@@ -35,7 +35,3 @@ def labeled_sentences_from_json(sents_json):
"""
return {sent_id: sent_labels_map for sent_id, sent_labels_map in [*sents_json.values()][0]["sentences"].items()}


## TODO: Move this function somewhere else?
def format_sents_for_output(sents, doc_id):
return {f"{doc_id}_sent_{i}": {"text": sent, "label": []} for i, sent in enumerate(sents)}