Merge branch 'master' into #73-fix-reproducibility-issue
Jordi Planas authored May 26, 2021
2 parents f1e2ba7 + 741e218 commit 649d9d6
Showing 218 changed files with 3,718 additions and 5,353 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,5 +1,5 @@
# Custom
-# input/
+input/
Omdena_key_S3.json
Omdena_key.json
tasks/preprocess_text/notebooks/client_secret.json
12 changes: 10 additions & 2 deletions README.md
@@ -14,6 +14,8 @@ In the long term, we are building a tool that can be extended to any use case re
- [Incentive Detection](#incentive-detection)
- [Incentive Instrument Classification](#incentive-instrument-classification)
- [Development](#development)
+- [Getting Started](#getting-started)
+- [Main Components](#main-components)
- [Contribution Guidelines](#contribution-guidelines)
- [Project Organization](#project-organization)
- [Background, Motivation and Impact](#background-motivation-and-impact)
@@ -45,6 +47,14 @@ The modeling side has yielded promising results, and we will be presenting this

## Development

+### Getting Started
+
+**Requirements**
+
+- Python >= 3.6
+- Miniconda or `virtualenv` (or any type of virtual environment tool)
+- pip
+
### Contribution Guidelines

#### Steps to contribute to the master branch
@@ -104,8 +114,6 @@ The modeling side has yielded promising results, and we will be presenting this
```
- _If I'm working with someone on the same issue, can I contribute/push to their branch?_
  - Technically yes, but it would be safer to work on your own branch first (maybe splitting the issue into smaller issues) and then open a PR to theirs once you feel ready to merge code. Alternatively, you could pair program and not worry about overwriting someone else's code :)
-- _Can I push directly to master?_
-  - Please don't :(
## Project Organization
13 changes: 7 additions & 6 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -2,13 +2,14 @@
-e .

# external requirements
-click
+click~=7.1.2
Sphinx
coverage
awscli
flake8
python-dotenv>=0.5.1
jupyterlab==2.2.9
+wandb==0.10.28
pypdf2==1.26.0
pikepdf==1.19.3
pillow==7.0.0
@@ -19,14 +20,14 @@ pyspellchecker==0.5.5
nltk==3.5
pdf2image==1.14.0
boto3==1.16.25
-jamspell==0.0.12
sentence_transformers==0.4.0
scikit-learn==0.23.2
torch==1.7.1
pandas==1.1.3
-spacy==2.3.5
+spacy==3.0.5
scprep==1.0.11
phate==1.0.4
# gensim==3.8.3
# rake-nltk==1.0.4
# wordcloud==1.8.0
tqdm
numpy
unidecode
+# jamspell==0.0.12
51 changes: 32 additions & 19 deletions tasks/Scrapy/scrapy_official_newspapers/spiders/mexico.py
@@ -19,23 +19,29 @@ class MexicoDOF(BaseSpider):
    spider_builder = "Jordi Planas"
    scrapable = "True"
    allowed_domains = ["sidofqa.segob.gob.mx"]
-    start_date = "2021-02-25"
+    start_date = "2010-01-01"
    # This is a category that appears in the database which yields a lot of documents that announce job posts. We exclude them from the search.
    authorship_to_exclude = 'CONVOCATORIAS PARA CONCURSOS DE PLAZAS VACANTES DEL SERVICIO PROFESIONAL DE CARRERA EN LA ADMINISTRACION PUBLICA FEDERAL'
-    folder_to_save = "wri.-testing/dof/"
+    folder_to_save = "spanish_documents/text_files/new/"
+    # folder_to_save = "wri.-testing/dof/"

    def __init__(self):
        self.keyword_dict = self.import_json('./keywords_and_dictionaries/keywords_knowledge_domain_ES.json')
        self.negative_keyword_dict = self.import_json('./keywords_and_dictionaries/negative_keywords_knowledge_domain_ES.json')
        self.from_date, self.today = self.create_date_span(self.start_date)

-        #folder = '/home/propietari/Documents/claus/' # TODO: change to your local path
-        folder = 'C:/Users/jordi/Documents/claus/' # TODO: change to your local path
-        file_name = 'AWS_S3_keys_JordiPlanas_Made_in_game.json' # TODO: Change to your filename
-        self.bucket = "wri-testing" # TODO: Change to the final bucket
-        bucket_region = "eu-central-1" # TODO: Change to fit to the final bucket
-        file = folder + file_name
-        self.s3 = self.open_S3_session(file, self.bucket, bucket_region)
+        # folder = 'C:/Users/jordi/Documents/claus/' # TODO: change to your local path
+        # file_name = 'AWS_S3_keys_JordiPlanas_Made_in_game.json' # TODO: Change to your filename
+        folder = '/home/propietari/Documents/claus/' # TODO: change to your local path
+        file_name = "AWS_S3_keys_wri.json"
+
+        # self.bucket = "wri-testing" # TODO: Change to the final bucket
+        # bucket_region = "eu-central-1" # TODO: Change to fit to the final bucket
+        self.bucket = "wri-nlp-policy"
+        bucket_region = "us-east-1"
+
+        keys_file = folder + file_name
+        self.s3 = self.open_S3_session(keys_file, self.bucket, bucket_region)

    def start_requests(self):
        for day in self.create_date_list(self.from_date, self.today, 1, "days", self.country_code):
@@ -44,7 +50,7 @@ def start_requests(self):
            day = day.strftime('%d-%m-%Y')
            #self.debug(day)
            self.start_url = f'https://sidofqa.segob.gob.mx/dof/sidof/notas/{day}'
-            #print(start_urls)
+            #print(f"\n *************** \n {self.start_url}\n ********************")
            yield scrapy.Request(self.start_url, dont_filter=True, callback=self.parse)

    def parse(self, response):
@@ -78,24 +84,31 @@ def parse(self, response):
                doc_url = f'https://www.dof.gob.mx/nota_detalle.php?codigo={codigo_nota}&fecha={self.day_doc_url}&print=true'
                doc_name = self.HSA1_encoding(doc_url) + ".txt"
                item['doc_name'] = doc_name
-                #self.debug("\n ################# \n")
+                #self.debug(f"\n ################# \n {doc_url} \n ###############")
+                #self.debug(doc_name)
                yield item
-                yield scrapy.Request(doc_url, dont_filter=True, callback=self.parse_other, cb_kwargs=dict(document = doc_name))
+                yield scrapy.Request(doc_url, dont_filter=True, callback=self.parse_other, cb_kwargs=dict(document = doc_name, url = doc_url))
            else:
                pass

-    def parse_other(self, response, document):
-        # self.debug("\n**** in the nota detalle ****\n")
+    def parse_other(self, response, document, url):
        soup = BeautifulSoup(response.css('div#DivDetalleNota').get(), features = "lxml")
        paragraphs = soup.find_all("p")
-        tables = soup.find_all("td")
        text = ""
-        for line in paragraphs[1:]:
-            text = text + line.text + "\n"
-        for cell in tables:
-            text = text + cell.text + "\n"
+        if len(paragraphs) == 0:
+            text = text + soup.text
+        else:
+            tables = soup.find_all("td")
+            for line in paragraphs[1:]:
+                text = text + line.text + "\n"
+            text = text + "<table>" + "\n"
+            for cell in tables:
+                if "En el documento que usted está visualizando" not in cell.text:
+                    text = text + cell.text + "\n"
+            text = text + "<\\table>" + "\n"
        file = self.folder_to_save + document
+        #self.debug(url)
        #self.debug(text)
        #self.debug("\n **************** \n")
        #self.debug(file)
        self.save_to_s3(self.s3, self.bucket, file, text.replace("\t", ""))
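The reworked `parse_other` above falls back to the page's full text when no `<p>` tags are present, and otherwise wraps table-cell text in `<table>` markers while skipping the DOF viewer notice. A minimal self-contained sketch of that extraction pattern, mirroring the diff (the sample HTML is illustrative, not from the repository):

```python
from bs4 import BeautifulSoup

def extract_text(html):
    """Sketch of the extraction logic in parse_other, for illustration only."""
    soup = BeautifulSoup(html, features="lxml")
    paragraphs = soup.find_all("p")
    text = ""
    if len(paragraphs) == 0:
        # No paragraph tags: fall back to the whole document text
        text += soup.text
    else:
        # Skip the first paragraph, then append the remaining paragraph text
        for line in paragraphs[1:]:
            text += line.text + "\n"
        # Append table cells between <table> markers, dropping the viewer notice
        text += "<table>\n"
        for cell in soup.find_all("td"):
            if "En el documento que usted está visualizando" not in cell.text:
                text += cell.text + "\n"
        text += "<\\table>\n"
    return text.replace("\t", "")

print(extract_text("<p>header</p><p>Artículo 1.</p><td>celda</td>"))
```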

This file was deleted.

2 changes: 2 additions & 0 deletions tasks/data_loading/__init__.py
@@ -0,0 +1,2 @@
+from .src.s3_client import *
+from .src.utils import *
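With these re-exports in place, downstream code can import the S3 helpers straight from the package. A small usage sketch, assuming the repository root is on `PYTHONPATH` (the credentials path is a placeholder):

```python
# Hypothetical usage of the re-exported client; the key file path is a placeholder
from tasks.data_loading import S3Client

client = S3Client("/path/to/aws_keys.json", "wri-nlp-policy", language="es")
```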
12 changes: 6 additions & 6 deletions tasks/data_loading/src/s3_client.py
@@ -5,8 +5,8 @@


class S3Client:
-    def __init__(self, creds_filepath, creds_filename, bucket_name, language=None):
-        self.aws_id, self.aws_secret = aws_credentials_from_file(creds_filepath, creds_filename)
+    def __init__(self, creds_filepath, bucket_name, language=None):
+        self.aws_id, self.aws_secret = aws_credentials_from_file(creds_filepath)
        self.s3 = get_s3(self.aws_id, self.aws_secret)
        self.bucket_name = bucket_name
        self.metadata_folder = f"metadata/"
@@ -51,8 +51,9 @@ def move_object(self, obj_name, obj_old_folder, obj_new_folder):
        then deleting it from the old one
        """
        try:
-            self.s3.Object(self.bucket_name, f"{obj_old_folder}/{obj_name}") \
-                .copy_from(CopySource=f"{self.bucket_name}/{obj_new_folder}/{obj_name}")
+            self.s3.meta.client.copy_object(Bucket=self.bucket_name,
+                                            CopySource=f"{self.bucket_name}/{obj_old_folder}/{obj_name}",
+                                            Key=f"{obj_new_folder}/{obj_name}")
            _ = self.s3.Object(self.bucket_name, f"{obj_old_folder}/{obj_name}").delete()
        except Exception as e:
            print(f"Error while moving {obj_name} from {obj_old_folder} to {obj_new_folder}.")
@@ -70,14 +71,13 @@ def load_text_files(self, language):
            text = obj.get()['Body'].read().decode('utf-8')
            yield file_id, text

-    def store_sentences(self, sents, file_name, file_uuid, language):
+    def store_sentences(self, sents, file_uuid, language):
        """
        Store a JSON file containing the metadata and sentences for a given text file in the S3 bucket
        """
        self._update_folder_names(language)
        sents_json = {file_uuid: {"metadata":
                                      {"n_sentences": len(sents),
-                                       "file_name": file_name,
                                       "language": language},
                                  "sentences": sents}}

8 changes: 2 additions & 6 deletions tasks/data_loading/src/s3_client_utils.py
@@ -2,12 +2,12 @@
import json


-def aws_credentials_from_file(path, filename):
+def aws_credentials_from_file(path):
    """
    Get the AWS S3 Id and Secret credentials, from a json file in the format:
    {"AWS_ID": "AWS_SECRET"}
    """
-    with open(f"{path}/{filename}", 'r') as f:
+    with open(path, 'r') as f:
        key_dict = json.load(f)
    for key in key_dict:
        aws_id = key
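The helper now takes a single full path instead of a directory/filename pair. A small sketch of preparing and reading a credentials file in the documented `{"AWS_ID": "AWS_SECRET"}` format, assuming the function above is in scope (path and values are placeholders):

```python
import json

# Write a placeholder credentials file in the expected single-entry format
with open("/tmp/aws_keys.json", "w") as f:
    json.dump({"AKIA-EXAMPLE-ID": "example-secret"}, f)

aws_id, aws_secret = aws_credentials_from_file("/tmp/aws_keys.json")
```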
@@ -35,7 +35,3 @@ def labeled_sentences_from_json(sents_json):
"""
return {sent_id: sent_labels_map for sent_id, sent_labels_map in [*sents_json.values()][0]["sentences"].items()}


## TODO: Move this function somewhere else?
def format_sents_for_output(sents, doc_id):
return {f"{doc_id}_sent_{i}": {"text": sent, "label": []} for i, sent in enumerate(sents)}