diff --git a/compose/local/django/Dockerfile b/compose/local/django/Dockerfile index ced9cfa3c..c7ae85d0d 100644 --- a/compose/local/django/Dockerfile +++ b/compose/local/django/Dockerfile @@ -30,23 +30,23 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # set up makecert root CA RUN curl http://localhost/rootCA.pem > /usr/local/share/ca-certificates/rootCA.crt && update-ca-certificates -# heroku cli -RUN curl https://cli-assets.heroku.com/install.sh | sh - # install NVM ENV NVM_DIR /usr/local/nvm ENV NODE_VERSION 18.16.0 RUN mkdir $NVM_DIR RUN curl https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.3/install.sh | bash \ - && . $NVM_DIR/nvm.sh \ - && nvm install $NODE_VERSION \ - && nvm alias default $NODE_VERSION \ - && nvm use default + && . $NVM_DIR/nvm.sh \ + && nvm install $NODE_VERSION \ + && nvm alias default $NODE_VERSION \ + && nvm use default ENV NODE_PATH $NVM_DIR/v$NODE_VERSION/lib/node_modules ENV PATH $NVM_DIR/versions/node/v$NODE_VERSION/bin:$PATH +# heroku cli +RUN curl https://cli-assets.heroku.com/install.sh | sh + # Requirements are installed here to ensure they will be cached. COPY ./pip /pip #RUN --mount=type=cache,target=/root/.cache/pip pip install -r /pip/requirements.txt diff --git a/muckrock/foia/classifier.pkl b/muckrock/foia/classifier.pkl deleted file mode 100644 index 1ec2043c4..000000000 Binary files a/muckrock/foia/classifier.pkl and /dev/null differ diff --git a/muckrock/foia/tasks.py b/muckrock/foia/tasks.py index 5ed4b2077..84a5b5ecc 100644 --- a/muckrock/foia/tasks.py +++ b/muckrock/foia/tasks.py @@ -32,9 +32,7 @@ # Third Party import boto3 -import dill as pickle import lob -import numpy as np import requests from anymail.exceptions import AnymailError from constance import config @@ -46,7 +44,6 @@ from phaxio.exceptions import PhaxioError from raven import Client from raven.contrib.celery import register_logger_signal, register_signal -from scipy.sparse import hstack from zipstream import ZIP_DEFLATED, ZipFile # MuckRock @@ -389,32 +386,6 @@ def get_text_ocr(doc_id): return document.full_text - def get_classifier(): - """Load the pickled classifier""" - with open("muckrock/foia/classifier.pkl", "rb") as pkl_fp: - return pickle.load(pkl_fp) - - def predict_status(vectorizer, selector, classifier, text, pages): - """Run the prediction""" - input_vect = vectorizer.transform([text]) - pages_vect = np.array([pages], dtype=np.float).transpose() - input_vect = hstack([input_vect, pages_vect]) - input_vect = selector.transform(input_vect) - probs = classifier.predict_proba(input_vect)[0] - max_prob = max(probs) - status = classifier.classes_[list(probs).index(max_prob)] - return status, max_prob - - def resolve_if_possible(resp_task): - """Resolve this response task if possible based off of ML setttings""" - if config.ENABLE_ML and resp_task.status_probability >= config.CONFIDENCE_MIN: - try: - ml_robot = User.objects.get(username="mlrobot") - resp_task.set_status(resp_task.predicted_status) - resp_task.resolve(ml_robot, {"status": resp_task.predicted_status}) - except User.DoesNotExist: - logger.error("mlrobot account does not exist") - def resolve_gloo_if_possible(resp_task, extracted_data): """Resolve this response task if possible based off of ML setttings""" @@ -471,22 +442,6 @@ def resolve_gloo_if_possible(resp_task, extracted_data): # wait longer for document cloud classify_status.retry(countdown=60 * 30, args=[task_pk], kwargs=kwargs) - # old classify - full_text = resp_task.communication.communication + (" ".join(file_text)) - vectorizer, selector, classifier = get_classifier() - - status, prob = predict_status( - vectorizer, selector, classifier, full_text, total_pages - ) - - if not (config.ENABLE_GLOO and config.USE_GLOO): - resp_task.predicted_status = status - resp_task.status_probability = int(100 * prob) - - resolve_if_possible(resp_task) - - resp_task.save() - # new classify if config.ENABLE_GLOO: try: @@ -494,8 +449,6 @@ def resolve_gloo_if_possible(resp_task, extracted_data): process_request( resp_task.communication.communication, "\n\n".join(file_text), - mlrobot_status=status, - mlrobot_prob=str(int(100 * prob)), task_url=settings.MUCKROCK_URL + resp_task.get_absolute_url(), request_url=settings.MUCKROCK_URL + resp_task.communication.foia.get_absolute_url(), diff --git a/muckrock/foia/tests/test_classification.py b/muckrock/foia/tests/test_classification.py index 25dcbd09c..ebf523ff8 100644 --- a/muckrock/foia/tests/test_classification.py +++ b/muckrock/foia/tests/test_classification.py @@ -7,25 +7,41 @@ # Third Party import nose.tools +from constance.test import override_config from mock import Mock, patch # MuckRock +from muckrock.core.factories import UserFactory from muckrock.foia.factories import FOIACommunicationFactory from muckrock.foia.tasks import classify_status from muckrock.task.factories import ResponseTaskFactory +@override_config(ENABLE_GLOO=True, USE_GLOO=True) class TestFOIAClassify(TestCase): """Test the classification of a new communication""" - @patch("asyncio.run", Mock()) + @patch( + "asyncio.run", + Mock( + return_value=( + Mock( + trackingNumber=None, + price=None, + dateEstimate=None, + ), + "processed", + ) + ), + ) def test_classifier(self): """Classifier should populate the fields on the response task""" + UserFactory(username="gloo") comm = FOIACommunicationFactory( communication="Here are your responsive documents" ) task = ResponseTaskFactory(communication=comm) classify_status.apply(args=(task.pk,), throw=True) task.refresh_from_db() - nose.tools.ok_(task.predicted_status) - nose.tools.ok_(task.status_probability) + nose.tools.eq_(task.predicted_status, "processed") + nose.tools.ok_(task.resolved) diff --git a/pip/requirements.in b/pip/requirements.in index bd71c2837..f222318bc 100644 --- a/pip/requirements.in +++ b/pip/requirements.in @@ -6,7 +6,6 @@ bleach # Used to sanitize any HTML we're rendering boto3 # Used to access AWS celery # Used to run async tasks chardet # Detect character encodings for user uploaded text files -dill # Used to serialize machine learning data django-activity-stream # Used for notifications django-anymail[mailgun] # Use for sending email on production django-autocomplete-light==3.9.0rc5 # Autocomplete drop down inputs @@ -52,7 +51,6 @@ lob # sending mail via lob.com markdown # Used for rendering Markdown, obviously! memoize # cachable properties newrelic # Interface to newrelic service -numpy # Used by machine learning pandas # Used by gloo pdfrw # Used for reading/writing PDFs for form filling in phonenumberslite # Library for validating and formatting phone numbers @@ -75,8 +73,6 @@ redis # Redis integration - for use with celery reportlab # Used for adding text to PDFs for form filling in requests # HTTP for humans rules # Rule based permissions -scikit-learn # Used for machine learning -scipy # Used for machine learning scout-apm # performance monitoring simplejson # json decoder for requests smart-open # Use for streaming files from S3 diff --git a/pip/requirements.txt b/pip/requirements.txt index 22de2778b..36954fb4c 100644 --- a/pip/requirements.txt +++ b/pip/requirements.txt @@ -35,7 +35,6 @@ cssselect==0.9.2 # via premailer cssutils==2.7.1 # via premailer decorator==4.3.0 # via ipython, traitlets defusedxml==0.7.0rc1 # via python3-openid, social-auth-core -dill==0.3.2 # via -r pip/requirements.in django==4.2 # via -r pip/requirements.in, django-activity-stream, django-anymail, django-appconf, django-celery-email, django-choices, django-cors-headers, django-debug-toolbar, django-extensions, django-filter, django-hijack, django-localflavor, django-news-sitemaps, django-opensearch, django-phonenumber-field, django-picklefield, django-redis, django-reversion, django-sslify, django-storages, django-taggit, djangorestframework, dogslow, drf-nested-routers, easy-thumbnails, jsonfield django-activity-stream==1.4.2 # via -r pip/requirements.in django-anymail[mailgun]==9.1 # via -r pip/requirements.in @@ -90,7 +89,6 @@ ipython==7.16.1 # via -r pip/requirements.in ipython-genutils==0.2.0 # via traitlets jedi==0.17.1 # via ipython jmespath==0.10.0 # via boto3, botocore -joblib==0.16.0 # via scikit-learn jsonfield==2.0.2 # via -r pip/requirements.in kombu==4.6.11 # via celery listcrunch==1.0.1 # via python-documentcloud @@ -100,7 +98,7 @@ markdown==3.2.2 # via -r pip/requirements.in, pymdown-extensions memoize==1.0.0 # via -r pip/requirements.in multidict==6.0.4 # via aiohttp, yarl newrelic==2.70.0.51 # via -r pip/requirements.in -numpy==1.19.0 # via -r pip/requirements.in, pandas, scikit-learn, scipy +numpy==1.24.4 # via pandas oauth2client==4.1.2 # via google-api-python-client oauthlib==2.1.0 # via requests-oauthlib, social-auth-core openai==0.28.0 # via gloo-lib @@ -159,8 +157,6 @@ rjsmin==1.2.1 # via django-compressor rsa==3.4.2 # via oauth2client, python-jose rules==2.2 # via -r pip/requirements.in s3transfer==0.6.1 # via boto3 -scikit-learn==0.23.1 # via -r pip/requirements.in -scipy==1.5.0 # via -r pip/requirements.in, scikit-learn scout-apm==2.16.2 # via -r pip/requirements.in scrapelib==2.2.0 # via govqa simplejson==3.16.0 # via -r pip/requirements.in @@ -172,7 +168,6 @@ social-auth-core[openidconnect]==4.1.0 # via -r pip/requirements.in, social-aut sorl-thumbnail==12.9.0 # via -r pip/requirements.in sqlparse==0.4.4 # via django, django-debug-toolbar stripe==1.75.0 # via -r pip/requirements.in -threadpoolctl==2.1.0 # via scikit-learn tiktoken==0.5.1 # via -r pip/requirements.in tomli==2.0.1 # via pytest tqdm==4.66.1 # via openai @@ -180,7 +175,6 @@ traitlets==4.3.2 # via ipython types-requests==2.31.0.5 # via gloo-lib types-urllib3==1.26.25.14 # via types-requests typing-extensions==4.8.0 # via annotated-types, pydantic, pydantic-core, pypdf -tzdata==2023.3 # via pandas unidecode==0.4.19 # via -r pip/requirements.in uritemplate==3.0.0 # via google-api-python-client urllib3[secure]==1.26.16 # via botocore, python-documentcloud, requests, scout-apm, scrapelib