From a8d6b264193ab809c65328194046603f3aa4a91c Mon Sep 17 00:00:00 2001
From: akshowhini <33936764+akshowhini@users.noreply.github.com>
Date: Fri, 6 May 2022 17:09:29 -0400
Subject: [PATCH] Big files download (#49)

* Download big outputs

* [B]: Fix processing of split PDFs
---
 ExtractTable/FileOperations/__init__.py | 11 ++++-------
 ExtractTable/__init__.py                | 23 ++++++++++++++---------
 ExtractTable/__version__.py             |  2 +-
 requirements.txt                        |  3 ---
 setup.py                                |  3 +--
 5 files changed, 20 insertions(+), 22 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/ExtractTable/FileOperations/__init__.py b/ExtractTable/FileOperations/__init__.py
index 9e3b878..419301b 100644
--- a/ExtractTable/FileOperations/__init__.py
+++ b/ExtractTable/FileOperations/__init__.py
@@ -19,7 +19,7 @@ class CheckFile:
     def __init__(self, filepath: ty.Union[os.PathLike, str]):
         self.filepath = filepath
         self.type_error()
-        self.size_error()
+        self.is_big = self.is_big_size()
 
     def type_error(self) -> ty.Union[Exception, None]:
         """To check file extension"""
@@ -27,11 +27,9 @@ def type_error(self) -> ty.Union[Exception, None]:
             return
         raise ClientFileTypeError(Message=f"Allowed file types are {self.__SUPPORTED_EXTENSIONS__}")
 
-    def size_error(self) -> ty.Union[Exception, None]:
+    def is_big_size(self) -> bool:
         # 1027 to create some buffer
-        if os.stat(self.filepath).st_size <= self.__THRESHOLD_SIZE__*1027*1027:
-            return
-        raise ClientFileSizeError(Message=f"File Size greater than the threshold {self.__THRESHOLD_SIZE__} Mb.")
+        return os.stat(self.filepath).st_size > self.__THRESHOLD_SIZE__*1027*1027
 
 
 class PrepareInput:
@@ -55,11 +53,10 @@ def __init__(self, filepath: ty.Union[os.PathLike, str], pages: str):
             print("[Info]: Aggregating user defined pages..", self.pages)
             gather_pages = self._get_pages(self.filepath, pages)
             self.filepath = self.pdf_separator(gather_pages)
-        CheckFile(self.filepath)
 
     def pdf_separator(self, gather_pages: set):
         """PDF Splitter"""
-        merged_pdf = os.path.join(self.temp_dir, str(self.pages) + os.path.basename(self.filepath))
+        merged_pdf = os.path.join(self.temp_dir, str(self.pages) + "_" + os.path.basename(self.filepath))
         with open(merged_pdf, 'wb') as out_file:
             pdf_reader = PyPDF2.PdfFileReader(self.filepath)
             pdf_writer = PyPDF2.PdfFileWriter()
diff --git a/ExtractTable/__init__.py b/ExtractTable/__init__.py
index 4639cff..3e698ae 100644
--- a/ExtractTable/__init__.py
+++ b/ExtractTable/__init__.py
@@ -11,7 +11,7 @@
 
 import requests as rq
 
-from .FileOperations import PrepareInput
+from .FileOperations import PrepareInput, CheckFile
 from .config import HOST, JobStatus
 from .parsers import ValidateResponse
 from .common import ConvertTo
@@ -98,6 +98,10 @@ def get_result(self, job_id: str, wait_time: int = 10, max_wait_time: int = 300)
             time.sleep(max(10, int(wait_time)))
             max_wait_time -= wait_time
             resp = self._make_request('get', HOST.RESULT, params=params)
+        
+        if resp.get('DownloadUrl', ''):
+            self.ServerResponse = rq.get(resp['DownloadUrl'])
+            self.server_response = resp = self.ServerResponse.json()
 
         return resp
 
@@ -171,15 +175,16 @@ def process_file(
         # To use the reference when saving the output
         self.__setattr__('input_filename', os.path.basename(filepath))
 
-        try:
-            with PrepareInput(filepath, pages=pages) as infile:
-                with open(infile.filepath, 'rb') as fp:
+        with PrepareInput(filepath, pages=pages) as infile:
+            with open(infile.filepath, 'rb') as fp:
+                is_big_file = CheckFile(infile.filepath).is_big
+                if not is_big_file:
                     trigger_resp = self.trigger_process(fp, dup_check=dup_check, **kwargs)
-        except ClientFileSizeError:
-            big_gen = self.bigfile_upload(filepath=os.path.basename(filepath))
-            with open(filepath, 'rb') as ifile:
-                rq.post(big_gen['url'], data=big_gen['fields'], files={'file': ifile})
-            trigger_resp = self.trigger_process(None, signed_filename=big_gen["fields"]["key"], dup_check=dup_check, **kwargs)
+                else:
+                    big_gen = self.bigfile_upload(filepath=os.path.basename(infile.filepath))
+                    with open(infile.filepath, 'rb') as ifile:
+                        rq.post(big_gen['url'], data=big_gen['fields'], files={'file': ifile})
+                    trigger_resp = self.trigger_process(None, signed_filename=big_gen["fields"]["key"], dup_check=dup_check, **kwargs)
 
         for _type, _obj in trigger_resp.items():
             self.__setattr__(_type, _obj)
diff --git a/ExtractTable/__version__.py b/ExtractTable/__version__.py
index 8916c84..6ec7765 100644
--- a/ExtractTable/__version__.py
+++ b/ExtractTable/__version__.py
@@ -1,4 +1,4 @@
-VERSION = (2, 2, 0)
+VERSION = (2, 3, 1)
 PRERELEASE = None  # "alpha", "beta" or "rc"
 REVISION = None
 
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index a22737e..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-requests>=2.21
-pandas>=0.24
-PyPDF2>=1.26
diff --git a/setup.py b/setup.py
index 7945a94..604b965 100644
--- a/setup.py
+++ b/setup.py
@@ -10,8 +10,7 @@
 with open('README.md', 'r') as f:
     readme = f.read()
 
-with open("requirements.txt") as fh:
-    requires = [x.strip() for x in fh.readlines()]
+requires = ['requests>=2.21', 'pandas>=0.24', 'PyPDF2>=1.26']
 
 
 def setup_package():