From 68a0b413fb91350d33a7d1ac056337c24cbee9f0 Mon Sep 17 00:00:00 2001
From: Naser Mahfouz <naser.mahfouz@pnnl.gov>
Date: Sat, 23 Nov 2024 12:37:15 -0500
Subject: [PATCH] reorg

---
 e3sm-diags/Dockerfile        |  15 ++---
 e3sm-diags/download_files.py | 106 +++++++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+), 11 deletions(-)
 create mode 100644 e3sm-diags/download_files.py

diff --git a/e3sm-diags/Dockerfile b/e3sm-diags/Dockerfile
index 1640138..77d0b26 100644
--- a/e3sm-diags/Dockerfile
+++ b/e3sm-diags/Dockerfile
@@ -15,16 +15,9 @@ ENV TZ=America/Los_Angeles
 ENV LANGUAGE=en_US:en \
     LANG=en_US.UTF-8
 
-RUN wget https://github.com/E3SM-Project/e3sm_diags/archive/refs/heads/main.zip
-RUN unzip main.zip
-RUN cd e3sm_diags-main
-RUN python3 -m pip install .
-RUN python3 -m tests.integration.download_data
-RUN mkdir -p /e3sm_diags_downloaded_data
-RUN mv tests/integration/integration_test_data /e3sm_diags_downloaded_data/integration_test_data
-RUN mv tests/integration/expected /e3sm_diags_downloaded_data/expected
-RUN mv tests/integration/integration_test_data_20230523 /e3sm_diags_downloaded_data/integration_test_data_20230523
-RUN cd ..
-RUN rm -rf main.zip e3sm_diags-main
+RUN mkdir -p /e3sm_diags_downloaded_data/tests/integration
+
+COPY download_files.py /app/download_files.py
+RUN python3 /app/download_files.py
 
 ENTRYPOINT ["/bin/bash", "--rcfile", "/etc/profile", "-l"]
diff --git a/e3sm-diags/download_files.py b/e3sm-diags/download_files.py
new file mode 100644
index 0000000..ac0c87b
--- /dev/null
+++ b/e3sm-diags/download_files.py
@@ -0,0 +1,106 @@
+import os
+import re
+import urllib.request
+from typing import List
+
+# Defined locally rather than imported from e3sm_diags' tests.integration.config:
+# this script is copied into the image on its own, without that source tree.
+TEST_ROOT_PATH = "/e3sm_diags_downloaded_data/tests/integration/"  # local download root
+TEST_DATA_DIR = "integration_test_data"  # directory mirrored from .../integration
+TEST_IMAGES_DIR = "integration_test_images"  # directory mirrored from .../expected
+
+
+# https://stackoverflow.com/questions/49113616/how-to-download-file-using-python
+def retrieve_file(url, file_path):
+    """Download `url` to `file_path`, creating parent directories; return `file_path`."""
+    parent_dir = os.path.dirname(file_path)
+    if parent_dir:  # empty for a bare file name; nothing to create
+        os.makedirs(parent_dir, exist_ok=True)  # https://stackoverflow.com/q/12517451
+    print("Downloading {} to {}".format(url, file_path))
+    urllib.request.urlretrieve(url, file_path)
+    return file_path
+
+
+def download_files(url_prefix, url_suffix, directory_prefix=None):
+    """Recursively mirror the directory listing at ``url_prefix/url_suffix``.
+
+    The listing page is saved to a temporary ``.html`` file, scanned for
+    ``href`` links (at most one per line), and then deleted. Links ending in
+    "/" are treated as sub-directories and mirrored recursively; every other
+    accepted link is downloaded as a file.
+
+    Parameters
+    ----------
+    url_prefix : str
+        URL of the parent directory listing.
+    url_suffix : str
+        Name of the directory to mirror, relative to ``url_prefix``.
+    directory_prefix : str, optional
+        Local directory under which ``url_suffix`` is created. When omitted,
+        ``url_suffix`` is used as a path relative to the working directory.
+    """
+    print(f"Downloading {url_suffix}")
+    print("url_prefix={}".format(url_prefix))
+    print("url_suffix={}".format(url_suffix))
+    print("(local) directory_prefix={}".format(directory_prefix))
+
+    # NOTE: the URL is assembled with os.path.join, i.e. the OS path separator.
+    url = os.path.join(url_prefix, url_suffix)
+    local_dir = (
+        os.path.join(directory_prefix, url_suffix)
+        if directory_prefix
+        else url_suffix
+    )
+    # The listing page is stored next to the local directory it describes.
+    index_path = "{}.html".format(local_dir)
+    print(
+        "Downloading files from {}; checking for links on {}".format(
+            url, index_path
+        )
+    )
+
+    links: List[str] = []
+    with open(retrieve_file(url, index_path), "r") as listing:
+        for row in listing:
+            found = re.search(r'href=[\'"]?([^\'" >]+)', row)
+            if not found:
+                continue
+            link = found.group(1)
+            # Ignore parent directory and sorting links
+            if "../" in link or link.startswith("/") or "?" in link:
+                continue
+            print("Found a link: {}".format(link))
+            links.append(link)
+
+    if os.path.exists(index_path):
+        os.remove(index_path)
+
+    # A trailing slash marks a sub-directory; anything else is a file.
+    directories = [link for link in links if link.endswith("/")]
+    files = [link for link in links if not link.endswith("/")]
+
+    print("\n###Downloading files")
+    for name in files:
+        retrieve_file(os.path.join(url, name), os.path.join(local_dir, name))
+
+    print("\n###Downloading directories")
+    for name in directories:
+        download_files(url, name.rstrip("/"), directory_prefix=local_dir)
+
+
+def download():
+    """Mirror the integration test data and expected images under TEST_ROOT_PATH."""
+    base_url = "https://web.lcrc.anl.gov/public/e3sm/e3sm_diags_test_data/integration"
+    download_files(base_url, TEST_DATA_DIR, directory_prefix=TEST_ROOT_PATH)
+    download_files(
+        base_url + "/expected",
+        TEST_IMAGES_DIR,
+        directory_prefix=TEST_ROOT_PATH,
+    )
+    # Name both mirrored directories and the local root they were written to,
+    # so the summary covers everything that was fetched.
+    print(f"Downloaded {TEST_DATA_DIR} and {TEST_IMAGES_DIR} to {TEST_ROOT_PATH}")
+
+
+if __name__ == "__main__":  # entry point when the file is executed as a script
+    download()