Skip to content

Commit

Permalink
AliYun OSS download for CN mainland
Browse files Browse the repository at this point in the history
fix #113
  • Loading branch information
HowcanoeWang committed Aug 19, 2024
1 parent 3abdd81 commit aab916c
Show file tree
Hide file tree
Showing 6 changed files with 358 additions and 51 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@ docs/jupyter/.ipynb_checkpoints/*
dist/*
easyidp.egg-info/*
dev.notes/*
tests/out/*
tests/out/*
.pypirc
28 changes: 27 additions & 1 deletion easyidp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
__version__ = "2.0.0"

import os
import sys
import subprocess
import warnings
import numpy as np
from pathlib import Path
Expand Down Expand Up @@ -238,4 +240,28 @@ def parse_relative_path(root_path, relative_path):
from .geotiff import GeoTiff
from .pix4d import Pix4D
from .metashape import Metashape
from .roi import ROI
from .roi import ROI

########################
# Dataset region check #
########################

# Shared AliYunDownloader instance; created lazily by easyidp.data the first
# time a China-mainland download is requested, stays None until then.
aliyun_down = None
# True when the Google Drive test file is reachable (Google Drive is the
# default dataset host; Aliyun OSS is the mainland-China fallback).
GOOGLE_AVAILABLE = True

if not data._can_access_google_cloud():
    GOOGLE_AVAILABLE = False

# `oss2` is the Aliyun OSS SDK used by the fallback download backend.
# NOTE(review): installing a package via pip at import time is a heavy side
# effect (network access, mutates the user's environment); consider shipping
# oss2 as an optional extra instead -- confirm with maintainer.
try:
    import oss2
except ImportError:
    print("oss2 is not installed. Installing now...")
    # install from the Tsinghua PyPI mirror, which is reachable from mainland China
    subprocess.check_call([sys.executable, "-m", "pip", "install", "oss2", "-i", "https://pypi.tuna.tsinghua.edu.cn/simple"])
    print("oss2 has been installed.")

    # retry the import once after installation; point to a manual fix otherwise
    try:
        import oss2
    except ImportError:
        raise ImportError(
            "Failed to import oss2 after installation, please manually install `oss2` package by:\n"
            "pip install oss2 -i https://pypi.tuna.tsinghua.edu.cn/simple")
212 changes: 170 additions & 42 deletions easyidp/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
import sys
import shutil
import zipfile
import gdown
import requests
import subprocess
import webbrowser
import tqdm
from pathlib import Path

GDOWN_TEST_URL = "https://drive.google.com/file/d/1yWvIOYJ1ML-UGleh3gT5b7dxXzBuSPgQ/view?usp=share_link"

def user_data_dir(file_name=""):
r"""Get OS specific data directory path for EasyIDP.
Expand Down Expand Up @@ -110,6 +112,9 @@ def url_checker(url):
# print URL with Errs
return False

def _can_access_google_cloud():
    """Return True when the Google Drive test-file URL responds, False otherwise."""
    reachable = url_checker(GDOWN_TEST_URL)
    return reachable


def download_all():
"""download all datasets
Expand All @@ -119,11 +124,136 @@ def download_all():
test = TestData()


class AliYunDownloader():
    """Fallback downloader for EasyIDP datasets hosted on Aliyun OSS.

    Used for China-mainland users when Google Drive is unreachable. The
    bucket traffic is paid by the maintainer, so ``download_auth`` forces the
    user to manually type a cost-acknowledgement string before downloading.
    """

    def __init__(self):
        """Fetch the OSS credentials and open the bucket.

        Raises
        ------
        ConnectionRefusedError
            If the credential file can not be fetched or parsed.
        """
        # imported lazily so the package still works without oss2 installed
        # when Google Drive is available
        import oss2

        # NOTE(review): the access key pair is served as plain text from the
        # public bucket -- presumably the key is read-only/cost-limited,
        # confirm with maintainer.
        access_url = "https://easyidp-data.oss-rg-china-mainland.aliyuncs.com/access.txt"

        # timeout prevents an indefinite hang when OSS is unreachable
        response = requests.get(access_url, timeout=30)

        # tqdm progress bar, created lazily by the first progress callback
        self.tqdm_bar = None

        if response.status_code == 200:
            # first non-empty line = secret, second = key id;
            # splitlines() tolerates both LF and CRLF line endings
            # (the previous split('\r\n') broke on LF-only files)
            lines = [ln.strip() for ln in response.text.splitlines() if ln.strip()]
            if len(lines) < 2:
                raise ConnectionRefusedError(
                    "Malformed AliYun Auth token file, please contact the maintainer via github."
                )
            access_key_secret, access_key_id = lines[0], lines[1]

            self.bucket_name = 'easyidp-data'
            self.endpoint = 'oss-rg-china-mainland.aliyuncs.com'

            self.bucket = oss2.Bucket(
                oss2.Auth(access_key_id, access_key_secret),
                self.endpoint, self.bucket_name
            )
        else:
            raise ConnectionRefusedError(
                f"Failed to achieve AliYun Auth token, Please contact the maintainer via github.\n"
                f"Status code: {response.status_code}"
            )

    @staticmethod
    def calculate_download_cost(dataset_name, dataset_size):
        """Estimate the outbound-traffic cost (CNY) of downloading one dataset.

        Parameters
        ----------
        dataset_name : str
            Dataset name, only used in error messages.
        dataset_size : str
            Human-readable size such as ``"0.2KB"``, ``"344MB"`` or ``"3.3GB"``.

        Returns
        -------
        float
            Estimated cost in CNY, rounded to one decimal place.

        Raises
        ------
        ValueError
            If ``dataset_size`` is not a ``<number><KB|MB|GB>`` string.
        """
        import re
        import random

        # base OSS outbound price plus a small random safety margin (CNY/GB)
        cost_per_gb = 0.12 + 0.5 + 0.1 * random.random()

        # fullmatch (instead of match) rejects trailing garbage like "3.3GBx"
        match = re.fullmatch(r'(\d+(\.\d+)?)([KMG]B)', dataset_size.strip())
        if not match:
            raise ValueError(f"Invalid dataset size format of {dataset_name}.size = {dataset_size}")

        value = float(match.group(1))
        unit = match.group(3)

        # the regex already restricts unit to KB / MB / GB
        gb_divisor = {'KB': 1024 * 1024, 'MB': 1024, 'GB': 1}
        value_in_gb = value / gb_divisor[unit]

        cost = value_in_gb * cost_per_gb
        return round(cost, 1)

    def download_auth(self, dataset_name, dataset_size):
        """Ask the user to acknowledge the download cost by typing a phrase.

        Zero-width spaces are inserted into the displayed phrase so it can not
        simply be copy-pasted; the user must type it with an input method.

        Returns
        -------
        bool
            True once the user typed the phrase correctly.

        Raises
        ------
        PermissionError
            After 5 failed attempts.
        """
        money_cost = self.calculate_download_cost(dataset_name, dataset_size)
        confirm_str = f"我已知悉此次下载会消耗{money_cost}元的下行流量费用"

        # insert invisible characters (zero-width spaces) to defeat copy-paste
        invis_char = '\u200B'
        no_copy_confirm_str = confirm_str[0:10] + invis_char + confirm_str[10:20] + invis_char + confirm_str[20:]

        # ANSI escape sequences for red/bold terminal output
        RED = '\033[91m'
        BOLD = '\033[1m'
        END = '\033[0m'

        notification = (
            f"{RED}{BOLD}请注意,中国大陆数据集下载使用作者私人搭建的阿里云文件存储服务,\n"
            f"下载数据集会产生一定的流量费用(下载当前数据集{dataset_name}会消耗大约{money_cost}元),\n"
            f"此费用由作者本人负担,请勿在非必要的情况下重复下载或将此数据存储仓库用于其他用途\n\n"
            f"如果同意以上内容,请在下方用输入法输入(复制无效):\n{no_copy_confirm_str}{END}"
        )
        print(notification)

        # up to 5 attempts, then abort
        for _ in range(5):
            user_confirm = input(">>> ")
            if user_confirm == confirm_str:
                return True
            print("输入的字符不匹配,请用输入法再次输入\n")

        raise PermissionError("尝试失败次数超过5次,请重新运行脚本")

    def tqdm_progress_bar(self, consumed_bytes, total_bytes):
        """oss2 ``progress_callback``: mirror download progress into a tqdm bar.

        ``total_bytes`` may be falsy for streams of unknown length, in which
        case no bar is shown.
        """
        if total_bytes:
            if self.tqdm_bar is None:
                # create the bar lazily, once the total size is known
                self.tqdm_bar = tqdm.tqdm(total=total_bytes, unit='B', unit_scale=True, desc='Downloading from Aliyun OSS')

            # oss2 reports cumulative consumed bytes, not deltas, so set the
            # absolute position instead of incrementing
            self.tqdm_bar.n = consumed_bytes
            self.tqdm_bar.refresh()

    def download(self, dataset_name, output):
        """Download ``<dataset_name>.zip`` from the bucket to ``output``.

        Uses oss2 resumable download, so an interrupted transfer continues
        where it stopped.
        """
        import oss2

        # reset so each download gets a fresh progress bar
        self.tqdm_bar = None
        oss2.resumable_download(self.bucket, dataset_name + '.zip', output,
                                progress_callback=self.tqdm_progress_bar)

        # close the bar after the download finishes (it only exists when
        # progress was actually reported)
        if self.tqdm_bar:
            self.tqdm_bar.close()


class EasyidpDataSet():
"""The base class for Dataset
"""

def __init__(self, name="", url_list=[], size="",):
def __init__(self, name="", gdrive_url="", size="",):
"""The dataset has the following properties (almost in string type)
name
Expand Down Expand Up @@ -166,7 +296,7 @@ def __init__(self, name="", url_list=[], size="",):
"""
self.name = name
self.url_list = url_list
self.grive_url = gdrive_url
self.size = size
self.data_dir = user_data_dir(self.name)
self.zip_file = user_data_dir(self.name + ".zip")
Expand All @@ -181,7 +311,7 @@ def load_data(self):
if not os.path.exists(self.data_dir):

if not os.path.exists(self.zip_file):
out = self._download_data()
self._download_data()

if os.path.exists(self.zip_file):
print("Successfully downloaded, start unzipping ...")
Expand Down Expand Up @@ -211,23 +341,34 @@ def _download_data(self):
"""
# Download; extract data to disk.
# Raise an exception if the link is bad, or we can't connect, etc.

if url_checker(self.url_list[0]): # google drive
output = gdown.download(url=self.url_list[0], output=str(self.zip_file), quiet=False, fuzzy=True)
elif url_checker(self.url_list[1]): # cowtransfer
print(
f"Could not access to default google drive download link <{self.url_list[1]}>."
f"Please download the file in browser and unzip to the popup folder "
f"[{str(user_data_dir())}]"
)
# open url
webbrowser.open(self.url_list[1], new=0, autoraise=True)
# open folder in file explorer
show_data_dir()
import easyidp as idp

if idp.GOOGLE_AVAILABLE:
import gdown
# google drive gdown_test.zip file is accessable
# then try according google drive download link
if url_checker(self.grive_url):
output = gdown.download(url=self.grive_url, output=str(self.zip_file), quiet=False, fuzzy=True)
else:
# user can access Google Drive but maybe dataset zip file is missing, no waste AliYun OSS resource
# just mention user to double check google drive access
raise ConnectionError(
f"Could not access google download link for dataset {self.name} from: \n{self.grive_url}.\n"
f"Please contact the maintainer via github if above link is broken."
)
else:
raise ConnectionError("Could not find any downloadable link. Please contact the maintainer via github.")

return output
# high possibility in China Mainland, use aliyun OSS for downloading
is_mainland_user = input("Google Drive Unaccessable, are you locate in China Mainland? (Y/N)\n>>> ")
if is_mainland_user in ["Yes", "Y", "y", "yes"]:
if idp.aliyun_down is None:
idp.aliyun_down = AliYunDownloader()

idp.aliyun_down.download_auth(dataset_name=self.name, dataset_size=self.size)
idp.aliyun_down.download(dataset_name=self.name, output=str(self.zip_file))
else:
raise ConnectionError(
f"Could not find proper downloadable link for dataset {self.name}.\n"
f"Please contact the maintainer via github.")

def _unzip_data(self):
"""Unzip downloaded zip data and remove after decompression
Expand All @@ -240,7 +381,7 @@ def _unzip_data(self):
os.remove(self.zip_file)
else:
raise FileNotFoundError("Seems fail to unzip, please check whether the zip file is fully downloaded.")


class ReconsProj():

Expand Down Expand Up @@ -271,12 +412,9 @@ class Lotus(EasyidpDataSet):
- **Outputs** : DOM, DSM, PCD
"""

url_list = [
"https://drive.google.com/file/d/1SJmp-bG5SZrwdeJL-RnnljM2XmMNMF0j/view?usp=share_link",
"https://fieldphenomics.cowtransfer.com/s/9a87698f8d3242"
]
name = "2017_tanashi_lotus"
size = "3.3GB"
gdrive_url = "https://drive.google.com/file/d/1SJmp-bG5SZrwdeJL-RnnljM2XmMNMF0j/view?usp=share_link"

def __init__(self):
"""
Expand Down Expand Up @@ -346,12 +484,10 @@ class ForestBirds(EasyidpDataSet):
- **Software** : Metashape
- **Outputs** : DOM, DSM, PCD
"""
url_list = [
"https://drive.google.com/file/d/1mXkzaoSSCAA87cxcMHKL6_VNlykRYxJr/view?usp=sharing",
"https://fieldphenomics.cowtransfer.com/s/7709bf78fd6145"
]

name = "2022_florida_forestbirds"
size = "5.4GB"
size = "1.97GB"
gdrive_url = "https://drive.google.com/file/d/1mXkzaoSSCAA87cxcMHKL6_VNlykRYxJr/view?usp=sharing"

def __init__(self):
"""
Expand Down Expand Up @@ -390,33 +526,25 @@ def __init__(self):
self.metashape.dsm = self.data_dir / "Hidden_Little_03_24_2022_DEM.tif"

class GDownTest(EasyidpDataSet):

url_list = [
"https://drive.google.com/file/d/1yWvIOYJ1ML-UGleh3gT5b7dxXzBuSPgQ/view?usp=share_link",
"https://fieldphenomics.cowtransfer.com/s/b5a469fab5dc48"
]
"""The data for Google Drive and Aliyun OSS download testing
"""

def __init__(self):

super().__init__("gdown_test", self.url_list, "0.2KB")
super().__init__("gdown_test", GDOWN_TEST_URL, "0.2KB")
super().load_data()

self.pix4d.proj = self.data_dir / "file1.txt"
self.metashape.param = self.data_dir / "folder1"



class TestData(EasyidpDataSet):
"""The data for developer and package testing.
"""

url_list = [
"https://drive.google.com/file/d/17b_17CofqIuCVOWMnD67_wOnWMtwF8bw/view?usp=share_link",
"https://fieldphenomics.cowtransfer.com/s/edaf0826b02548"
]

name = "data_for_tests"
size = "344MB"
gdrive_url = "https://drive.google.com/file/d/17b_17CofqIuCVOWMnD67_wOnWMtwF8bw/view?usp=share_link"

def __init__(self, test_out="./tests/out"):
"""
Expand Down
5 changes: 4 additions & 1 deletion requirements/build.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
packaging>=20.0
wheel
numpy>=1.19
numpy>=1.19
build
setuptools
twine
5 changes: 4 additions & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@
if not out_dir.exists():
out_dir.mkdir()

out_folders = ["json_test", "pcd_test", "cv_test", "tiff_test", "visual_test", "back2raw_test"]
out_folders = [
"json_test", "pcd_test", "cv_test", "tiff_test",
"visual_test", "back2raw_test", "data_test"
]

for o in out_folders:
sub_dir = out_dir / o
Expand Down
Loading

0 comments on commit aab916c

Please sign in to comment.