-
Notifications
You must be signed in to change notification settings - Fork 3.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
c848767
commit 5eaa65d
Showing
2 changed files
with
397 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
name: Check Python Dependencies | ||
|
||
on: | ||
pull_request: | ||
|
||
defaults: | ||
run: | ||
shell: bash # strict bash | ||
|
||
jobs: | ||
check_dependencies: | ||
runs-on: ubuntu-latest | ||
|
||
steps: | ||
- name: Checkout Repository | ||
uses: actions/checkout@v3 | ||
|
||
- name: Set up Python | ||
uses: actions/setup-python@v4 | ||
with: | ||
python-version: '3.8' | ||
|
||
- name: Run Python script | ||
run: python find_dependencies.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,373 @@ | ||
""" | ||
Spider and catalog dependencies. | ||
$ export OLIVE_DIRS=$(gittreeif origin/open-release/olive.master -q pwd) | ||
$ python find_deps.py $OLIVE_DIRS | ||
""" | ||
|
||
import concurrent.futures | ||
import contextlib | ||
import functools | ||
import itertools | ||
import json | ||
import os | ||
import re | ||
import shlex | ||
import shutil | ||
import subprocess | ||
import sys | ||
import tarfile | ||
import time | ||
import zipfile | ||
from pathlib import Path | ||
from typing import Iterable, Optional, Tuple | ||
|
||
import requests | ||
from rich.progress import Progress, MofNCompleteColumn | ||
|
||
# pylint: disable=unspecified-encoding | ||
|
||
cached = functools.lru_cache(maxsize=0) | ||
|
||
@contextlib.contextmanager | ||
def change_dir(new_dir): | ||
""" | ||
Change to a new directory, and then change back. | ||
Will make the directory if needed. | ||
""" | ||
old_dir = os.getcwd() | ||
new_dir = Path(new_dir) | ||
new_dir.mkdir(parents=True, exist_ok=True) | ||
os.chdir(new_dir) | ||
try: | ||
yield new_dir | ||
finally: | ||
os.chdir(old_dir) | ||
|
||
|
||
def run_command(cmd: str, outfile=None) -> Tuple[bool, str]: | ||
""" | ||
Run a command line (with no shell). Write the output to a file. | ||
Returns a tuple: | ||
bool: true if the command succeeded. | ||
str: the output of the command. | ||
""" | ||
proc = subprocess.run( | ||
shlex.split(cmd, posix=False), | ||
shell=False, | ||
check=False, | ||
stdout=subprocess.PIPE, | ||
stderr=subprocess.STDOUT, | ||
) | ||
output = proc.stdout.decode("utf-8") | ||
if outfile: | ||
Path(outfile).write_text(output) | ||
|
||
return proc.returncode == 0, output.strip() | ||
|
||
|
||
@cached | ||
def npm_repo_url(npm_spec: str) -> Optional[str]: | ||
"""Given 'jspkg@0.1.0', return a repo url.""" | ||
pkg, _, ver = npm_spec.rpartition("@") | ||
url = f"https://registry.npmjs.org/{pkg}/{ver}" | ||
try: | ||
resp = requests.get(url, timeout=60) | ||
if resp.status_code != 200: | ||
print(f"{npm_spec}: {url} -> {resp.status_code}") | ||
return None | ||
jdata = resp.json() | ||
except requests.RequestException as exc: | ||
print(f"Couldn't fetch npm data for {npm_spec}: {exc}") | ||
return None | ||
repo = jdata.get("repository") | ||
if repo is None: | ||
return None | ||
if isinstance(repo, dict): | ||
repo = repo["url"] | ||
return repo | ||
|
||
def canonical_url(url: str) -> str: | ||
"""Canonicalize a repo URL, probably on GitHub.""" | ||
for pat, repl in [ | ||
(r"^git\+", ""), | ||
(r"#.$", ""), | ||
(r"\.git$", ""), | ||
(r"^git:", "https:"), | ||
(r"^ssh://git@", "https://"), | ||
(r"^git@github.com:", "https://github.com/"), | ||
]: | ||
url = re.sub(pat, repl, url) | ||
if ":" not in url and url.count("/") == 1: | ||
url = f"https://github.com/{url}" | ||
return url | ||
|
||
@cached | ||
def find_real_url(url: str) -> Optional[str]: | ||
"""Find the eventual real url for a redirected url.""" | ||
while True: | ||
try: | ||
resp = requests.head(url, timeout=60, allow_redirects=True) | ||
except requests.RequestException as exc: | ||
print(f"Couldn't fetch {url}: {exc}") | ||
return None | ||
if resp.status_code == 429: | ||
# I didn't know you could get 429 from https://github.com, but you can... | ||
wait = int(resp.headers.get("Retry-After", 10)) | ||
time.sleep(wait + 1) | ||
else: | ||
break | ||
|
||
if resp.status_code == 200: | ||
return resp.url | ||
return None | ||
|
||
|
||
WORK_DIR = Path("/tmp/unpack_reqs") | ||
|
||
def parallel_map(func, data, description): | ||
"""Run func over data using threads, with a progress bar.""" | ||
data = list(data) | ||
n_workers = os.cpu_count() or 1 | ||
with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor: | ||
with Progress(*Progress.get_default_columns(), MofNCompleteColumn()) as progress: | ||
pbar = progress.add_task(f"{description:20}", total=len(data)) | ||
for result in executor.map(func, data): | ||
progress.update(pbar, advance=1) | ||
yield result | ||
|
||
def write_list(path: str, lines: Iterable[str]): | ||
"""Write a list of strings to a file.""" | ||
with Path(path).open("w") as flist: | ||
for line in lines: | ||
print(line, file=flist) | ||
|
||
def check_js_dependencies() -> Iterable[str]: | ||
"""Check the JS dependencies in package-lock.json, returning a set of repo URLs.""" | ||
print("Checking JavaScript dependencies") | ||
with Path("package-lock.json").open() as lockf: | ||
lock_data = json.load(lockf) | ||
|
||
deps = set() | ||
for name, pkg in lock_data["packages"].items(): | ||
name = pkg.get("name") or name | ||
name = name.rpartition("node_modules/")[-1] | ||
version = pkg.get("version") | ||
if version is None: | ||
continue | ||
deps.add(f"{name}@{version}") | ||
write_list("deps.txt", sorted(deps)) | ||
|
||
urls = set() | ||
for url in parallel_map(npm_repo_url, deps, "Getting npm URLs"): | ||
if url: | ||
urls.add(canonical_url(url)) | ||
|
||
real_urls = set() | ||
for url in parallel_map(find_real_url, urls, "Getting real URLs"): | ||
if url: | ||
real_urls.add(url) | ||
|
||
print(f"{len(deps)} deps, {len(urls)} urls, {len(real_urls)} real urls") | ||
write_list("repo_urls.txt", sorted(real_urls)) | ||
return real_urls | ||
|
||
def check_py_dependencies() -> Iterable[str]: | ||
"""Check the Python dependencies in base.txt, returning a set of repo URLs.""" | ||
print("Checking Python dependencies") | ||
|
||
print("Creating venv") | ||
run_command("python3 -m venv .venv", "make_venv.log") | ||
run_command(".venv/bin/python3 -m pip install -U pip", "pip_upgrade.log") | ||
print("Downloading packages") | ||
run_command(".venv/bin/python3 -m pip download --dest files -r base.txt", "pip_download.log") | ||
|
||
urls = set() | ||
for url in parallel_map(repo_url_from_wheel, Path("files").glob("*.whl"), "Examining wheels"): | ||
if url: | ||
urls.add(canonical_url(url)) | ||
|
||
for url in parallel_map(repo_url_from_tgz, Path("files").glob("*.tar.gz"), "Examining tar.gz"): | ||
if url: | ||
urls.add(canonical_url(url)) | ||
|
||
with open("base.txt") as fbase: | ||
for line in fbase: | ||
if match := re.search(r"https://github.com[^@ #]*(\.git)?", line): | ||
urls.add(canonical_url(match[0])) | ||
|
||
real_urls = set() | ||
for url in parallel_map(find_real_url, urls, "Getting real URLs"): | ||
if url: | ||
real_urls.add(url) | ||
|
||
write_list("repo_urls.txt", sorted(real_urls)) | ||
return real_urls | ||
|
||
def matching_text(text, regexes): | ||
"""Find a line in text matching a regex, and return the first regex group.""" | ||
for regex in regexes: | ||
for line in text.splitlines(): | ||
if match := re.search(regex, line): | ||
return match[1] | ||
return None | ||
|
||
@cached | ||
def repo_url_from_wheel(wheel_path: str) -> Optional[str]: | ||
"""Read metadata from a .whl file, returning the repo URL.""" | ||
with zipfile.ZipFile(wheel_path) as whl_file: | ||
fmetadata = next((f for f in whl_file.namelist() if f.endswith("/METADATA")), None) | ||
if fmetadata is None: | ||
print(f"No metadata in {wheel_path}") | ||
return None | ||
with whl_file.open(fmetadata) as inner_file: | ||
metadata = inner_file.read().decode("utf-8") | ||
return repo_url_from_metadata(wheel_path, metadata) | ||
|
||
@cached | ||
def repo_url_from_tgz(tgz_path: str) -> Optional[str]: | ||
"""Read metadata from a .tar.gz file, returning the repo URL.""" | ||
with tarfile.open(tgz_path) as tgz_file: | ||
fmetadata = next((f for f in tgz_file.getnames() if f.endswith("/PKG-INFO")), None) | ||
if fmetadata is None: | ||
print(f"No metadata in {tgz_path}") | ||
return None | ||
metadata = tgz_file.extractfile(fmetadata).read().decode("utf-8") | ||
return repo_url_from_metadata(tgz_path, metadata) | ||
|
||
|
||
SOURCE_URL_REGEXES = [ | ||
# These regexes are tried in order. The first group is the extracted URL. | ||
r"(?i)^Project-URL: Source.*,\s*(.*)$", | ||
r"(?i)^Home-page: (.*)$", | ||
r"(?i)^Project-URL: Home.*,\s*(.*)$", | ||
# If they point to GitHub issues, that's their repo. | ||
r"(?i)^Project-URL: [^,]+,\s*(https?://github.com/[^/]+/[^/]+)/issues/?$", | ||
# If we can't find a URL marked as home, then use any GitHub repo URL. | ||
r"(?i)^Project-URL: [^,]+,\s*(https?://github.com/[^/]+/[^/]+)$", | ||
] | ||
|
||
def repo_url_from_metadata(filename, metadata): | ||
"""Find the likely source repo URL from PyPI metadata.""" | ||
repo_url = matching_text(metadata, SOURCE_URL_REGEXES) | ||
if repo_url is None: | ||
print(f"No repo URL in {filename}") | ||
return None | ||
if repo_url == "UNKNOWN": | ||
print(f"Repo URL is UNKNOWN in {filename}") | ||
return None | ||
return repo_url | ||
|
||
# The first of these we find is the requirements file we'll examine: | ||
PY_REQS = [ | ||
"requirements/edx/base.txt", | ||
"requirements/base.txt", | ||
"requirements.txt", | ||
] | ||
|
||
# Files that indicate a repo is a Python project: | ||
PY_INDICATORS = [ | ||
"setup.py", | ||
"setup.cfg", | ||
"pyproject.toml", | ||
] | ||
|
||
def find_py_reqs(): | ||
"""Find the Python requirements file to use.""" | ||
for fname in PY_REQS: | ||
possible_req = Path(fname) | ||
if possible_req.exists(): | ||
return possible_req | ||
if any(Path(ind).exists() for ind in PY_INDICATORS): | ||
print(f"WARNING: {os.getcwd()} is likely a Python package, but we can't find its dependencies.") | ||
return None | ||
|
||
|
||
def process_directory(): | ||
""" | ||
Find all the requirements in the current directory. Returns a set of repo URLs. | ||
Also copies the considered dependencies file into the temp work directory, | ||
for later analysis. | ||
""" | ||
repo_name = Path.cwd().name | ||
repo_work = WORK_DIR / repo_name | ||
repo_work.mkdir(parents=True, exist_ok=True) | ||
repo_urls = set() | ||
if (js_reqs := Path("package-lock.json")).exists(): | ||
shutil.copyfile(js_reqs, repo_work / "package-lock.json") | ||
with change_dir(repo_work): | ||
repo_urls.update(check_js_dependencies()) | ||
if (py_reqs := find_py_reqs()): | ||
shutil.copyfile(py_reqs, repo_work / "base.txt") | ||
with change_dir(repo_work): | ||
repo_urls.update(check_py_dependencies()) | ||
return repo_urls | ||
|
||
FIRST_PARTY_ORGS = ["openedx"] | ||
|
||
SECOND_PARTY_ORGS = [ | ||
"edx", "edx-unsupported", "edx-solutions", | ||
"mitodl", | ||
"overhangio", | ||
"open-craft", "eduNEXT", "raccoongang", | ||
] | ||
|
||
# def urls_in_orgs(urls, orgs): | ||
# """ | ||
# Find urls that are in any of the `orgs`. | ||
# """ | ||
# return sorted( | ||
# url for url in urls | ||
# if any(f"/{org}/" in url for org in orgs) | ||
# ) | ||
|
||
def urls_in_orgs(urls, org): | ||
""" | ||
Find urls that are in any of the `orgs`. | ||
""" | ||
return sorted( | ||
url for url in urls | ||
if f"/{org}/" in url | ||
) | ||
|
||
|
||
def main(dirs=None, org=None): | ||
""" | ||
Analyze the requirements in all of the directories mentioned on the command line. | ||
If arguments have newlines, treat each line as a separate directory. | ||
""" | ||
if dirs is None: | ||
repo_dir = sys.argv[1] | ||
org_flag_index = sys.argv.index("--org") | ||
org = sys.argv[org_flag_index + 1] | ||
#import pdb; pdb.set_trace() | ||
print(f"Creating new work directory: {WORK_DIR}") | ||
shutil.rmtree(WORK_DIR, ignore_errors=True) | ||
|
||
#dirs = sorted(itertools.chain.from_iterable(d.splitlines() for d in dirs)) | ||
repo_urls = set() | ||
|
||
#for i_dir, repo_dir in enumerate(dirs, start=1): | ||
# print(f"-- repo {repo_dir} ({i_dir}/{len(dirs)}) ----------") | ||
with change_dir(repo_dir): | ||
repo_urls.update(process_directory()) | ||
|
||
write_list(WORK_DIR / "repo_urls.txt", sorted(repo_urls)) | ||
|
||
#firsts = urls_in_orgs(repo_urls, FIRST_PARTY_ORGS) | ||
#write_list(WORK_DIR / "first_party_urls.txt", firsts) | ||
seconds = urls_in_orgs(repo_urls, "edx") | ||
#write_list(WORK_DIR / "second_party_urls.txt", seconds) | ||
|
||
print("== DONE ==============") | ||
print("Second-party:") | ||
print("\n".join(seconds)) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |