-
Notifications
You must be signed in to change notification settings - Fork 247
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
835a6c1
commit 12c752e
Showing
4 changed files
with
365 additions
and
0 deletions.
There are no files selected for viewing
188 changes: 188 additions & 0 deletions
188
.github/scripts/check_project_fos_precision/field_of_science.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
from functools import lru_cache | ||
from typing import Union | ||
import string | ||
|
||
import pandas as pd | ||
|
||
|
||
@lru_cache()
def get_cip_df():
    """Load the SED-CIP spreadsheet and add the split id columns.

    The spreadsheet is read from ``data/SED-CIP-2022.xlsx`` (resolved against
    the current working directory). The DataFrame row at position 2 is used
    as the column header and only the rows after it are kept.

    Three columns are then derived from ``SED-CIP code`` with ``get_id``:
    ``BroadFieldId`` (granularity 0), ``MajorFieldId`` (granularity 1) and
    ``DetailedFieldId`` (granularity 2).

    Returns:
        The prepared DataFrame. Because of ``lru_cache`` every caller gets
        the same object, so callers should treat it as read-only.
    """
    raw_df = pd.read_excel("data/SED-CIP-2022.xlsx")

    # Promote the row at position 2 to the header and keep what follows it.
    header = raw_df.iloc[2]
    # Copy explicitly: the new columns below must land on an independent
    # frame, not on a slice of ``raw_df``.
    cip_df = raw_df.iloc[3:].copy()
    cip_df.columns = header

    id_columns = ("BroadFieldId", "MajorFieldId", "DetailedFieldId")
    for granularity, column in enumerate(id_columns):
        cip_df[column] = cip_df["SED-CIP code"].apply(get_id, args=(granularity,))

    return cip_df
|
||
|
||
def get_matching_rows(cip_df, broad_id, major_id, detailed_id):
    """Return the rows of *cip_df* matching the id at the finest level possible.

    The lookup tries, in order: broad + major + detailed, then broad + major,
    then broad only, and returns the first non-empty selection.

    Args:
        cip_df: DataFrame with ``BroadFieldId``, ``MajorFieldId`` and
            ``DetailedFieldId`` columns.
        broad_id: Value compared against ``BroadFieldId``.
        major_id: Value compared against ``MajorFieldId``.
        detailed_id: Value compared against ``DetailedFieldId``.

    Returns:
        The matching subset of *cip_df*.

    Raises:
        ValueError: If not even the broad id matches any row.
    """
    # Build each mask once; every finer level narrows the previous one.
    broad_mask = cip_df["BroadFieldId"] == broad_id
    major_mask = broad_mask & (cip_df["MajorFieldId"] == major_id)
    detailed_mask = major_mask & (cip_df["DetailedFieldId"] == detailed_id)

    # Finest grain first, falling back to coarser grains.
    for mask in (detailed_mask, major_mask, broad_mask):
        rows = cip_df[mask]
        if not rows.empty:
            return rows

    raise ValueError(f"No matching rows for {broad_id}.{major_id}{detailed_id}")
|
||
|
||
def _pick_dominant_field(cip_df, scope_mask, column, candidates):
    """Return the candidate labelling the most rows of *cip_df* within scope.

    Args:
        cip_df: The full CIP DataFrame.
        scope_mask: Boolean mask selecting the rows that share the id prefix.
        column: Name of the field column to count (e.g. ``New broad field``).
        candidates: Values of *column* to choose between.

    Returns:
        The candidate with the highest row count, or None when no candidate
        matches any row in scope.
    """
    best_option = None
    max_rows = 0

    # Iterate in sorted order so ties resolve the same way on every run
    # (plain set iteration order changes with string hash randomization).
    # Missing values never compare equal to anything, so they are skipped.
    for candidate in sorted((c for c in candidates if pd.notna(c)), key=str):
        row_count = int((scope_mask & (cip_df[column] == candidate)).sum())
        if row_count > max_rows:
            max_rows = row_count
            best_option = candidate

    return best_option


def map_id_to_fields_of_science(id: str):
    """Map a SED-CIP code to ``[broad, major, detailed]`` field names.

    An exact match on ``SED-CIP code`` is returned directly. Otherwise the id
    is split with ``get_id`` and the closest rows are found with
    ``get_matching_rows``. For every level the id actually specifies, the
    field name covering the most rows with that id prefix is chosen; levels
    the id does not specify stay None.

    Args:
        id: The field of science id to map.

    Returns:
        A three-item list ``[broad, major, detailed]``. All three items are
        None when no row matches even the broad part of the id (the id is
        printed in that case).
    """
    cip_df = get_cip_df()

    # If we have a direct match, return it
    direct_match = cip_df[cip_df["SED-CIP code"] == id]
    if len(direct_match) > 0:
        return [
            direct_match[column].values[0]
            for column in ("New broad field", "New major field", "New detailed field")
        ]

    broad_id = get_id(id, 0)
    major_id = get_id(id, 1)
    detailed_id = get_id(id, 2)

    try:
        matching_rows = get_matching_rows(cip_df, broad_id, major_id, detailed_id)
    except ValueError:
        print(id)
        return [None, None, None]

    in_broad = cip_df["BroadFieldId"] == broad_id
    in_major = in_broad & (cip_df["MajorFieldId"] == major_id)
    in_detailed = in_major & (cip_df["DetailedFieldId"] == detailed_id)

    # (log label, log wording, id part, field column, rows sharing the prefix)
    levels = (
        ("Broad Field", "possible values", broad_id, "New broad field", in_broad),
        ("Major Field", "rows", major_id, "New major field", in_major),
        ("Detailed Field", "rows", detailed_id, "New detailed field", in_detailed),
    )

    fields_of_science = []
    for label, wording, level_id, column, scope_mask in levels:
        if level_id is None:
            # The id is not precise enough to say anything at this level.
            fields_of_science.append(None)
            continue

        candidates = set(matching_rows[column])
        best_option = _pick_dominant_field(cip_df, scope_mask, column, candidates)

        print(f"{label}: {broad_id}.{major_id}{detailed_id} has {wording} {candidates} we picked {best_option}")

        fields_of_science.append(best_option)

    return fields_of_science
|
||
|
||
def get_id(id: Union[float, str], granularity: int) -> Union[str, None]:
    """Extract one two-digit part of a field of science id.

    The id may arrive as a string or as a float (which loses leading and
    trailing zeros), so its digits are re-padded before slicing:

    * a single character before the ``.`` gets a leading ``0``
      (``1.23`` -> ``0123``);
    * an odd number of digits gets a trailing ``0``
      (``10.232`` -> ``102320``).

    Args:
        id: The id to split. (The name shadows the builtin but is kept so
            existing callers keep working.)
        granularity: 0 for the broad part (first two digits), 1 for the major
            part (digits three and four), 2 for the detailed part (everything
            after the fourth digit).

    Returns:
        The requested digits as a string, or None when the id is missing,
        too short for the requested granularity, or the granularity is not
        0, 1 or 2.
    """
    # Check if None
    if pd.isna(id):
        return None

    # Fix up issues from reading the id as a float
    text = str(id)
    digits = [char for char in text if char in string.digits]

    # If the first part is preceded with a 0, (01.2023)
    if len(text.split(".")[0]) == 1:
        digits.insert(0, "0")

    # If the number ends with a 0, (10.2320)
    if len(digits) % 2 == 1:
        digits.append("0")

    if granularity == 0:
        return "".join(digits[:2])

    if granularity == 1:
        return "".join(digits[2:4]) if len(digits) >= 4 else None

    if granularity == 2:
        return "".join(digits[4:]) if len(digits) >= 6 else None

    # Unknown granularity: nothing to extract.
    return None
|
||
|
||
def tests():
    """Run the module's self-checks.

    Raises:
        ValueError: If ``get_id`` or ``map_id_to_fields_of_science`` returns
            something other than the expected value for any case below.
    """
    # ((id, granularity), expected result of get_id)
    get_id_cases = [
        ((1.0, 0), "01"),
        ((1.0, 1), "00"),
        ((10.2320, 2), "20"),
        ((10.2320, 1), "23"),
        ((10.2320, 0), "10"),
        # A float cannot carry a leading zero, so "01.23" arrives as 1.23.
        ((1.23, 2), None),
        ((1.23, 0), "01"),
    ]

    for (field_id, granularity), expected in get_id_cases:
        if get_id(field_id, granularity) != expected:
            raise ValueError("Test failed")

    expected_fields = ['Biological and biomedical sciences', 'Neurobiology and neurosciences', None]
    if map_id_to_fields_of_science("26.15") != expected_fields:
        raise ValueError("Test failed")


if __name__ == "__main__":
    tests()
    print("All tests passed")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import sys | ||
import datetime | ||
|
||
import yaml | ||
import requests | ||
|
||
from field_of_science import get_id | ||
|
||
|
||
def get_active_projects(start_date: datetime.datetime, timeout: float = 120):
    """Return the names of projects with payload records since *start_date*.

    Queries the GRACC summary index for records whose ``ResourceType`` is
    ``Payload`` and whose ``EndTime`` lies between *start_date* and now, and
    aggregates them by ``ProjectName``.

    Args:
        start_date: Lower bound of the ``EndTime`` range.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive endpoint cannot hang the caller indefinitely.

    Returns:
        A list with the key of every ``ProjectName`` bucket in the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    # The range filter is expressed in epoch milliseconds.
    now_ms = int(datetime.datetime.now().timestamp() * 1000)
    start_ms = int(start_date.timestamp() * 1000)

    query = {
        "size": 0,
        "query": {
            "bool": {
                "filter": [
                    {"term": {"ResourceType": "Payload"}},
                    {"range": {"EndTime": {"lte": now_ms, "gte": start_ms}}},
                ]
            },
        },
        "aggs": {
            "projects": {
                "terms": {
                    "field": "ProjectName",
                    "size": 99999999,
                },
                "aggs": {
                    "projectJobsRan": {
                        "sum": {"field": "Njobs"},
                    },
                },
            },
        },
    }

    response = requests.get(
        "https://gracc.opensciencegrid.org/q/gracc.osg.summary/_search",
        json=query,
        timeout=timeout,
    )
    # Fail here with a clear HTTP error rather than later with a KeyError on
    # an error payload.
    response.raise_for_status()

    buckets = response.json()["aggregations"]["projects"]["buckets"]

    return [bucket["key"] for bucket in buckets]
|
||
|
||
|
||
def has_detailed_precision(id: str) -> bool:
    """Report whether a field of science id is precise enough.

    Returns True when ``get_id(id, granularity=1)`` yields a value, i.e. the
    id carries at least four digits after padding.
    """
    # NOTE(review): the name says "detailed", but granularity 1 is the level
    # stored as ``MajorFieldId`` in field_of_science.py - confirm that
    # major-level precision is the intended requirement.
    return get_id(id, granularity=1) is not None
|
||
|
||
def main():
    """Fail when a recently active project lacks a precise field of science.

    Fetches the projects active during the last 365 days, loads each
    project's YAML file and collects those whose ``FieldOfScienceID`` is
    missing or not precise enough according to ``has_detailed_precision``.
    Active projects without a YAML file are skipped.

    Raises:
        Exception: If at least one project fails the check; the offending
            projects are listed on stderr first.
    """
    from pathlib import Path

    # Anchor the lookup to this script rather than the working directory, so
    # the check behaves the same wherever it is launched from. The original
    # "../../../projects" is kept relative to the script's own directory.
    projects_dir = (Path(__file__).resolve().parent / "../../../projects").resolve()

    one_year_ago = datetime.datetime.now() - datetime.timedelta(days=365)
    active_project_names = get_active_projects(one_year_ago)

    print(active_project_names)

    exceptions = []
    for project_name in active_project_names:
        try:
            with open(projects_dir / f"{project_name}.yaml") as project_file:
                # NOTE(review): yaml.Loader can construct arbitrary Python
                # objects; consider yaml.safe_load if these files can come
                # from untrusted pull requests.
                project_data = yaml.load(project_file, Loader=yaml.Loader)
        except FileNotFoundError:
            # No YAML file for this project name: nothing to check.
            continue

        if "FieldOfScienceID" not in project_data or not has_detailed_precision(project_data["FieldOfScienceID"]):
            exceptions.append(f"Project {project_name} is running in the OSPool without detailed precision.")

    if exceptions:
        print("\n".join(exceptions), file=sys.stderr)
        raise Exception("Projects without detailed precision need to be updated.")


if __name__ == "__main__":
    main()
68 changes: 68 additions & 0 deletions
68
.github/scripts/check_project_fos_precision/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
asn1==2.7.0 | ||
async-generator==1.10 | ||
attrs==21.4.0 | ||
beautifulsoup4==4.11.1 | ||
blinker==1.6.3 | ||
certifi==2024.2.2 | ||
cffi==1.15.0 | ||
chardet==5.1.0 | ||
click==6.7 | ||
configobj==5.0.8 | ||
cryptography==37.0.2 | ||
Deprecated==1.2.13 | ||
enum-compat==0.0.3 | ||
Flask==1.0.4 | ||
Flask-WTF==0.14.3 | ||
gitdb==4.0.11 | ||
GitPython==3.1.43 | ||
gunicorn==20.1.0 | ||
h11==0.13.0 | ||
icalendar==5.0.12 | ||
idna==3.7 | ||
iniconfig==1.1.1 | ||
itsdangerous==0.24 | ||
Jinja2==2.11.3 | ||
ldap3==2.9.1 | ||
MarkupSafe==2.0.1 | ||
numpy==1.26.4 | ||
outcome==1.1.0 | ||
packaging==21.3 | ||
pandas==2.2.2 | ||
pluggy==1.0.0 | ||
prometheus-client==0.20.0 | ||
py==1.11.0 | ||
pyasn1==0.5.1 | ||
pyasn1-modules==0.2.8 | ||
pycparser==2.21 | ||
PyGithub==1.57 | ||
PyJWT==2.6.0 | ||
PyNaCl==1.5.0 | ||
pyOpenSSL==22.0.0 | ||
pyparsing==3.0.7 | ||
PySocks==1.7.1 | ||
pytest==7.1.1 | ||
pytest-mock==3.7.0 | ||
python-dateutil==2.8.2 | ||
python-gnupg==0.5.2 | ||
python-ldap==3.3.1 | ||
pytz==2024.1 | ||
PyYAML==6.0.1 | ||
requests==2.25.1 | ||
selenium==4.1.3 | ||
six==1.16.0 | ||
smmap==5.0.1 | ||
sniffio==1.2.0 | ||
sortedcontainers==2.4.0 | ||
soupsieve==2.3.2.post1 | ||
tomli==2.0.1 | ||
tqdm==4.64.0 | ||
trio==0.20.0 | ||
trio-websocket==0.9.2 | ||
tzdata==2024.1 | ||
urllib3==1.26.6 | ||
webdriverdownloader==1.1.0.3 | ||
Werkzeug==0.15.6 | ||
wrapt==1.14.1 | ||
wsproto==1.1.0 | ||
WTForms==3.0.1 | ||
xmltodict==0.13.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Runs the field-of-science precision check on pull requests against main
# and once a day at midnight UTC.
name: Check Project FOS Precision
on:
  pull_request:
    branches:
      - main
  schedule:
    - cron: '0 0 * * *'

jobs:
  check:
    name: Check
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9.15'
          cache: 'pip' # caching pip dependencies
      - run: pip install -r ./.github/scripts/check_project_fos_precision/requirements.txt
      # main.py reads project files through a path relative to its own
      # directory, so run it from there rather than from the repository root.
      - run: python main.py
        working-directory: ./.github/scripts/check_project_fos_precision