diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index fd5cd6a..0000000 --- a/Dockerfile +++ /dev/null @@ -1,13 +0,0 @@ -FROM python:3.8.0-slim - -WORKDIR /main - -COPY . /main - -ENV PYTHONPATH="/main/scripts:${PYTHONPATH}" - -RUN pip install --trusted-host pypi.python.org -r requirements.txt - -WORKDIR ./scripts - -CMD ./update.sh python diff --git a/README.md b/README.md index 33f4111..d381bc2 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,8 @@ Each script describes its input and output. If you would like to get updated metric data, for the same list of libraries we have (found in `SharedFiles/LibraryData.json`, please follow the following steps: - You first need to set up some of the configuration parameters in the file `Config.json`: - - Change the value of `TOKEN` to your own GitHub generated token. - - Change the value of `SO_TOKEN` to your stack exchange key. + - Change the value of `TOKEN` to your own GitHub generated token. ([How to create Github TOKEN](https://github.com/ualberta-smr/LibraryMetricScripts/wiki/Creating-access-tokens#github-token)) + - Change the value of `SO_TOKEN` to your stack exchange key. ([How to create StackOverflow TOKEN](https://github.com/ualberta-smr/LibraryMetricScripts/wiki/Creating-access-tokens#stackoverflow-token)) - You also need to set a DB to fill with the results of running the script. You will need to create a MySQL database (we call ours libcomp). In the `librarycomparison/settings.py`, change the username, database name, and password in the `DATABASE` information to that of your created database. Afterwards run `python3 manage.py makemigrations` and then `python3 manage.py migrate`. This will create the database schema for you. See notes below about the database schema. - Run `python3 -m scripts` from within the main repo directory which will call all the metric scripts. This script runs all metrics and fills the MySQL database with all the results from all scripts. 
diff --git a/database/metric-setup.sql b/database/metric-setup.sql new file mode 100644 index 0000000..0463a33 --- /dev/null +++ b/database/metric-setup.sql @@ -0,0 +1,10 @@ +use libcomp; +alter table Metric add unique (name); +insert into Metric(name) value("popularity"); +insert into Metric(name) value("release frequency"); +insert into Metric(name) value("last discussed on so"); +insert into Metric(name) value("last modification date"); +insert into Metric(name) value("breaking changes"); +insert into Metric(name) value("issue response"); +insert into Metric(name) value("issue closing"); +insert into Metric(name) value("issue classification"); diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..b38c6de --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,55 @@ +version: "3.9" +services: + metric-script: + build: + context: . + dockerfile: ./docker/Dockerfile + restart: always + stdin_open: true + tty: true + volumes: + - shared_data:/home/scripts + container_name: "metric-script" + depends_on: + - "db" + web: + image: ualbertasmr/librarycomparisons_web:latest + hostname: web + restart: always + stdin_open: true + tty: true + ports: + - "8000:8000" + volumes: + - shared_data:/home/scripts + networks: + default: + container_name: "librarycomparisons_web" + depends_on: + - "db" + db: + image: ualbertasmr/librarycomparisons_db:latest + hostname: db + restart: always + command: + --default-authentication-plugin=mysql_native_password + ports: + - "3306:3306" + volumes: + - ./database:/docker-entrypoint-initdb.d + - db_data:/var/lib/mysql + networks: + default: + environment: + MYSQL_HOST: localhost + MYSQL_PORT: 3306 + MYSQL_DATABASE: "libcomp" + MYSQL_PASSWORD: "mypwd" + MYSQL_ROOT_PASSWORD: "mypwd" + +volumes: + db_data: + shared_data: + +networks: + default: diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..8c2ce44 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,26 @@ +FROM python:3.8.0-slim + 
+# Install OpenJDK 11 +ENV DEBIAN_FRONTEND=noninteractive +RUN mkdir -p /usr/share/man/man1 /usr/share/man/man2 +RUN apt-get update && apt-get install -y --no-install-recommends openjdk-11-jre +# Prints installed java version, just for checking +RUN java --version + +WORKDIR /main + +COPY . /main + +ENV PYTHONPATH="/main/scripts:${PYTHONPATH}" + +RUN apt-get update; \ + apt-get -y install sudo; \ + sudo apt-get -y install default-libmysqlclient-dev \ + gcc \ + default-mysql-client \ + default-mysql-server \ + git \ + libpangocairo-1.0-0; \ + pip install --trusted-host pypi.python.org -r requirements.txt; + +ENTRYPOINT [ "bash", "./docker/start.sh" ] diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000..8fc60f2 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,102 @@ +Temp notes for updating ReadMe: + +- had to run the down with -v to delete data +- followed exact instructions... but noticed that you have to run createmetrics from inside the script in docker-compose step (seems in parallel with leaving the other things running). Need to clarify that in instructions +- Had a problem with the SO key because my key was on old version of API. Had to do it for API V2.0 +- Next step is to check: if I exit metrics container, is data still saved in DB? Can I import the DB dump in DB and have it reflect in website? Can I trigger update in the container from localhost? Right now, the release data is taking time to be calculated. 
+- Remove note about MAXSIZE since it's not used anymore +- Add note that for testing, reduce num of libs in the lib file + +# How to calculate metrics & run visualization website using docker + +This is a simple way to run the metrics and also get a local website set up to view the metrics (similar to [https://smr.cs.ualberta.ca/comparelibraries/](https://smr.cs.ualberta.ca/comparelibraries/)) + +You'll need to have [docker](https://docs.docker.com/get-docker/) and [docker-compose](https://docs.docker.com/compose/install/) installed. + +- After cloning this repo, you first need to set up some of the configuration parameters in the file `scripts/Config.json`: + - Change the value of `TOKEN` to your own GitHub generated token. ([How to create Github TOKEN](https://github.com/ualberta-smr/LibraryMetricScripts/wiki/Creating-access-tokens#github-token)). + - Change the value of `SO_TOKEN` to your stack exchange key. ([How to create StackOverflow TOKEN](https://github.com/ualberta-smr/LibraryMetricScripts/wiki/Creating-access-tokens#stackoverflow-token)). Please make sure to create a token for v2.0 of the API. + - Change `"OUTPUT_PATH"` to `"../home/scripts/"`. + +- You can update the `MAXSIZE` to 100 in `Config.json` for testing purposes. + +## Creating the image +### 1. Builds/Rebuilds the image (does not start the containers) in the docker-compose.yml file: + +``` +docker-compose build --no-cache +``` + +### 2. Starts the containers + +**Starts the containers && Starts the website** +``` +docker-compose up +``` +To access the website, use http://127.0.0.1:8000/comparelibraries/ + +**Run metric script:** +The above step will have the website running, but right now, there is no data in the DB yet to be displayed. 
To calculate the metrics, run: + +``` +docker-compose run metric-script +``` + +This will open an interactive shell into the container and you can then invoke `createmetrics` to calculate the Metrics: + +``` +root@e7c767ab1a70:/main# createmetrics +``` + +**(Optional) Open librarycomparisons website command shell:** +``` +docker-compose run --service-ports web +``` +- `start`: Starts the Django server. The librarycomparison web will run in the `8000` port by default. +- `migrate`: Runs Django migrations +- `make`: Runs Django makemigrations +- `createsuperuser`: Runs Django createsuperuser + +To access the website, use http://127.0.0.1:8000/comparelibraries/ + +### 3. Stops containers and removes containers, networks, volumes, and images created by up + +``` +docker-compose down +``` +Remove volume: `docker-compose down -v`. Warning: this will permanently delete the contents in the db_data volume, wiping out any previous database you had there + +### 4. Setup Metric Table if you create the docker volumn for the first time +``` +docker exec librarymetricscripts_db_1 /bin/sh -c 'mysql -uroot -p"mypwd" libcomp < docker-entrypoint-initdb.d/metric-setup.sql' +``` + +## Accessing docker container mysql databases +1. docker exec -it MyContainer mysql -uroot -pMyPassword +eg: `docker exec -it librarymetricscripts_db_1 mysql -uroot -p"mypwd"` +2. Show MySQL Databases: `show databases;` +``` +mysql> show databases; ++--------------------+ +| Database | ++--------------------+ +| information_schema | +| libcomp | +| mysql | +| performance_schema | +| sys | ++--------------------+ +``` +3. Show MySQL Tables: +``` +use libcomp; +show tables; +``` +4. Show Table's schema +``` +describe libcomp.Metric; +``` +5. 
Show the values of Metric table +``` +select * from libcomp.Metric; +``` diff --git a/docker/start.sh b/docker/start.sh new file mode 100644 index 0000000..9ded93e --- /dev/null +++ b/docker/start.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +echo "alias createmetrics='python -m scripts'" >> ~/.bashrc +echo "alias updatemetrics='./updatemetrics.sh'" >> ~/.bashrc + +/bin/bash diff --git a/librarycomparison/settings.py b/librarycomparison/settings.py index fd11b7a..564dbf9 100644 --- a/librarycomparison/settings.py +++ b/librarycomparison/settings.py @@ -85,7 +85,9 @@ 'ENGINE': 'django.db.backends.mysql', 'NAME': 'libcomp', 'USER': 'root', - 'PASSWORD': '' + 'HOST': 'db', + 'PORT' : 3306, + 'PASSWORD': 'mypwd' } } diff --git a/requirements.txt b/requirements.txt index baf0754..e8519c2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ cairosvg gitpython mysqlclient>=1.4.6 djangorestframework>=3.11 +beautifulsoup4>=4.9.3 diff --git a/scripts/Config.json b/scripts/Config.json index 7be2b9e..c948dd8 100644 --- a/scripts/Config.json +++ b/scripts/Config.json @@ -8,5 +8,6 @@ "MAXSIZE": "1000", "POPULARITY_OUTPUT_FILE":"scripts/Popularity/popularity_results.txt", "TIME_SPAN":"365", - "SO_TOKEN":"enter your SO token" + "SO_TOKEN":"enter your SO token", + "OUTPUT_PATH": "../home/scripts/" } diff --git a/scripts/IssueMetrics/performanceclassifier.py b/scripts/IssueMetrics/performanceclassifier.py index 916b4da..6db1644 100644 --- a/scripts/IssueMetrics/performanceclassifier.py +++ b/scripts/IssueMetrics/performanceclassifier.py @@ -9,6 +9,7 @@ from github import Github, Repository import string +nltk.download('punkt') stemmer = PorterStemmer() def stem_words(tokens): diff --git a/scripts/Popularity/GHDepPopularity.py b/scripts/Popularity/GHDepPopularity.py new file mode 100644 index 0000000..2d2d292 --- /dev/null +++ b/scripts/Popularity/GHDepPopularity.py @@ -0,0 +1,67 @@ +import requests +from bs4 import BeautifulSoup +import re +import json +from 
scripts.CommonUtilities import Common_Utilities +from scripts.SharedFiles.utility_tool import read_json_file + +"""Gets number of dependent repos as calculated by github dependency graph https://docs.github.com/en/code-security/supply-chain-security/understanding-your-software-supply-chain/about-the-dependency-graph#:~:text=The%20dependency%20graph%20is%20a,packages%20that%20depend%20on%20it + + Parameters + ---------- + repo : str + Github repo represented as user/repo + + Returns + ------- + num_dependents + number of dependents +""" +def get_num_dependents(repo): + #inspired from Bertrand Martel's answer on https://stackoverflow.com/questions/58734176/how-to-use-github-api-to-get-a-repositorys-dependents-information-in-github + url = 'https://github.com/{}/network/dependents'.format(repo) + dependent_href = '/{}/network/dependents?dependent_type=REPOSITORY'.format(repo) + r = requests.get(url) + soup = BeautifulSoup(r.content, "html.parser") + + if len(soup.body.findAll("We haven’t found any dependents for this repository yet.")) != 0: + return 0 + + dependents = soup.find('a', href= dependent_href) #returns, for example, "1,234,000 Repositories" + #regex from https://www.regexpal.com/98336 + num_dependents = re.search(r'(\d{0,3},)?(\d{3},)?\d{0,3}', dependents.text.strip()).group(0) + print(num_dependents) + return num_dependents + +def read_libraries(file_path): + libdict = {} + f = read_json_file(file_path) + for line in f: + libdict[line['Package']]=line['FullRepoName'] + + return libdict + +def send_totals_to_file(output_file, keyword, num_found): + output_file = open(output_file, "a") + output_file.write(keyword + ":" + str(num_found) + "\n") + output_file.close() + +def get_popularity(): + print("Getting popularity") + config_dict = Common_Utilities.read_config_file() # read all config data + + library_dict = read_libraries(config_dict["LIBRARY_LIST"]) # read all libraries to search against + + output_file_name = config_dict["POPULARITY_OUTPUT_FILE"] # 
this is the output file that we are going to send libraries with their total counts to + + output_file = open(output_file_name, "w") + output_file.close() + + for keyword,repo in library_dict.items(): + print("for lib", repo) + num_dependents = get_num_dependents(repo) + send_totals_to_file(output_file_name, repo, num_dependents) + + +if __name__ == "__main__": + get_popularity() \ No newline at end of file diff --git a/scripts/Popularity/GitHub_Phase2.py b/scripts/Popularity/GitHub_Phase2.py index f69090e..f2ef4d1 100644 --- a/scripts/Popularity/GitHub_Phase2.py +++ b/scripts/Popularity/GitHub_Phase2.py @@ -1,4 +1,5 @@ ''' +This script is no longer used due to the limits of the API. Instead, we are going to depend on the github dependency graph This script searches the 1500 repositories in Top_Repo.txt for import statements of all the library packages in library.txt Requires: A configuration file called GitHubSearch.ini Ensure a GitHub token generated by your account is in the configuration account so that the script may connect to github diff --git a/scripts/SharedFiles/LibraryData.json b/scripts/SharedFiles/LibraryData.json index efda770..58d9c25 100644 --- a/scripts/SharedFiles/LibraryData.json +++ b/scripts/SharedFiles/LibraryData.json @@ -62,10 +62,10 @@ { "LibraryName": "tinylog", "Domain": "Logging", - "FullRepoName": "pmwmedia/tinylog", + "FullRepoName": "tinylog-org/tinylog", "SOtags": "tinylog", "Package": "org.tinylog", - "GitHubURL": "git://github.com/pmwmedia/tinylog.git", + "GitHubURL": "git://github.com/tinylog-org/tinylog.git", "JIRAURL": "", "MavenURL":"https://mvnrepository.com/artifact/org.tinylog/tinylog" }, @@ -172,10 +172,10 @@ { "LibraryName": "conceal", "Domain": "Cryptography", - "FullRepoName": "facebook/conceal", + "FullRepoName": "facebookarchive/conceal", "SOtags": "facebook-conceal", "Package": "com.facebook.crypto", - "GitHubURL": "git://github.com/facebook/conceal.git", + "GitHubURL": "git://github.com/facebookarchive/conceal.git", 
"JIRAURL": "", "MavenURL":"https://mvnrepository.com/artifact/com.facebook.conceal/conceal" }, @@ -432,10 +432,10 @@ { "LibraryName": "jcommon", "Domain": "Collections", - "FullRepoName": "facebook/jcommon", + "FullRepoName": "facebookarchive/jcommon", "SOtags": "facebook-jcommon", "Package": "com.facebook.util", - "GitHubURL": "git://github.com/facebook/jcommon.git", + "GitHubURL": "git://github.com/facebookarchive/jcommon.git", "JIRAURL": "", "MavenURL":"https://mvnrepository.com/artifact/com.facebook.jcommon/util" }, diff --git a/scripts/app.py b/scripts/app.py index b52eeac..2c75d09 100644 --- a/scripts/app.py +++ b/scripts/app.py @@ -7,8 +7,8 @@ import django django.setup() from scripts.addlibraries import addlibraries -from scripts.Popularity.GitHub_Phase1 import get_top_repos -from scripts.Popularity.GitHub_Phase2 import search_top_repos +from scripts.CommonUtilities import Common_Utilities +from scripts.Popularity.GHDepPopularity import get_popularity from scripts.ReleaseFrequency.releasefrequency import get_release_freq from scripts.License.license import getLicenses from scripts.LastModificationDate.lastmodificationdate import getLastModificationDates @@ -22,38 +22,40 @@ import subprocess def run(): - addlibraries() - get_top_repos() - search_top_repos() - get_release_freq() - getLicenses() - getLastModificationDates() - getLastDiscussedDates() - get_issues() - get_breaking_changes() #must be called after releases are fetched so after release frequency metric - - shutil.copy2('scripts/Popularity/popularity_results.txt', 'scripts/') - - for file in glob.glob(r'scripts/License/*.pkl'): - shutil.copy2(file, 'scripts/') - - for file in glob.glob(r'scripts/LastModificationDate/*.pkl'): - shutil.copy2(file, 'scripts/') - - for file in glob.glob(r'scripts/LastDiscussedOnStackOverflow/*.pkl'): - shutil.copy2(file, 'scripts/') - - filldb() - - try: - os.mkdir("scripts/charts") - except: - print("Charts directory already exists") - - for file in 
glob.glob(r'scripts/*_chart.pkl'): - shutil.copy2(file, 'scripts/charts/') - - for file in glob.glob(r'scripts/*.pkl'): - os.remove(file) - - os.remove("scripts/popularity_results.txt") \ No newline at end of file + addlibraries() + get_popularity() + get_release_freq() + getLicenses() + getLastModificationDates() + getLastDiscussedDates() + get_issues() + get_breaking_changes() #must be called after releases are fetched so after release frequency metric + + config_dict = Common_Utilities.read_config_file() + output_path = config_dict["OUTPUT_PATH"] + + shutil.copy2('scripts/Popularity/popularity_results.txt', output_path) + + for file in glob.glob(r'scripts/License/*.pkl'): + shutil.copy2(file, output_path) + + for file in glob.glob(r'scripts/LastModificationDate/*.pkl'): + shutil.copy2(file, output_path) + + for file in glob.glob(r'scripts/LastDiscussedOnStackOverflow/*.pkl'): + shutil.copy2(file, output_path) + + filldb() + + try: + os.mkdir(output_path + "charts") + except: + print("Charts directory already exists") + + for file in glob.glob(output_path + r'*_chart.pkl'): + shutil.copy2(file, output_path + "charts") + + for file in glob.glob(output_path + r'*.pkl'): + os.remove(file) + + os.remove(output_path + "popularity_results.txt") diff --git a/scripts/filldb.py b/scripts/filldb.py index cb6aed0..2465526 100644 --- a/scripts/filldb.py +++ b/scripts/filldb.py @@ -11,8 +11,13 @@ import traceback import pytz +global output_path +config_dict = Common_Utilities.read_config_file() +output_path = config_dict["OUTPUT_PATH"] + def saveData(data, filename): - with open("scripts/" + filename, 'wb') as output: + global output_path + with open(output_path + filename, 'wb') as output: pickle.dump(data, output, pickle.DEFAULT_PROTOCOL) def get_latest_metrics_entry(library): @@ -413,7 +418,8 @@ def create_issue_classification_chart(domain): save_chart_in_db(line_chart,domain, metric_name="issue classification") def fillPopularityData(): - with 
open("scripts/popularity_results.txt") as f: + global output_path + with open(output_path + "popularity_results.txt") as f: lines = f.readlines() lines = [x.strip() for x in lines] for line in lines: @@ -428,7 +434,7 @@ def fillPopularityData(): if metricsentry == None or metricsentry.created_on.date() != datetime.today().date(): metricsentry = MetricsEntry() metricsentry.library = library - metricsentry.popularity = int(popularity) + metricsentry.popularity = int(popularity.replace(',', '')) metricsentry.save() else: print("DID NOT CREATE new entry for:", library.name) @@ -637,7 +643,7 @@ def filldb(): fillPopularityData() print("Calculating release frequency...") calculateReleaseFrequency() - print("Calculating breaking changes...") + print("Filling breaking changes...") fillBreakingChanges() print("Filling last modification data...") fillLastModificationDateData() @@ -645,7 +651,7 @@ def filldb(): fillLastDiscussedSOData() print("Filling license data...") fillLicenseData() - print("Calculating issue data...") + print("Filling issue data...") fillIssueData() print("Calculating overall score...") fillOverallScore() diff --git a/updatemetrics.sh b/updatemetrics.sh index db7e2e0..9de119e 100755 --- a/updatemetrics.sh +++ b/updatemetrics.sh @@ -20,6 +20,11 @@ mysqldump --no-tablespaces --no-create-info --complete-insert --skip-triggers li #pull latest repo changes git pull +#pull latest repo changes but stash first to maintain config changes +git stash +git pull +git stash pop + #invoke script mkdir -p logs