From bf828c8faf6802faa00cc9bfc23e1a23dca12494 Mon Sep 17 00:00:00 2001
From: codinuum
Date: Wed, 26 May 2021 09:40:35 +0900
Subject: [PATCH 01/11] minor

---
 cca/factutils/python/factutils/entity.py | 1 -
 cca/factutils/python/factutils/fact.py   | 1 -
 cca/factutils/python/factutils/fileid.py | 1 -
 cca/factutils/python/factutils/range.py  | 1 -
 cca/factutils/python/factutils/rdf.py    | 1 -
 5 files changed, 5 deletions(-)

diff --git a/cca/factutils/python/factutils/entity.py b/cca/factutils/python/factutils/entity.py
index 33897d9..ea3536c 100644
--- a/cca/factutils/python/factutils/entity.py
+++ b/cca/factutils/python/factutils/entity.py
@@ -28,7 +28,6 @@
 from . import fileid
 from . import range
 
-#import pathsetup
 
 
 class External(Resource):
diff --git a/cca/factutils/python/factutils/fact.py b/cca/factutils/python/factutils/fact.py
index fc68742..5cd1120 100644
--- a/cca/factutils/python/factutils/fact.py
+++ b/cca/factutils/python/factutils/fact.py
@@ -29,7 +29,6 @@
 from .const import SPEC_NS, PREDICATE_NS, RELEASE_NS, SVNREV_NS, GITREV_NS, GUARD_NS
 from .rdf import Resource, Literal, Predicate, RDFNode
 
-#import pathsetup
 
 
 logger = logging.getLogger()
diff --git a/cca/factutils/python/factutils/fileid.py b/cca/factutils/python/factutils/fileid.py
index 8628053..12840da 100644
--- a/cca/factutils/python/factutils/fileid.py
+++ b/cca/factutils/python/factutils/fileid.py
@@ -26,7 +26,6 @@
 from .exn import Invalid_argument
 from .const import SUB_SEP, SUB_SUB_SEP
 
-#import pathsetup
 
 
 logger = logging.getLogger()
diff --git a/cca/factutils/python/factutils/range.py b/cca/factutils/python/factutils/range.py
index ea2f88a..9b663a7 100644
--- a/cca/factutils/python/factutils/range.py
+++ b/cca/factutils/python/factutils/range.py
@@ -23,7 +23,6 @@
 
 from .const import SUB_SEP
 
-#import pathsetup
 
 
 logger = logging.getLogger()
diff --git a/cca/factutils/python/factutils/rdf.py b/cca/factutils/python/factutils/rdf.py
index fe323a1..428c1f1 100644
--- a/cca/factutils/python/factutils/rdf.py
+++ b/cca/factutils/python/factutils/rdf.py
@@ -21,7 +21,6 @@
 import RDF
 import logging
 
-#import pathsetup
 
 
 logger = logging.getLogger()

From 3ed2e0cb934e8170246a71fa23c705ead71f5661 Mon Sep 17 00:00:00 2001
From: codinuum
Date: Sun, 30 May 2021 16:41:11 +0900
Subject: [PATCH 02/11] refactoring python scripts

---
 Dockerfile                                    |   41 +-
 cca.py                                        |   96 +-
 cca/factutils/python/factutils/__init__.py    |    3 -
 cca/factutils/python/factutils/fact.py        |  266 --
 cca/factutils/python/factutils/pathsetup.py   |    7 -
 cca/factutils/python/factutils/rdf.py         |  166 --
 cca/scripts/cca_diff_versions_for_fact.py     |   25 -
 cca/scripts/diff_dirs.py                      |    5 +
 cca/scripts/java_token_diff.py                |  346 +--
 cca/scripts/pathsetup.py                      |   25 -
 cca/scripts/sim.py                            |   78 +-
 cca/scripts/sparql.py                         |  291 +-
 python/README.md                              |    6 +
 python/pyproject.toml                         |    3 +
 python/setup.cfg                              |   19 +
 .../scripts => python/src/cca/ccautil}/AST.py |    7 +-
 .../src/cca/ccautil}/Git2.py                  |    6 +-
 .../scripts => python/src/cca/ccautil}/SVN.py |   24 +-
 python/src/cca/ccautil/__init__.py            |    1 +
 .../src/cca/ccautil}/cca_command.py           |   24 +-
 .../src/cca/ccautil}/cca_config.py            |   11 +-
 .../src/cca/ccautil}/cca_factextractor.py     |   14 +-
 .../src/cca/ccautil}/cca_options.py           |    5 +-
 .../src/cca/ccautil}/common.py                |    0
 .../src/cca/ccautil}/core_count.py            |    3 -
 .../src/cca/ccautil}/diffinfo.py              |    9 +-
 .../src/cca/ccautil}/diffts.py                |   86 +-
 .../src/cca/ccautil}/factextractor.py         |   23 +-
 .../src/cca/ccautil}/factloader.py            |   17 +-
 .../src/cca/ccautil/find_change_patterns.py   | 2536 +++++++++++++++++
 python/src/cca/ccautil/find_refactoring.py    |  367 +++
 .../src/cca/ccautil}/fragment.py              |   10 +-
 python/src/cca/ccautil/java_token_diff.py     |  351 +++
 .../src/cca/ccautil}/load_into_virtuoso.py    |   33 +-
 .../cca/ccautil}/load_ont_into_virtuoso.py    |    9 +-
 .../src/cca/ccautil}/materialize_fact.py      |   13 +-
 .../materialize_fact_for_refactoring.py       |  155 +
 {cca/scripts => python/src/cca/ccautil}/ns.py |    0
 python/src/cca/ccautil/patchast.py            |   64 +
 python/src/cca/ccautil/plain_patch.py         |  396 +++
 .../src/cca/ccautil}/proc.py                  |    3 +-
 .../src/cca/ccautil}/project.py               |    4 +-
 .../src/cca/ccautil}/run_workers.py           |    3 +-
 python/src/cca/ccautil/sim.py                 |   82 +
 .../src/cca/ccautil}/siteconf.py              |   26 +-
 python/src/cca/ccautil/sparql.py              |  290 ++
 .../src/cca/ccautil}/srcdiff.py               |  103 +-
 {cca/scripts => python/src/cca/ccautil}/tp.py |   16 +-
 .../src/cca/ccautil}/verdiff.py               |   37 +-
 .../src/cca/ccautil}/virtuoso.py              |   38 +-
 python/src/cca/factutil/__init__.py           |    2 +
 .../src/cca/factutil}/common.py               |    0
 .../src/cca/factutil}/const.py                |    4 +-
 .../src/cca/factutil}/entity.py               |   15 +-
 .../src/cca/factutil}/exn.py                  |    4 +-
 python/src/cca/factutil/fact.py               |   89 +
 .../src/cca/factutil}/fileid.py               |    4 +-
 .../src/cca/factutil}/range.py                |    4 +-
 python/src/cca/factutil/rdf.py                |  363 +++
 59 files changed, 5068 insertions(+), 1560 deletions(-)
 delete mode 100644 cca/factutils/python/factutils/__init__.py
 delete mode 100644 cca/factutils/python/factutils/fact.py
 delete mode 100644 cca/factutils/python/factutils/pathsetup.py
 delete mode 100644 cca/factutils/python/factutils/rdf.py
 delete mode 100755 cca/scripts/cca_diff_versions_for_fact.py
 create mode 100755 cca/scripts/diff_dirs.py
 delete mode 100644 cca/scripts/pathsetup.py
 create mode 100644 python/README.md
 create mode 100644 python/pyproject.toml
 create mode 100644 python/setup.cfg
 rename {cca/scripts => python/src/cca/ccautil}/AST.py (97%)
 rename {cca/scripts => python/src/cca/ccautil}/Git2.py (99%)
 rename {cca/scripts => python/src/cca/ccautil}/SVN.py (97%)
 create mode 100644 python/src/cca/ccautil/__init__.py
 rename {cca/scripts => python/src/cca/ccautil}/cca_command.py (95%)
 mode change 100755 => 100644
 rename {cca/scripts => python/src/cca/ccautil}/cca_config.py (98%)
 rename {cca/scripts => python/src/cca/ccautil}/cca_factextractor.py (93%)
 rename {cca/scripts => python/src/cca/ccautil}/cca_options.py (98%)
 rename {cca/scripts => python/src/cca/ccautil}/common.py (100%)
 rename {cca/scripts => python/src/cca/ccautil}/core_count.py (94%)
 mode change 100755 => 100644
 rename {cca/scripts => python/src/cca/ccautil}/diffinfo.py (99%)
 rename {cca/scripts => python/src/cca/ccautil}/diffts.py (88%)
 mode change 100755 => 100644
 rename {cca/scripts => python/src/cca/ccautil}/factextractor.py (81%)
 rename {cca/scripts => python/src/cca/ccautil}/factloader.py (97%)
 mode change 100755 => 100644
 create mode 100644 python/src/cca/ccautil/find_change_patterns.py
 create mode 100644 python/src/cca/ccautil/find_refactoring.py
 rename {cca/scripts => python/src/cca/ccautil}/fragment.py (98%)
 create mode 100644 python/src/cca/ccautil/java_token_diff.py
 rename {cca/scripts => python/src/cca/ccautil}/load_into_virtuoso.py (88%)
 mode change 100755 => 100644
 rename {cca/scripts => python/src/cca/ccautil}/load_ont_into_virtuoso.py (91%)
 rename {cca/scripts => python/src/cca/ccautil}/materialize_fact.py (96%)
 create mode 100644 python/src/cca/ccautil/materialize_fact_for_refactoring.py
 rename {cca/scripts => python/src/cca/ccautil}/ns.py (100%)
 create mode 100644 python/src/cca/ccautil/patchast.py
 create mode 100644 python/src/cca/ccautil/plain_patch.py
 rename {cca/scripts => python/src/cca/ccautil}/proc.py (98%)
 rename {cca/scripts => python/src/cca/ccautil}/project.py (95%)
 rename {cca/scripts => python/src/cca/ccautil}/run_workers.py (99%)
 create mode 100644 python/src/cca/ccautil/sim.py
 rename {cca/scripts => python/src/cca/ccautil}/siteconf.py (67%)
 create mode 100644 python/src/cca/ccautil/sparql.py
 rename {cca/scripts => python/src/cca/ccautil}/srcdiff.py (93%)
 mode change 100755 => 100644
 rename {cca/scripts => python/src/cca/ccautil}/tp.py (98%)
 rename {cca/scripts => python/src/cca/ccautil}/verdiff.py (96%)
 rename {cca/scripts => python/src/cca/ccautil}/virtuoso.py (92%)
 create mode 100644 python/src/cca/factutil/__init__.py
 rename {cca/factutils/python/factutils => python/src/cca/factutil}/common.py (100%)
 rename {cca/factutils/python/factutils => python/src/cca/factutil}/const.py (89%)
 rename {cca/factutils/python/factutils => python/src/cca/factutil}/entity.py (94%)
 rename {cca/factutils/python/factutils => python/src/cca/factutil}/exn.py (85%)
 create mode 100644 python/src/cca/factutil/fact.py
 rename {cca/factutils/python/factutils => python/src/cca/factutil}/fileid.py (98%)
 rename {cca/factutils/python/factutils => python/src/cca/factutil}/range.py (99%)
 create mode 100644 python/src/cca/factutil/rdf.py

diff --git a/Dockerfile b/Dockerfile
index 3b29f2d..69dba6a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,10 +3,11 @@ FROM ubuntu:20.04
 MAINTAINER codinuum
 
 RUN set -x && \
-    useradd -r -s /bin/nologin cca && \
     mkdir -p /opt/cca/modules && \
     mkdir -p /var/lib/cca && \
-    mkdir /root/src
+    useradd -r -d /opt/cca -s /bin/nologin cca && \
+    chown -R cca:cca /opt/cca && \
+    chown -R cca:cca /var/lib/cca
 
 COPY LICENSE /opt/cca/
 COPY cca /opt/cca/
@@ -27,36 +28,39 @@
     unixodbc \
     openjdk-8-jdk \
     python3 python3-dev \
-    python3-distutils \
-    python3-psutil \
     python3-pygit2 \
-    python3-distutils \
+    python3-svn \
+    python3-daemon \
+    python3-venv \
     wget ca-certificates \
     git rsync && \
     wget https://bootstrap.pypa.io/get-pip.py && \
     python3 get-pip.py && \
-    pip3 install pyodbc simplejson && \
+    pip3 install pyodbc setuptools build javalang && \
     rm get-pip.py
 
-RUN set -x && \
-    cd /root && \
-    git clone https://github.com/dajobe/redland-bindings && \
-    cd redland-bindings && \
-    ./autogen.sh --with-python=python3 && \
-    make install && \
-    cd /root && \
-    rm -r redland-bindings
-
 RUN set -x && \
     cd /root && \
     git clone https://github.com/openlink/virtuoso-opensource && \
     cd virtuoso-opensource && \
     ./autogen.sh && \
-    env CFLAGS='-O2 -m64' ./configure --prefix=/opt/virtuoso --with-layout=opt --with-readline=/usr --program-transform-name="s/isql/isql-v/" --disable-dbpedia-vad --disable-demo-vad --enable-fct-vad --enable-ods-vad --disable-sparqldemo-vad --disable-tutorial-vad --enable-isparql-vad --enable-rdfmappers-vad && \
+    env CFLAGS='-O2 -m64' ./configure --prefix=/opt/virtuoso --with-layout=opt --with-readline=/usr \
+    --program-transform-name="s/isql/isql-v/" --disable-dbpedia-vad --disable-demo-vad \
+    --enable-fct-vad --enable-ods-vad --disable-sparqldemo-vad --disable-tutorial-vad \
+    --enable-isparql-vad --enable-rdfmappers-vad && \
     make && make install && \
     cd /root && \
     rm -r virtuoso-opensource
 
+COPY python /root/python
+
+RUN set -x && \
+    cd /root/python && \
+    python3 -m build && \
+    pip3 install dist/cca-*.tar.gz && \
+    cd /root && \
+    rm -r python
+
 COPY src /root/src/
 
 RUN set -x && \
@@ -74,7 +78,10 @@
     cp modules/Mfortran*.cmxs /opt/cca/modules/ && \
    cp modules/Mcpp*.cmxs /opt/cca/modules/ && \
     cd /root && \
-    rm -r src
+    rm -r src && \
+    echo 'test -r /root/.opam/opam-init/init.sh && . /root/.opam/opam-init/init.sh > /dev/null 2> /dev/null || true' >> .bashrc && \
+    echo 'export PATH=/opt/cca/bin:${PATH}' >> .bashrc
+
 
 RUN set -x && \
     apt-get autoremove -y && \
diff --git a/cca.py b/cca.py
index f2d6cbf..370c334 100755
--- a/cca.py
+++ b/cca.py
@@ -3,7 +3,7 @@
 '''
   A driver script for CCA container image
 
-  Copyright 2020 Codinuum Software Lab
+  Copyright 2021 Codinuum Software Lab
 
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
@@ -21,12 +21,14 @@
 import os
 import sys
 import time
+import shutil
 from datetime import datetime, timedelta
-from subprocess import Popen, call
+from subprocess import Popen, run
 from threading import Thread
 from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
 
 IMAGE_NAME = 'codinuum/cca'
+#IMAGE_NAME = 'ccax'
 
 #
 
@@ -37,9 +39,13 @@
 CCA_SOURCE_DIR = CCA_VAR+'/source'
 CCA_CACHE_DIR = CCA_VAR+'/cache'
 
+CCA_WORK_DIR_NAME = '__CCA__'
+
 CONTAINER_CMD = 'docker'
 
+TIMEOUT = 5
 BUFSIZE = 0 # unbuffered
+STAT_NAME = 'status'
 
 DEFAULT_CACHE_DIR = os.path.join(os.environ['HOME'], '.cca', 'cache')
 
@@ -77,10 +83,35 @@
         tzname = time.tzname[0]
         offset = STDOFFSET
 
-    TZ = '%s%s%s' % (tzname, SIGN, offset)
+    TZ = '{}{}{}'.format(tzname, SIGN, offset)
 
 ###
 
+def progress(proc, stat_path, timeout=TIMEOUT):
+    stat_mtime = None
+
+    print('\nMonitoring thread started.')
+
+    while True:
+        try:
+            st = os.stat(stat_path)
+            if st.st_mtime != stat_mtime and st.st_size > 0:
+                with open(stat_path, 'r') as f:
+                    mes = f.read()
+                    print('[{}]'.format(mes))
+
+                stat_mtime = st.st_mtime
+
+        except OSError as e:
+            pass
+
+        if proc.poll() is not None:
+            break
+
+    proc.wait()
+    if proc.returncode > 0:
+        print('Execution failed: {}'.format(proc.returncode))
+
 def ensure_dir(dpath):
     if not os.path.isdir(dpath):
         try:
@@ -95,8 +126,8 @@
         image = image_name+suffix
     return image
 
-def run_diffast(original, modified, cache=DEFAULT_CACHE_DIR, clear_cache=False, view=False,
-                dry_run=False, devel=False, image=IMAGE_NAME, verbose=False):
+def run_diffast(container_cmd, original, modified, cache=DEFAULT_CACHE_DIR, clear_cache=False, view=False,
+                dry_run=False, devel=False, image=IMAGE_NAME, verbose=False, debug=False):
 
     if dry_run:
         verbose = True
@@ -108,7 +139,7 @@
     if not dry_run:
         ensure_dir(cache)
 
-    cca_cmd_path = '%s/bin/%s.opt' % (CCA_HOME, 'diffast')
+    cca_cmd_path = '{}/bin/{}.opt'.format(CCA_HOME, 'diffast')
     cca_cmd = cca_cmd_path
     if clear_cache:
         cca_cmd += ' -clearcache'
@@ -128,22 +159,22 @@
     vol_opt = '-v "{}:{}"'.format(common_path, CCA_SOURCE_DIR)
     vol_opt += ' -v "{}:{}"'.format(cache, CCA_CACHE_DIR)
 
-    run_cmd = '%s run' % CONTAINER_CMD
+    run_cmd = '{} run'.format(container_cmd)
     run_cmd += ' --rm'
     run_cmd += ' -t'
 
     if TZ:
-        run_cmd += ' -e "TZ=%s"' % TZ
+        run_cmd += ' -e "TZ={}"'.format(TZ)
 
-    run_cmd += ' %s' % vol_opt
-    run_cmd += ' %s %s' % (get_image_name(image, devel=devel), cca_cmd)
+    run_cmd += ' {}'.format(vol_opt)
+    run_cmd += ' {} {}'.format(get_image_name(image, devel=devel), cca_cmd)
 
     if verbose:
         print(run_cmd)
 
     if not dry_run:
         try:
-            rc = call(run_cmd, bufsize=BUFSIZE, shell=True, universal_newlines=True)
+            rc = run(run_cmd, bufsize=BUFSIZE, shell=True, universal_newlines=True)
 
             if view:
                 app_path = os.path.join(os.path.dirname(sys.argv[0]),
@@ -156,7 +187,7 @@
                     view_cmd = 'open -n {} --args{}{}'.format(app_path, cache_opt, files_opt)
                     if verbose:
                         print(view_cmd)
-                    rc = call(view_cmd, shell=True)
+                    rc = run(view_cmd, shell=True)
 
                 else:
                     print('DiffViewer not found. See diffviewer/README.md.')
@@ -167,48 +198,70 @@
             print('Execution failed: {}'.format(e))
 
 
+def gen_work_dir_name():
+    dt = datetime.now()
+    ts = '{:04}{:02}{:02}{:02}{:02}{:02}'.format(dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
+    dn = '{}{}'.format(CCA_WORK_DIR_NAME, ts)
+    return dn
+
 def update(args):
-    cmd = '%s pull %s' % (CONTAINER_CMD, get_image_name(args.image, devel=args.devel))
+    cmd = '{} pull {}'.format(args.container_cmd, get_image_name(args.image, devel=args.devel))
 
     if args.verbose or args.dry_run:
         print(cmd)
 
     if not args.dry_run:
         try:
-            call(cmd, shell=True)
+            run(cmd, shell=True)
 
         except OSError as e:
             print('Execution failed: {}'.format(e))
 
 def diffast(args):
-    run_diffast(args.original, args.modified, cache=args.cache, clear_cache=args.force, view=args.view,
-                dry_run=args.dry_run, devel=args.devel, image=args.image, verbose=args.verbose)
+    run_diffast(args.container_cmd,
+                args.original, args.modified, cache=args.cache, clear_cache=args.force, view=args.view,
+                dry_run=args.dry_run, devel=args.devel, image=args.image, verbose=args.verbose, debug=args.debug)
 
 def main():
-    parser = ArgumentParser(description='CCA driver',
+    parser = ArgumentParser(description='A CCA driver',
+                            add_help=False,
                             formatter_class=ArgumentDefaultsHelpFormatter)
 
    parser.add_argument('-n', '--dry-run', dest='dry_run', action='store_true',
                         help='only print container commands')
 
+    parser.add_argument('--container-command', dest='container_cmd', metavar='CMD',
+                        help='specify container command', default=CONTAINER_CMD)
+
     parser.add_argument('-i', '--image', dest='image', type=str, metavar='IMAGE',
                         default=IMAGE_NAME, help='specify container image')
 
     parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
-                        help='be verbose')
+                        help='enable verbose printing')
+
+    parser.add_argument('-d', '--debug', dest='debug', action='store_true',
+                        help='enable debug printing')
 
     parser.add_argument('-x', '--experimental', dest='devel', action='store_true',
                         help='use experimental image')
 
-    subparsers = parser.add_subparsers(title='subcommands')
+    p = ArgumentParser(add_help=True)
+
+    subparsers = p.add_subparsers(title='subcommands')
+
+    # Docker image update
+
     parser_update = subparsers.add_parser('update',
                                           description='Update docker image of CCA',
+                                          parents=[parser],
                                           formatter_class=ArgumentDefaultsHelpFormatter)
     parser_update.set_defaults(func=update)
 
+    # Diff/AST
+
     parser_diffast = subparsers.add_parser('diffast',
                                            description='Compare two programs',
+                                           parents=[parser],
                                            formatter_class=ArgumentDefaultsHelpFormatter)
     parser_diffast.add_argument('original', type=str, metavar='ORIGINAL',
                                 help='original source file')
@@ -226,14 +279,15 @@
 
     parser_diffast.set_defaults(func=diffast)
 
+    #
 
-    args = parser.parse_args()
+    args = p.parse_args()
 
     try:
         args.func(args)
     except:
         #raise
-        parser.print_help()
+        p.print_help()
 
 
 if __name__ == '__main__':
diff --git a/cca/factutils/python/factutils/__init__.py b/cca/factutils/python/factutils/__init__.py
deleted file mode 100644
index 0415291..0000000
--- a/cca/factutils/python/factutils/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/usr/bin/env python3
-
-__all__ = ['common']
diff --git a/cca/factutils/python/factutils/fact.py b/cca/factutils/python/factutils/fact.py
deleted file mode 100644
index 5cd1120..0000000
--- a/cca/factutils/python/factutils/fact.py
+++ /dev/null
@@ -1,266 +0,0 @@
-#!/usr/bin/env python3
-
-'''
-  Factutils: helper scripts for source code entities
-
-  Copyright 2012-2020 Codinuum Software Lab
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-'''
-
-import os.path
-import tempfile
-import gzip
-import RDF
-import logging
-
-from functools import reduce
-
-from .const import SPEC_NS, PREDICATE_NS, RELEASE_NS, SVNREV_NS, GITREV_NS, GUARD_NS
-from .rdf import Resource, Literal, Predicate, RDFNode
-
-
-logger = logging.getLogger()
-
-class Statement(object):
-    def __init__(self, subject=None, predicate=None, object=None, **args):
-        try:
-            stmt = args['statement']
-            self.subject = stmt.subject
-            self.predicate = stmt.predicate
-            self.object = stmt.object
-            self._stmt = RDF.Statement(statement=stmt._stmt)
-
-        except KeyError:
-            self.subject = subject
-            self.predicate = predicate
-            self.object = object
-            s = None
-            p = None
-            o = None
-            if isinstance(subject, Resource):
-                s = subject.as_node()
-            if isinstance(predicate, Predicate):
-                p = predicate.as_node()
-            if isinstance(object, RDFNode):
-                o = object.as_node()
-
-            self._stmt = RDF.Statement(s, p, o)
-
-
-    def __eq__(self, other):
-        res = False
-        if isinstance(other, Statement):
-            res = reduce(lambda x,y: x and y, [self.subject == other.subject,
-                                               self.predicate == other.predicate,
-                                               self.object == other.object])
-        return res
-
-
-
-class Fact(object):
-    def __init__(self, ns_tbl, large=False):
-
-        if large:
-            self._storage = RDF.HashStorage('db4', options="new='yes',hash-type='bdb'")
-        else:
-            self._storage = RDF.MemoryStorage()
-
-        self._model = RDF.Model(self._storage)
-
-        self._g_pred_map = {}
-        self._pred_tbl = {}
-
-        self.l_true = Literal('true')
-        self.l_false = Literal('false')
-
-        self.namespace_tbl = ns_tbl
-
-
-    def set_namespace(self, prefix, uri):
-        self.namespace_tbl[prefix] = uri
-
-    def contains(self, s, p, o):
-        stmt = self._create_statement(s, p, o)
-        return (stmt in self._model)
-
-    def create_release_version(self, rel):
-        return Resource(RELEASE_NS + rel)
-
-    def create_svn_revision(self, rev):
-        s = '%s%s' % (SVNREV_NS, rev)
-        return Resource(s)
-
-    def create_git_revision(self, rev):
-        s = '%s%s' % (GITREV_NS, rev)
-        return Resource(s)
-
-    def get_guard_pred(self, pred):
-        g_pred = None
-        try:
-            g_pred = self._g_pred_map[pred]
-
-        except KeyError:
-            g_pred = Predicate(GUARD_NS+'?orig='+pred.get_namespace(),
-                               pred.get_local_name())
-            self._g_pred_map[pred] = g_pred
-
-        return g_pred
-
-    def list_guards(self, s, p, o):
-        guards = []
-        gp = self.get_guard_pred(p)
-        q = self._create_statement(None, gp, s)
-        for stmt in self._model.find_statements(q):
-            g = Resource(node=stmt.subject)
-            if self.contains(g, gp, o):
-                guards.append(g)
-
-        return guards
-
-    def size(self):
-        return self._model.size()
-
-    def _add(self, subj, pred, obj):
-        self._model.add(subj.as_node(), pred.as_node(), obj.as_node())
-
-    def add(self, subj, pred, obj, attr=None, value=None):
-        self._add(subj, pred, obj)
-        if attr and value:
-            blk = None
-            guards = self.list_guards(subj, pred, obj)
-
-            if len(guards) == 0:
-                blk = Resource()
-                g_pred = self.get_guard_pred(pred)
-                self._add(blk, g_pred, subj)
-                self._add(blk, g_pred, obj)
-
-            else:
-                blk = guards[0]
-
-            if blk == None:
-                blk = Resource()
-
-            self._add(blk, attr, value)
-
-
-    def addStatement(self, stmt, attr=None, value=None):
-        self.add(stmt.subject, stmt.predicate, stmt.object, attr, value)
-
-    def _create_statement(self, subj, pred, obj):
-        s = None
-        p = None
-        o = None
-        if subj:
-            s = subj.as_node()
-        if pred:
-            p = pred.as_node()
-        if obj:
-            o = obj.as_node()
-        return RDF.Statement(s, p, o)
-
-    def _guess_fmt(self, path):
-        fmt = ''
-        if path.endswith('.nt'):
-            fmt = 'ntriples'
-        elif path.endswith('.ttl'):
-            fmt = 'turtle'
-        elif path.endswith('.rdf'):
-            fmt = 'rdfxml'
-        if path.endswith('.nt.gz'):
-            fmt = 'ntriples'
-        elif path.endswith('.ttl.gz'):
-            fmt = 'turtle'
-        elif path.endswith('.rdf.gz'):
-            fmt = 'rdfxml'
-
-        return fmt
-
-    def _mktemp(self):
-        (fd, path) = tempfile.mkstemp()
-        os.close(fd)
-        return path
-
-    def _gzipped(self, path):
-        return path.endswith('.gz')
-
-    def _gzip(self, from_file, to_file):
-        f_from = open(from_file, 'rb')
-        f_to = gzip.open(to_file, 'wb')
-        f_to.writelines(f_from)
-        f_to.close()
-        f_from.close()
-
-    def _gunzip(self, from_file, to_file):
-        f_from = gzip.open(from_file, 'rb')
-        f_to = open(to_file, 'wb')
-        f_to.writelines(f_from)
-        f_to.close()
-        f_from.close()
-
-    def write(self, path, fmt='', base_uri=None):
-        if fmt == '':
-            fmt = self._guess_fmt(path)
-
-        gzipped_path = None
-
-        if self._gzipped(path):
-            gzipped_path = path
-            tmp = self._mktemp()
-            path = tmp
-
-        serializer = RDF.Serializer(name=fmt)
-        for (prefix, uri) in self.namespace_tbl.items():
-            serializer.set_namespace(prefix, uri)
-
-        logger.info('writing to "%s"...' % path)
-        d = os.path.dirname(path)
-        if d != '' and not os.path.exists(d):
-            logger.warning('No such directory: "%s"' % d)
-            logger.info('creating "%s"...' % d)
-            os.makedirs(d)
-        serializer.serialize_model_to_file(path, self._model, base_uri=base_uri)
-        logger.info('done.')
-
-        if gzipped_path:
-            self._gzip(path, gzipped_path)
-            os.unlink(path)
-
-    def read(self, path, fmt='', base_uri=None):
-        if fmt == '':
-            fmt = self._guess_fmt(path)
-
-        gzipped = False
-
-        if self._gzipped(path):
-            gzipped = True
-            tmp = self._mktemp()
-            self._gunzip(path, tmp)
-            path = tmp
-
-        parser = RDF.Parser(name=fmt)
-        logger.info('reading "%s"...' % path)
-        parser.parse_into_model(self._model,
-                                'file://' + os.path.abspath(path),
-                                base_uri=base_uri)
-        logger.info('done.')
-
-        if gzipped:
-            os.unlink(tmp)
-
-
-    def query(self, qstr, base_uri=None):
-        q = RDF.SPARQLQuery(qstr, base_uri=base_uri)
-        results = q.execute(self._model)
-        return results
diff --git a/cca/factutils/python/factutils/pathsetup.py b/cca/factutils/python/factutils/pathsetup.py
deleted file mode 100644
index 9ecda18..0000000
--- a/cca/factutils/python/factutils/pathsetup.py
+++ /dev/null
@@ -1,7 +0,0 @@
-
-import sys
-
-# lib_dir = 'CCA_HOME/scripts'
-
-# if lib_dir not in sys.path:
-#     sys.path.append(lib_dir)
diff --git a/cca/factutils/python/factutils/rdf.py b/cca/factutils/python/factutils/rdf.py
deleted file mode 100644
index 428c1f1..0000000
--- a/cca/factutils/python/factutils/rdf.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#!/usr/bin/env python3
-
-'''
-  Factutils: helper scripts for source code entities
-
-  Copyright 2012-2020 Codinuum Software Lab
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-'''
-
-import RDF
-import logging
-
-
-logger = logging.getLogger()
-
-def mkuri(s):
-    return RDF.Uri(s)
-
-def uri_split(uri):
-    lname = uri.split('/')[-1].split('#')[-1]
-    ns = uri[:len(uri)-len(lname)]
-    return ns, lname
-
-
-class RDFNode(object):
-    def __init__(self, nd):
-        self._valid = True
-        self._node = nd
-
-    def __eq__(self, other):
-        res = False
-        if isinstance(other, RDFNode):
-            res = self._node == other._node
-
-        return res
-
-    def is_valid(self):
-        return self._valid
-
-    def as_node(self):
-        return self._node
-
-class Resource(RDFNode):
-    def __init__(self, uri=None, **args):
-        nd = args.get('node', None)
-        if nd != None:
-            RDFNode.__init__(self, nd)
-        else:
-            if uri != None:
-#                if isinstance(uri, str):
-#                    uri = uri.encode()
-                try:
-                    RDFNode.__init__(self, RDF.Node(uri_string=uri))
-                except:
-                    logger.warning('uri="%s"(%s)' % (uri, str(type(uri))))
-                    raise
-            else:
-                RDFNode.__init__(self, RDF.Node()) # blank node
-
-    def __eq__(self, other):
-        res = False
-        if isinstance(other, Resource):
-            if self._node.is_resource() and other._node.is_resource():
-                res = self.get_uri() == other.get_uri()
-            else:
-                res = self._node == other._node
-
-        return res
-
-    def __lt__(self, other):
-        return str(self.get_uri()) < str(other.get_uri())
-
-    def __gt__(self, other):
-        return str(self.get_uri()) > str(other.get_uri())
-
-    def __le__(self, other):
-        self.__eq__(other) or self.__lt__(other)
-
-    def __ge__(self, other):
-        self.__eq__(other) or self.__gt__(other)
-
-    def __hash__(self):
-        return str(self.get_uri()).__hash__()
-
-
-    def __str__(self):
-        return '<%s>' % self.get_uri()
-
-    def get_uri(self):
-        return str(self.as_node().uri)
-
-    def get_namespane(self):
-        ns, ln = uri_split(self.get_uri())
-        return ns
-
-    def get_local_name(self):
-        ns, ln = uri_split(self.get_uri())
-        return ln
-
-
-class Literal(RDFNode):
-    def __init__(self, literal="", **args):
-        nd = args.get('node', None)
-        if nd != None:
-            RDFNode.__init__(self, nd)
-        else:
-            RDFNode.__init__(self, RDF.Node(literal=literal, **args))
-
-    def __eq__(self, other):
-        res = False
-        if isinstance(other, Literal):
-            res = self.get_content() == other.get_content()
-
-        return res
-
-    def __str__(self):
-        return '"%s"' % self.get_content()
-
-    def get_content(self):
-        return self._node.literal_value
-
-
-
-class Predicate(Resource):
-    def __init__(self, ns=None, lname=None, **args):
-        self._lname = None
-        self._ns = None
-
-        uri = None
-        node = args.get('node', None)
-
-        if ns == None or lname==None:
-            uri = args.get('uri', None)
-
-            if uri == None:
-                if node != None:
-                    uri = str(node.uri)
-
-            self._ns, self._lname = uri_split(uri)
-
-        else:
-            self._ns = ns
-            self._lname = lname
-            uri = ns + lname
-
-        Resource.__init__(self, uri, **args)
-
-    def __str__(self):
-        return '<%s>' % self.get_uri()
-
-    def get_namespace(self):
-        return self._ns
-
-    def get_local_name(self):
-        return self._lname
diff --git a/cca/scripts/cca_diff_versions_for_fact.py b/cca/scripts/cca_diff_versions_for_fact.py
deleted file mode 100755
index 1ec2c62..0000000
--- a/cca/scripts/cca_diff_versions_for_fact.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/env python3
-
-
-'''
-  cca_diff_versions_for_fact.py
-
-  Copyright 2012-2020 Codinuum Software Lab
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-'''
-
-import verdiff
-
-if __name__ == '__main__':
-    verdiff.compute(load_fact=True)
diff --git a/cca/scripts/diff_dirs.py b/cca/scripts/diff_dirs.py
new file mode 100755
index 0000000..887d049
--- /dev/null
+++ b/cca/scripts/diff_dirs.py
@@ -0,0 +1,5 @@
+#!/usr/bin/env python3
+
+if __name__ == '__main__':
+    from cca.ccautil.srcdiff import test_diff_dirs
+    test_diff_dirs()
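The new cca/scripts/diff_dirs.py above is representative of how this patch rewires every launcher script: the script body shrinks to an import of an entry point from the installed cca.ccautil package. A minimal sketch of the same wrapper pattern for an arbitrary module (the module name some_tool is a placeholder, not part of this patch):

    #!/usr/bin/env python3

    # Hypothetical launcher following the pattern above: delegate to the
    # installed cca.ccautil package instead of manipulating sys.path.
    if __name__ == '__main__':
        from cca.ccautil.some_tool import main  # 'some_tool' is a placeholder
        main()

Keeping the scripts this thin means the installed package, not the script's location, decides which code runs, which is what makes deleting the pathsetup.py shims below possible.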
diff --git a/cca/scripts/java_token_diff.py b/cca/scripts/java_token_diff.py
index c8d5912..eb36fa0 100755
--- a/cca/scripts/java_token_diff.py
+++ b/cca/scripts/java_token_diff.py
@@ -1,347 +1,5 @@
 #!/usr/bin/env python3
 
-'''
-  java_token_diff.py
-
-  Copyright 2018-2019 Chiba Institute of Technology
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-'''
-
-__author__ = 'Masatomo Hashimoto '
-
-import sys
-import os
-import logging
-
-import filecmp
-from difflib import SequenceMatcher
-from javalang import tokenizer
-
-logger = logging.getLogger()
-
-def is_src(f):
-    return f.endswith('.java')
-
-def get_tokens(path):
-    toks = []
-    try:
-        with open(path, 'r') as f:
-            for tok in tokenizer.tokenize(f.read()):
-                toks.append(tok.value)
-    except Exception as e:
-        pass
-
-    seq = []
-
-    while True:
-        try:
-            tok = toks.pop(0)
-
-            if tok == '.':
-                try:
-                    nxt = toks.pop(0)
-                    r = '.' + nxt
-                    if seq:
-                        if seq[-1] not in (',','('):
-                            seq[-1] += r
-                        else:
-                            seq.append(r)
-                    else:
-                        seq.append(r)
-
-                except IndexError:
-                    seq.append(tok)
-
-            elif tok == ',':
-                try:
-                    nxt = toks.pop(0)
-                    if nxt in ('}', ';'):
-                        seq.append(nxt)
-                    else:
-                        seq.append(tok)
-                        seq.append(nxt)
-
-                except IndexError:
-                    seq.append(tok)
-
-            else:
-                seq.append(tok)
-
-        except IndexError:
-            break
-
-    return seq
-
-def count_tokens(path):
-    c = len(get_tokens(path))
-    return c
-
-def get_files(x):
-    l = []
-    for (d, dns, ns) in os.walk(x):
-        for n in ns:
-            p = os.path.join(d, n)
-            if is_src(p):
-                l.append(p)
-    return l
-
-def get_pre_context(toks, i):
-    if i > 2:
-        pre = ' '.join(toks[i-3:i])
-    elif i == 2:
-        pre = ' '.join(toks[i-2:i])
-    elif i == 1:
-        pre = ' '.join(toks[i-1:i])
-    else:
-        pre = ' '.join(toks[0:i])
-    return pre
-
-def get_post_context(toks, i):
-    post = ' '.join(toks[i:i+5])
-    return post
-
-def get_context(toks, i):
-    return (get_pre_context(toks, i), get_post_context(toks, i))
-
-def diff_to_str(d, toks1, toks2):
-    dels = d['delete']
-    repls = d['replace']
-    inss = d['insert']
-
-    lines = []
-
-    if dels:
-        for ((a, b), _) in dels:
-            pre, post = get_pre_context(toks1, a), get_post_context(toks1, b)
-            lines.append('[DELETE] {}-{} ({}):\n'.format(a, b-1, b-a))
-            lines.append('  {}\n'.format(pre))
-            lines.append('- ')
-            lines.append(' '.join(toks1[a:b]))
-            lines.append('\n')
-            lines.append('  {}\n'.format(post))
-    if repls:
-        for ((a, b), (a2, b2)) in repls:
-            pre, post = get_pre_context(toks1, a), get_post_context(toks1, b)
-            lines.append('[REPLACE] {}-{} -> {}-{} ({}->{}):\n'.format(a, b-1, a2, b2-1,
-                                                                       b-a, b2-a2))
-            lines.append('  {}\n'.format(pre))
-            lines.append('- ')
-            lines.append(' '.join(toks1[a:b]))
-            lines.append('\n-----\n')
-            lines.append('+ ')
-            lines.append(' '.join(toks2[a2:b2]))
-            lines.append('\n')
-            lines.append('  {}\n'.format(post))
-    if inss:
-        for ((i, _), (a, b)) in inss:
-            pre, post = get_context(toks1, i)
-            lines.append('[INSERT] {} -> {}-{} ({}):\n'.format(i, a, b-1, b-a))
-            lines.append('  {}\n'.format(pre))
-            lines.append('+ ')
-            lines.append(' '.join(toks2[a:b]))
-            lines.append('\n')
-            lines.append('  {}\n'.format(post))
-
-    s = ''.join(lines)
-
-    return s
-
-def print_diff(d, toks1, toks2):
-    print(diff_to_str(d, toks1, toks2))
-
-def size_of_diff(d):
-    sz = 0
-    for ((i1, i2), _) in d['delete']:
-        sz += i2 - i1
-
-    for ((i1, i2), (j1, j2)) in d['replace']:
-        sz += i2 - i1 + j2 - j1
-
-    for (_, (j1, j2)) in d['insert']:
-        sz += j2 - j1
-
-    return sz
-
-def diff_tokens(toks1, toks2):
-    m = SequenceMatcher(isjunk=None, a=toks1, b=toks2)
-    d = {'replace':[],'delete':[],'insert':[]}
-    for (tag, i1, i2, j1, j2) in m.get_opcodes():
-        if tag != 'equal':
-            d[tag].append(((i1, i2), (j1, j2)))
-    d['sim'] = m.ratio()
-    nm = 0
-    for nt in m.get_matching_blocks():
-        nm += nt.size
-    d['nmatches'] = nm
-    return d
-
-def is_equivalent_file(path1, path2):
-    if filecmp.cmp(path1, path2, shallow=False):
-        logger.info('same files')
-        return True
-
-    logger.info('comparing {} with {}'.format(path1, path2))
-
-    toks1 = get_tokens(path1)
-    toks2 = get_tokens(path2)
-    b = toks1 == toks2
-    return b
-
-def all_different(paths):
-    n = len(paths)
-    for i in range(n-1):
-        for j in range(i+1, n):
-            if filecmp.cmp(paths[i], paths[j], shallow=False):
-                logger.info('same files: {} {}'.format(paths[i], paths[j]))
-                return False
-
-    toks_list = [None for _ in paths]
-
-    for i in range(n-1):
-        for j in range(i+1, n):
-            if toks_list[i] == None:
-                toks_list[i] = get_tokens(paths[i])
-            if toks_list[j] == None:
-                toks_list[j] = get_tokens(paths[j])
-            if toks_list[i] == toks_list[j]:
-                logger.info('equivalent files: {} {}'.format(paths[i], paths[j]))
-                return False
-
-    return True
-
-def compare_files(path1, path2, simple=False):
-    if filecmp.cmp(path1, path2, shallow=False):
-        logger.info('same files')
-        return {'count':0,'diff':'','sim':1.0}
-    elif simple:
-        logger.info('different files')
-        return {}
-
-    logger.info('comparing {} with {}'.format(path1, path2))
-
-    toks1 = get_tokens(path1)
-    toks2 = get_tokens(path2)
-    d = diff_tokens(toks1, toks2)
-    c = size_of_diff(d)
-    s = diff_to_str(d, toks1, toks2)
-    sim = d['sim']
-    nm = d['nmatches']
-    dist = float(c) / (float(nm) if nm > 0 else 1.0)
-    ret = {'count':c,'diff':s,'sim':sim,'dist':dist}
-    return ret
-
-def compare_dirs(d1, d2, simple=False):
-    #print('comparing {} with {}'.format(d1, d2))
-
-    dcmp = filecmp.dircmp(d1, d2)
-    removed_files = []
-    added_files = []
-    modified_files = []
-
-    removed_dirs = []
-    added_dirs = []
-
-    def scan(dc):
-        for f in dc.left_only:
-            p = os.path.join(dc.left, f)
-            if is_src(f):
-                removed_files.append(p)
-
-            elif os.path.isdir(p):
-                removed_dirs.append(p)
-
-        for f in dc.right_only:
-            p = os.path.join(dc.right, f)
-            if is_src(f):
-                added_files.append(p)
-
-            elif os.path.isdir(p):
-                added_dirs.append(p)
-
-        for f in dc.diff_files:
-            if is_src(f):
-                p1 = os.path.join(dc.left, f)
-                p2 = os.path.join(dc.right, f)
-                modified_files.append((p1, p2))
-
-        for subd in dc.subdirs.values():
-            scan(subd)
-
-    scan(dcmp)
-
-    count = 0
-
-    for f in removed_files:
-        count += count_tokens(f)
-
-    for f in added_files:
-        count += count_tokens(f)
-
-    for d in removed_dirs:
-        for f in get_files(d):
-            count += count_tokens(f)
-
-    for d in added_dirs:
-        for f in get_files(d):
-            count += count_tokens(f)
-
-    for (f1, f2) in modified_files:
-        r = compare_files(f1, f2, simple=simple)
-        if r:
-            count += r['count']
-
-    return count
-
-
 if __name__ == '__main__':
-    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
-
-    parser = ArgumentParser(description='compute size of token sequence delta',
-                            formatter_class=ArgumentDefaultsHelpFormatter)
-
-    parser.add_argument('path1', type=str)
-    parser.add_argument('path2', type=str)
-
-    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
-                        help='enable verbose printing')
-
-    parser.add_argument('-d', '--debug', dest='debug', action='store_true',
-                        help='enable debug printing')
-
-    parser.add_argument('-s', '--simple', dest='simple', action='store_true',
-                        help='only checks if file1 is equivalent to file2')
-
-    args = parser.parse_args()
-
-    log_level = logging.WARNING
-    if args.verbose:
-        log_level = logging.INFO
-    if args.debug:
-        log_level = logging.DEBUG
-    logging.basicConfig(format='[%(levelname)s][%(funcName)s] %(message)s', level=log_level)
-
-    c = None
-
-    if os.path.isfile(args.path1) and os.path.isfile(args.path2):
-        r = compare_files(args.path1, args.path2, simple=args.simple)
-        if r:
-            d = r['diff']
-            if d:
-                logger.debug('differences:\n{}'.format(d))
-            c = r['count']
-
-    elif os.path.isdir(args.path1) and os.path.isdir(args.path2):
-        c = compare_dirs(args.path1, args.path2, simple=args.simple)
-
-    print(c)
+    from cca.ccautil.java_token_diff import main
+    main()
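The implementation deleted above moves to python/src/cca/ccautil/java_token_diff.py per the diffstat. A short usage sketch, assuming the relocated module keeps the get_tokens/diff_tokens/size_of_diff names shown in the removed code (the file paths are placeholders):

    from cca.ccautil.java_token_diff import get_tokens, diff_tokens, size_of_diff

    toks1 = get_tokens('Old.java')   # javalang token values, with '.'-chains joined
    toks2 = get_tokens('New.java')
    d = diff_tokens(toks1, toks2)    # SequenceMatcher opcodes grouped by edit kind
    print(d['sim'])                  # similarity ratio of the two token sequences
    print(size_of_diff(d))           # tokens touched by delete/replace/insert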
diff --git a/cca/scripts/pathsetup.py b/cca/scripts/pathsetup.py
deleted file mode 100644
index 734f8a1..0000000
--- a/cca/scripts/pathsetup.py
+++ /dev/null
@@ -1,25 +0,0 @@
-
-import os
-import sys
-
-from siteconf import CCA_HOME
-
-HOME = os.environ['HOME']
-
-LOG_DIR = os.path.join(CCA_HOME, 'log')
-
-#if not os.path.exists(LOG_DIR):
-#    os.makedirs(LOG_DIR)
-
-_CONFIGS_DIR = os.path.join(CCA_HOME, 'configs')
-CONFIGS_DIR = os.getenv('CCA_CONFIGS_DIR', _CONFIGS_DIR)
-FACTUTILS_DIR = os.path.join(CCA_HOME, 'factutils', 'python')
-
-dirs = [ CONFIGS_DIR,
-         FACTUTILS_DIR,
-         ]
-
-for d in dirs:
-    if d not in sys.path:
-        sys.path.append(d)
-
diff --git a/cca/scripts/sim.py b/cca/scripts/sim.py
index 7cd4f76..ab24d48 100755
--- a/cca/scripts/sim.py
+++ b/cca/scripts/sim.py
@@ -1,79 +1,5 @@
 #!/usr/bin/env python3
 
-
-'''
-  sim.py
-
-  Copyright 2012-2020 Codinuum Software Lab
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-'''
-
-import filecmp
-import difflib
-
-import java_token_diff as java
-
-
-def line_sim(f1, f2):
-    if filecmp.cmp(f1, f2):
-        return 0.0
-    lines1 = open(f1, 'U').readlines()
-    lines2 = open(f2, 'U').readlines()
-    matcher = difflib.SequenceMatcher(None, lines1, lines2)
-    similarity = matcher.quick_ratio()
-    return similarity
-
-
-def java_sim(f1, f2):
-    if filecmp.cmp(f1, f2):
-        return 0.0
-    toks1 = java.get_tokens(f1)
-    toks2 = java.get_tokens(f2)
-    matcher = difflib.SequenceMatcher(isjunk=None, a=toks1, b=toks2)
-    similarity = matcher.quick_ratio()
-    return similarity
-
-
-def sim(f1, f2, plain=False):
-    similarity = 0.0
-    if not filecmp.cmp(f1, f2):
-        if plain:
-            similarity = line_sim(f1, f2)
-        elif java.is_src(f1) and java.is_src(f2):
-            similarity = java_sim(f1, f2)
-        else:
-            similarity = line_sim(f1, f2)
-
-    return similarity
-
-
 if __name__ == '__main__':
-    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
-
-    parser = ArgumentParser(description='compute similarity of files',
-                            formatter_class=ArgumentDefaultsHelpFormatter)
-
-    parser.add_argument('path1', type=str)
-    parser.add_argument('path2', type=str)
-
-    parser.add_argument('--plain', dest='plain', action='store_true',
-                        help='perform language agnostic differencing')
-
-    args = parser.parse_args()
-
-    try:
-        s = sim(args.path1, args.path2, plain=args.plain)
-        print(s)
-    except IOError as e:
-        print('ERROR: {}'.format(str(e)))
+    from cca.ccautil.sim import main
+    main()
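Likewise, sim.py now defers to cca.ccautil.sim. A small sketch of the similarity API, assuming the moved module preserves the sim() signature visible in the removed code (the paths are placeholders):

    from cca.ccautil.sim import sim

    s = sim('a/Foo.java', 'b/Foo.java')                # token-based for .java files
    t = sim('a/notes.txt', 'b/notes.txt', plain=True)  # line-based differencing
    print(s, t)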
diff --git a/cca/scripts/sparql.py b/cca/scripts/sparql.py
index e227121..d3dfc11 100755
--- a/cca/scripts/sparql.py
+++ b/cca/scripts/sparql.py
@@ -1,292 +1,5 @@
 #!/usr/bin/env python3
 
-
-'''
-  A SPARQL driver
-
-  Copyright 2012-2020 Codinuum Software Lab
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-'''
-
-# Fortran namespaces added by Masatomo Hashimoto
-
-import logging
-
-import pathsetup
-from siteconf import SPARQL_ENDPOINT
-from virtuoso import ODBCDriver, VIRTUOSO_PW, VIRTUOSO_PORT, get_odbc_connect_string
-import ns
-from factutils.const import ENTITY_NS, VARIANT_NS, SVNREV_NS, GITREV_NS, RELEASE_NS
-from common import setup_logger
-
-logger = logging.getLogger()
-
-
-NAMESPACES = { 'xsd' : ns.XSD_NS,
-               'owl' : ns.OWL_NS,
-               'rdf' : ns.RDF_NS,
-               'fb'  : ns.FB_NS,
-               'src' : ns.SRC_NS,
-               'ver' : ns.VER_NS,
-               'chg' : ns.CHG_NS,
-               'git' : ns.GIT_NS,
-
-               'ent'     : ENTITY_NS,
-               'variant' : VARIANT_NS,
-               'svnrev'  : SVNREV_NS,
-               'gitrev'  : GITREV_NS,
-               'rel'     : RELEASE_NS,
-
-               'f'    : ns.F_NS,
-               'pa'   : ns.PA_NS,
-               'fjpa' : ns.FJPA_NS,
-               'fpt'  : ns.FPT_NS,
-
-               'fjpadata' : ns.PREFIX_TBL['fjpadata'],
-               'entpair'  : ns.PREFIX_TBL['entpair'],
-               'chgpat'   : ns.PREFIX_TBL['chgpat'],
-               'chginst'  : ns.PREFIX_TBL['chginst'],
-               }
-
-
-
-def get_localname(s):
-    res = s
-    if s:
-        try:
-            if s.startswith('http://'):
-                res = (s.split('/'))[-1].split('#')[-1]
-        except Exception as e:
-            logger.warning(str(e))
-
-    return res
-
-
-
-class Driver(object):
-    def __init__(self):
-        self._ns_tbl = {}
-        for (n, p) in NAMESPACES.items():
-            self._ns_tbl[p] = n
-
-    def to_prefixed_form(self, v):
-        r = v
-        if v:
-            try:
-                for p in self._ns_tbl.keys():
-                    if str(v).startswith(p):
-                        r = '%s:%s' % (self._ns_tbl[p], v[len(p):])
-                        break
-            except Exception as e:
-                logger.warning('"%s": %s' % (v, e))
-
-        return r
-
-
-    def execute(self, q):
-        pass
-
-    def query(self, q, abbrev=False):
-        return None
-
-    def fetchone(self, q, abbrev=False):
-        return None
-
-
-class VirtuosoODBCDriver(ODBCDriver, Driver):
-    def __init__(self, pw=VIRTUOSO_PW, port=VIRTUOSO_PORT):
-        connect_string = get_odbc_connect_string(pwd=pw, port=port)
-        ODBCDriver.__init__(self, connect_string)
-        Driver.__init__(self)
-
-    def conv_row(self, row, abbrev=False):
-        if row and abbrev:
-            for (k, v) in row.items():
-                row[k] = self.to_prefixed_form(v)
-
-        return row
-
-    def query(self, q, abbrev=False):
-        #logger.debug('query:\n{}'.format(q))
-        for qvs, row in ODBCDriver.query(self, 'SPARQL\n'+q):
-            yield qvs, self.conv_row(row, abbrev)
-
-    def execute(self, q):
-        ODBCDriver.execute(self, 'SPARQL\n'+q)
-
-    def fetchone(self, q, abbrev=False):
-        r = ODBCDriver.fetchone(self, 'SPARQL\n'+q)
-        if r:
-            r = self.conv_row(r, abbrev)
-        return r
-
-
-
-class VirtuosoHTTPDriver(Driver):
-    def __init__(self, endpoint=SPARQL_ENDPOINT):
-        self._endpoint = endpoint
-
-    def conv_binding(self, b, abbrev=False):
-        d = {}
-        for k in b.keys():
-            data = b[k]
-            v = str(data['value'])
-            ty = data['type']
-            if ty == 'typed-literal':
-                dty = self.to_prefixed_form(data['datatype'])
-                logger.debug('%s (%s)' % (v, dty))
-                if dty == 'xsd:decimal':
-                    v = float(v)
-                elif dty == 'xsd:integer':
-                    v = int(v)
-
-            if abbrev:
-                if ty == 'uri':
-                    v = self.to_prefixed_form(v)
-
-            d[k] = v
-        return d
-
-    def _exec(self, q, limit=-1):
-        import json
-        from urllib.parse import urlencode
-        from urllib.request import Request, urlopen
-
-        format = 'application/json'
-
-        if limit < 0:
-            maxrows = ''
-        else:
-            maxrows = str(limit)
-
-        params = {
-            'query'   : q,
-            'format'  : format,
-            'maxrows' : maxrows,
-        }
-
-        qpart = urlencode(params)
-
-        req = Request(self._endpoint, qpart)
-
-        response = urlopen(req).read()
-
-        result = json.loads(response)
-
-        return result
-
-    def execute(self, q):
-        self._exec(q)
-
-    def fetchone(self, q, abbrev=False):
-        row = None
-        try:
-            r = self._exec(q, limit=1)
-            b = r['results']['bindings'][0]
-            row = self.conv_binding(b, abbrev)
-        except:
-            pass
-
-        return row
-
-    def query(self, q, abbrev=False, limit=-1):
-        result = self._exec(q, limit)
-        for b in result['results']['bindings']:
-            qvs = [str(v) for v in result['head']['vars']]
-            yield qvs, self.conv_binding(b, abbrev)
-
-
-
-
-def get_driver(method='http', pw=VIRTUOSO_PW, port=VIRTUOSO_PORT):
-    driver = None
-    if method == 'http':
-        driver = VirtuosoHTTPDriver()
-    elif method == 'odbc':
-        driver = VirtuosoODBCDriver(pw=pw, port=port)
-    else:
-        logger.error('unknown method: "%s"' % method)
-    return driver
-
-
-def query():
-    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
-
-    parser = ArgumentParser(description='Execute SPARQL Query',
-                            formatter_class=ArgumentDefaultsHelpFormatter)
-
-    parser.add_argument('query_file', type=str, help='query file')
-
-    parser.add_argument('-d', '--debug', dest='debug', action='store_true', help='enable debug printing')
-
-    parser.add_argument('--port', dest='port', default=VIRTUOSO_PORT,
-                        metavar='PORT', type=int, help='set port number')
-
-    parser.add_argument('--pw', dest='pw', metavar='PASSWORD', default=VIRTUOSO_PW,
-                        help='set password to access DB')
-
-    parser.add_argument('-m', '--method', dest='method', default='odbc',
-                        metavar='METHOD', type=str, help='execute query via METHOD (http|odbc)')
-
-
-    args = parser.parse_args()
-
-    log_level = logging.INFO
-    if args.debug:
-        log_level = logging.DEBUG
-    setup_logger(logger, log_level)
-
-    qfile = args.query_file
-
-    logger.info('method: "%s"' % args.method)
-    logger.info('query: "%s"' % qfile)
-
-
-    driver = get_driver(args.method, pw=args.pw, port=args.port)
-
-    count = 0
-
-    try:
-        f = open(qfile, 'r')
-        q = f.read()
-        f.close()
-
-        for vs, r in driver.query(q, abbrev=True):
-            row = []
-            for v in vs:
-                row.append(' %s="%s"' % (v, r[v]))
-            print('* row[%d]' % count)
-            print('\n'.join(row))
-            count += 1
-
-    except Exception as e:
-        #logger.error(str(e))
-        raise
-
-    print('%d rows' % count)
-
-
-def test():
-    #sparql = VirtuosoODBCDriver()
-    sparql = VirtuosoHTTPDriver()
-
-    q = 'DEFINE input:inference "ont.cpi" SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 10'
-
-    for r in sparql.query(q):
-        print(r)
-
-
 if __name__ == '__main__':
-    query()
-
+    from cca.ccautil.sparql import main
+    main()
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..ec7d14a
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,6 @@
+# Code Continuity Analysis Framework
+
+This package is composed of basic helper scripts for the following:
+* reading repositories,
+* differencing source files with Diff/AST, and
+* querying/manipulating factbases.
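The README stays deliberately terse; its querying bullet maps onto the SPARQL driver relocated into cca.ccautil.sparql. A hedged sketch, assuming the moved module keeps the get_driver/query interface shown in the removed cca/scripts/sparql.py (the query string is a placeholder):

    from cca.ccautil.sparql import get_driver

    driver = get_driver('http')  # or 'odbc' for a direct Virtuoso connection
    for vs, row in driver.query('SELECT ?s WHERE { ?s ?p ?o } LIMIT 5', abbrev=True):
        print(row)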
diff --git a/python/pyproject.toml b/python/pyproject.toml
new file mode 100644
index 0000000..7e01ab2
--- /dev/null
+++ b/python/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ['setuptools', 'wheel']
+build-backend = 'setuptools.build_meta'
diff --git a/python/setup.cfg b/python/setup.cfg
new file mode 100644
index 0000000..982f938
--- /dev/null
+++ b/python/setup.cfg
@@ -0,0 +1,19 @@
+[metadata]
+name = cca
+version = attr: cca.ccautil.__version__
+url = https://github.com/codinuum/cca
+author = Codinuum Software Lab
+author_email = codinuum@me.com
+license = Apache-2.0
+description = Helper scripts for the Code Continuity Analysis Framework
+long_description = file: README.md
+
+[options]
+package_dir =
+    =src
+packages = find_namespace:
+install_requires =
+    rdflib >= 5.0
+
+[options.packages.find]
+where = src
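With the src layout and find_namespace: packaging above, cca becomes a namespace package whose cca.ccautil and cca.factutil subpackages install side by side; the Dockerfile earlier in this patch builds it with python3 -m build and installs the resulting sdist. A quick post-install sanity check:

    # Assumes the package was installed as in the Dockerfile
    # (python3 -m build; pip3 install dist/cca-*.tar.gz).
    import cca.ccautil
    import cca.factutil

    print(cca.ccautil.__version__)  # '0.1' per python/src/cca/ccautil/__init__.py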
diff --git a/cca/scripts/AST.py b/python/src/cca/ccautil/AST.py
similarity index 97%
rename from cca/scripts/AST.py
rename to python/src/cca/ccautil/AST.py
index 6adb4df..3bf10d9 100644
--- a/cca/scripts/AST.py
+++ b/python/src/cca/ccautil/AST.py
@@ -24,11 +24,8 @@
 import re
 import logging
 
-import pathsetup
-from cca_config import ast_ext, compress_cmd
-import cca_options
-import tp
-import project
+from .cca_config import ast_ext, compress_cmd
+from . import cca_options, tp, project
 
 logger = logging.getLogger()
 
diff --git a/cca/scripts/Git2.py b/python/src/cca/ccautil/Git2.py
similarity index 99%
rename from cca/scripts/Git2.py
rename to python/src/cca/ccautil/Git2.py
index 5c7d0ec..cd5ea65 100644
--- a/cca/scripts/Git2.py
+++ b/python/src/cca/ccautil/Git2.py
@@ -24,11 +24,11 @@
 import pygit2
 from datetime import datetime, timedelta, tzinfo
 import re
+import logging
 
-import pathsetup
-from factutils.fileid import FileDigest, HashAlgo
-
+from cca.factutil.fileid import FileDigest, HashAlgo
 
+logger = logging.getLogger()
 
 ###############
 
diff --git a/cca/scripts/SVN.py b/python/src/cca/ccautil/SVN.py
similarity index 97%
rename from cca/scripts/SVN.py
rename to python/src/cca/ccautil/SVN.py
index 6fc0322..1b48eb4 100644
--- a/cca/scripts/SVN.py
+++ b/python/src/cca/ccautil/SVN.py
@@ -24,12 +24,10 @@
 import pysvn
 from urllib.request import url2pathname
 from urllib.parse import urlparse
-import datetime 
+import datetime
 import re
 import logging
 
-import pathsetup
-
 logger = logging.getLogger()
 
 
@@ -105,7 +103,7 @@ def get_login(r, u, s):
 
     def get_head_rev(self):
-        entries = self.svn_cli.info2(self._svn_url, 
+        entries = self.svn_cli.info2(self._svn_url,
                                      revision=pysvn.Revision(pysvn.opt_revision_kind.head),
                                      recurse=False)
         for (p, info) in entries:
@@ -127,7 +125,7 @@
                 return root
         return None
 
-    
+
     def get_kind(self, item, revnum):
         rev = pysvn.Revision(pysvn.opt_revision_kind.number, revnum)
 
@@ -160,7 +158,7 @@
                                      diff_options=options,
                                      #use_git_diff_format=git,
                                      )
-        
+
         if delta != '' and outfile != None:
             f = None
             try:
@@ -171,7 +169,7 @@
             finally:
                 if f:
                     f.close()
-        
+
         return delta
 
     def get_changed_items(self, revnum1, revnum2):
@@ -272,10 +270,10 @@
         else:
             rev = pysvn.Revision(pysvn.opt_revision_kind.head)
 
-        self.svn_cli.checkout(url, 
-                              dest, 
+        self.svn_cli.checkout(url,
+                              dest,
                               recurse=True,
-                              revision=rev, 
+                              revision=rev,
                               ignore_externals=False)
 
     def mkdir(self, d, verbose=False):
@@ -289,7 +287,7 @@
             p = os.path.dirname(d)
             self.mkdir(p)
             os.mkdir(d)
-        
+
     def checkout_source(self, path, revnum=None, verbose=False):
         revnum_s = str(revnum)
 
@@ -346,7 +344,7 @@
                     rmap[rev] = {message}
             except KeyError:
                 lmap[author] = {rev:{message}}
-        
+
     except pysvn.ClientError as e:
         logger.warning(str(e))
 
@@ -373,7 +371,7 @@
             rev = _rev.number
 
 #            print('rev={} author={}'.format(rev, author))
-            
+
             num = d['number'] + 1
 
             last_col = max(len(d['line']) - 1, 0)
diff --git a/python/src/cca/ccautil/__init__.py b/python/src/cca/ccautil/__init__.py
new file mode 100644
index 0000000..11d27f8
--- /dev/null
+++ b/python/src/cca/ccautil/__init__.py
@@ -0,0 +1 @@
+__version__ = '0.1'
diff --git a/cca/scripts/cca_command.py b/python/src/cca/ccautil/cca_command.py
old mode 100755
new mode 100644
similarity index 95%
rename from cca/scripts/cca_command.py
rename to python/src/cca/ccautil/cca_command.py
index 11c3668..b6a9c37
--- a/cca/scripts/cca_command.py
+++ b/python/src/cca/ccautil/cca_command.py
@@ -4,7 +4,7 @@
 '''
   cca_command.py
 
-  Copyright 2012-2020 Codinuum Software Lab
+  Copyright 2012-2021 Codinuum Software Lab
 
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
@@ -27,9 +27,9 @@
 from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, Action
 import logging
 
-import pathsetup
-import proc
-from common import setup_logger
+from . import proc
+from .siteconf import CCA_HOME
+from .common import setup_logger
 
 logger = logging.getLogger()
 
@@ -81,16 +81,16 @@ def sub(args, PHASE, PROJ, WDIR_BASE, NPROCS):
     run_workers_cmd_path = os.path.join(DIST_DIR, 'run_workers.py')
 
     workcmd = cmd_fmt % { 'log' : log_base + '.work',
-                          'subcmd' : '{} -n {} -c {} {} {}'.format(run_workers_cmd_path, 
-                                                                   NPROCS,
-                                                                   SUB_CMD_NAME, 
-                                                                   sub_args, 
+                          'subcmd' : '{} -n {} -c {} {} {}'.format(run_workers_cmd_path,
+                                                                   NPROCS,
+                                                                   SUB_CMD_NAME,
+                                                                   sub_args,
                                                                    WDIR),
                           }
 
     collcmd = cmd_fmt % { 'log' : log_base + '.collect',
-                          'subcmd' : '{} {} -c collect -b {} {} {}'.format(SUB_CMD_PATH, 
-                                                                           PROJ, 
+                          'subcmd' : '{} {} -c collect -b {} {} {}'.format(SUB_CMD_PATH,
+                                                                           PROJ,
                                                                            WDIR_BASE,
                                                                            args.sargs,
                                                                            WDIR),
@@ -163,7 +163,7 @@
     log_stdout = os.path.join(WDIR_BASE, 'cca.'+PROJ+'.stdout.log')
     log_stderr = os.path.join(WDIR_BASE, 'cca.'+PROJ+'.stderr.log')
     context = DaemonContext(
-        working_directory=pathsetup.CCA_HOME,
+        working_directory=CCA_HOME,
         umask=0o022,
         stdout=open(log_stdout, 'w+'),
         stderr=open(log_stderr, 'w+')
@@ -182,4 +182,4 @@
 
 
 if __name__ == '__main__':
-    main()
+    pass
diff --git a/cca/scripts/cca_config.py b/python/src/cca/ccautil/cca_config.py
similarity index 98%
rename from cca/scripts/cca_config.py
rename to python/src/cca/ccautil/cca_config.py
index 2173278..ba7a4fd 100644
--- a/cca/scripts/cca_config.py
+++ b/python/src/cca/ccautil/cca_config.py
@@ -23,12 +23,11 @@
 import re
 import logging
 
-import pathsetup
-import Git2
-from factutils.fileid import HashAlgo, VerKind
-import ns
-from Git2 import shorten_sha
-from siteconf import PROJECTS_DIR, PROJECTS_DIR_NAME
+from . import Git2
+from cca.factutil.fileid import HashAlgo, VerKind
+from . import ns
+from .Git2 import shorten_sha
+from .siteconf import PROJECTS_DIR, PROJECTS_DIR_NAME
 #from common import setup_logger
 
 logger = logging.getLogger()
 
diff --git a/cca/scripts/cca_factextractor.py b/python/src/cca/ccautil/cca_factextractor.py
similarity index 93%
rename from cca/scripts/cca_factextractor.py
rename to python/src/cca/ccautil/cca_factextractor.py
index b0ec1be..7a451e5 100644
--- a/cca/scripts/cca_factextractor.py
+++ b/python/src/cca/ccautil/cca_factextractor.py
@@ -23,15 +23,11 @@
 import sys
 import logging
 
-import pathsetup
-import tp
-import project
-import cca_options
-import factextractor
-from factextractor import Enc, HashAlgo
-import factutils.fileid
+from . import tp, project, cca_options, factextractor
+from .factextractor import Enc, HashAlgo
+from .factloader import make_factbase_dir
 
-from factloader import make_factbase_dir
+import cca.factutil.fileid
 
 logger = logging.getLogger()
 
@@ -39,7 +35,7 @@ class TaskPoolBase(tp.base, factextractor.base):
     def __init__(self, proj_id,
                  basedir='.', working_dir='.', clear_cache=True,
-                 factbase_dir=None, 
+                 factbase_dir=None,
                  encoding=Enc.FDLCO, algo=HashAlgo.MD5,
                  fact_out_dir=None):
 
diff --git a/cca/scripts/cca_options.py b/python/src/cca/ccautil/cca_options.py
similarity index 98%
rename from cca/scripts/cca_options.py
rename to python/src/cca/ccautil/cca_options.py
index 3af8b15..1bda817 100644
--- a/cca/scripts/cca_options.py
+++ b/python/src/cca/ccautil/cca_options.py
@@ -25,8 +25,7 @@
 import tempfile
 import logging
 
-import pathsetup
-from common import setup_logger
+from .common import setup_logger
 
 logger = logging.getLogger()
 
@@ -58,7 +57,7 @@
         self.argparser.add_argument('-d', '--debug', action='store_true',
                                     dest='debug', help='enable debug output')
 
-        self.argparser.add_argument('-k', '--keepcache', action='store_true', 
+        self.argparser.add_argument('-k', '--keepcache', action='store_true',
                                     dest='keep_cache', help='keep caches')
 
diff --git a/cca/scripts/common.py b/python/src/cca/ccautil/common.py
similarity index 100%
rename from cca/scripts/common.py
rename to python/src/cca/ccautil/common.py
diff --git a/cca/scripts/core_count.py b/python/src/cca/ccautil/core_count.py
old mode 100755
new mode 100644
similarity index 94%
rename from cca/scripts/core_count.py
rename to python/src/cca/ccautil/core_count.py
index b3d8a0c..a5a0bfd
--- a/cca/scripts/core_count.py
+++ b/python/src/cca/ccautil/core_count.py
@@ -27,6 +27,3 @@
         pass
 
     return out
-
-if __name__ == '__main__':
-    print(core_count())
diff --git a/cca/scripts/diffinfo.py b/python/src/cca/ccautil/diffinfo.py
similarity index 99%
rename from cca/scripts/diffinfo.py
rename to python/src/cca/ccautil/diffinfo.py
index e8d76dc..eaaedb3 100644
--- a/cca/scripts/diffinfo.py
+++ b/python/src/cca/ccautil/diffinfo.py
@@ -21,11 +21,10 @@
 import re
 import os
+import gzip
 import logging
 
-import pathsetup
-from fragment import Fragment
-import gzip
+from .fragment import Fragment
 
 logger = logging.getLogger()
 
@@ -49,7 +48,7 @@
 
     return result
 
-    
+
 named_node_pat_s = '\((?P[0-9]+)\) \(([0-9]+):(?P[0-9]+)\)c:(?P.*) name=\'(?P.*)\'(?P.*)\((?P[0-9]+L.*)\)(?P.*)\((?P.*)\)$'
 pat_s = '\((?P[0-9]+)\) \(([0-9]+):(?P[0-9]+)\)c:(?P.*)\((?P[0-9]+L.*)\)(?P.*)\((?P.*)\)$'
@@ -139,7 +138,7 @@
             else: # maybe compressed
                 info = info + '.gz'
                 opener = gzip.open
-                
+
             try:
                 f = opener(info)
                 map_file_not_found = False
diff --git a/cca/scripts/diffts.py b/python/src/cca/ccautil/diffts.py
old mode 100755
new mode 100644
mode 100644 similarity index 88% rename from cca/scripts/diffts.py rename to python/src/cca/ccautil/diffts.py index 637c337..16024f3 --- a/cca/scripts/diffts.py +++ b/python/src/cca/ccautil/diffts.py @@ -3,7 +3,7 @@ ''' A Diff/TS Driver - Copyright 2012-2020 Codinuum Software Lab + Copyright 2012-2021 Codinuum Software Lab Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,11 +25,10 @@ import hashlib import logging -import pathsetup -import proc -from factextractor import Enc, HashAlgo, compute_hash -import siteconf -from common import setup_logger, normpath +from . import proc +from .factextractor import Enc, HashAlgo, compute_hash +from . import siteconf +from .common import setup_logger ##### @@ -83,22 +82,15 @@ def gen_options(): diffast_prematch = True diffast_usecache = True -diffast_bin = os.path.join(siteconf.CCA_HOME, 'bin') -diffast_cmd = os.path.join(diffast_bin, 'diffast.opt') -patchast_cmd = os.path.join(diffast_bin, 'patchast.opt') - -#diffast_bin = os.path.join(siteconf.CCA_HOME, 'scripts') -#diffast_cmd = os.path.join(diffast_bin, 'diffast.sh') -#patchast_cmd = os.path.join(diffast_bin, 'patchast.sh') +diffast_cmd = 'diffast.opt' +patchast_cmd = 'patchast.opt' diffts_cost_pat = re.compile('total changes\s*: ([0-9]+)') diffts_nmap_pat = re.compile('mapping size\s*: ([0-9]+)') -diffts_delete_pat = re.compile('deletes\(hunks\)\s*: ([0-9]+)\(([0-9]+)\)') -diffts_insert_pat = re.compile('inserts\(hunks\)\s*: ([0-9]+)\(([0-9]+)\)') -diffts_relabel_pat = re.compile('relabels\s*: ([0-9]+)\(orig:([0-9]+)\).*') -diffts_movrel_pat = re.compile('mov\+rels\s*: ([0-9]+)\(orig:([0-9]+)\)') -diffts_move_pat = re.compile('moves\(hunks\)\s*: ([0-9]+)\(([0-9]+)\)') +diffts_insert_pat = re.compile('inserts\s*: ([0-9]+)') +diffts_delete_pat = re.compile('deletes\s*: ([0-9]+)') +diffts_relabel_pat = re.compile('relabels\s*: ([0-9]+)') diffts_nnodes1_pat = re.compile('nnodes1\s*: ([0-9]+)') diffts_nnodes2_pat = re.compile('nnodes2\s*: ([0-9]+)') @@ -113,7 +105,7 @@ def gen_options(): info_file_name = 'info' -default_cache_dir_base = os.path.join(pathsetup.HOME, '.cca', 'cache') +default_cache_dir_base = os.path.join(os.environ['HOME'], '.cca', 'cache') @@ -161,10 +153,10 @@ def do_cmd(cmd): stat = proc.system(cmd) if stat == 0: break - + time.sleep(1) logger.info('retrying...({}) cmd="{}"'.format(i, cmd)) - + ##### @@ -214,6 +206,7 @@ def get_cache_dir1_(diff_cmd, a, quiet=False, algo=HashAlgo.MD5 ): + cache_opt = '' if cache_dir_base: cache_opt = ' -cache {}'.format(cache_dir_base) @@ -224,7 +217,7 @@ def get_cache_dir1_(diff_cmd, a, opts = cache_opt + hash_opt - cmd = '{} -parseonly{} -getcache {}'.format(diff_cmd, opts, normpath(a)) + cmd = '{} -parseonly{} -getcache {}'.format(diff_cmd, opts, a) if not quiet: logger.info('cmd: "{}"'.format(cmd)) @@ -245,6 +238,7 @@ def get_cache_dir(diff_cmd, a1, a2, quiet=False, algo=HashAlgo.MD5 ): + cache_opt = '' if cache_dir_base: cache_opt = ' -cache {}'.format(cache_dir_base) @@ -255,7 +249,7 @@ def get_cache_dir(diff_cmd, a1, a2, opts = cache_opt + hash_opt - cmd = '{}{} -getcache {} {}'.format(diff_cmd, opts, normpath(a1), normpath(a2)) + cmd = '{}{} -getcache {} {}'.format(diff_cmd, opts, a1, a2) if not quiet: logger.info('cmd: "{}"'.format(cmd)) @@ -296,7 +290,7 @@ def read_file(r, name_pat_list, stat_paths, retry_count=RETRY_COUNT): set_value(r, name, pat, l) f.close() break - + except IOError as e: logger.warning(str(e)) 
logger.info('retrying...({})'.format(count)) @@ -339,13 +333,11 @@ def read_dir_info_file(stat_paths, retry_count=RETRY_COUNT): def read_file_diff_stat_file(stat_paths, retry_count=RETRY_COUNT): - r = { 'cost' : 0, - 'nmappings' : 0, + r = { 'cost' : 0, + 'nmappings' : 0, 'ninserts' : 0, 'ndeletes' : 0, 'nrelabels' : 0, - 'nmovrels' : 0, - 'nmoves' : 0, 'nnodes1' : 0, 'nnodes2' : 0, } @@ -355,8 +347,6 @@ def read_file_diff_stat_file(stat_paths, retry_count=RETRY_COUNT): ('ninserts', diffts_insert_pat), ('ndeletes', diffts_delete_pat), ('nrelabels', diffts_relabel_pat), - ('nmovrels', diffts_movrel_pat), - ('nmoves', diffts_move_pat), ('nnodes1', diffts_nnodes1_pat), ('nnodes2', diffts_nnodes2_pat), ] @@ -422,7 +412,7 @@ def diffts(diff_cmd, file1, file2, if load_fact or stat_paths==[] or not usecache: - + logger.info('diff_cmd: {}'.format(diff_cmd)) prep_opt = '' @@ -468,7 +458,7 @@ def diffts(diff_cmd, file1, file2, if fact_proj_roots: fact_opt += ' {}'.format(get_fact_proj_roots_opt(fact_proj_roots)) - + if fact_into_virtuoso: fact_opt += ' -fact:into-virtuoso {}'.format(fact_into_virtuoso) @@ -517,20 +507,18 @@ def diffts(diff_cmd, file1, file2, cmd = ''.join((diff_cmd, cache_opt, cachedir_opt, prep_opt, prem_opt, fact_opt, dumpccs_opt, check_opt, other_opts)) - cmd += ' {} {}'.format(normpath(file1), normpath(file2)) + cmd += ' {} {}'.format(file1, file2) logger.info('cmd="{}"'.format(cmd)) proc.system(cmd, quiet=quiet) - r = { - 'cost' : 0, - 'nmappings' : 0, + r = { + 'cost' : 0, + 'nmappings' : 0, 'ninserts' : 0, 'ndeletes' : 0, 'nrelabels' : 0, - 'nmovrels' : 0, - 'nmoves' : 0, # 'exitcode' : 0, } @@ -554,14 +542,14 @@ def diffast_get_cache_dir(file1, file2, **options): return get_cache_dir(diffast_cmd, file1, file2, **options) def dump_unparsed(path, to_path, quiet=False): - cmd = '{} -clearcache -parseonly -dump:src:out {} {}'.format(diffast_cmd, normpath(to_path), normpath(path)) + cmd = '{} -clearcache -parseonly -dump:src:out {} {}'.format(diffast_cmd, to_path, path) if not quiet: logger.info('cmd="{}"'.format(cmd)) return proc.system(cmd, quiet=quiet) def patchast(path, delta_path, out_path, quiet=False): - cmd = '{} -o {} {} {}'.format(patchast_cmd, normpath(out_path), normpath(path), normpath(delta_path)) + cmd = '{} -o {} {} {}'.format(patchast_cmd, out_path, path, delta_path) if not quiet: logger.info('cmd="{}"'.format(cmd)) @@ -571,18 +559,18 @@ def patchast(path, delta_path, out_path, quiet=False): def main(): from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter - argparser = ArgumentParser(description='Diff/TS driver', + argparser = ArgumentParser(description='A Diff/TS driver', formatter_class=ArgumentDefaultsHelpFormatter) - argparser.add_argument('file1', help='original file') - argparser.add_argument('file2', help='modified file') + argparser.add_argument('file1', type=str) + argparser.add_argument('file2', type=str) argparser.add_argument('-d', '--debug', action='store_true', dest='debug', default=False, help='enable debug output') - argparser.add_argument('-p', '--nopreprune', action='store_false', dest='preprune', + argparser.add_argument('--nopreprune', action='store_false', dest='preprune', default=True, help='disable prepruning') - argparser.add_argument('-m', '--noprematch', action='store_false', dest='prematch', + argparser.add_argument('--noprematch', action='store_false', dest='prematch', default=True, help='disable prematching') argparser.add_argument('-c', '--cachebase', dest='cachebase', metavar='PATH', default=None, help='set cache base 
to PATH') @@ -596,10 +584,6 @@ def main(): setup_logger(logger, log_level) - #mode = args[0] - #f1 = args[1] - #f2 = args[2] - mode = 'ast' f1 = args.file1 f2 = args.file2 @@ -617,8 +601,8 @@ def main(): cost = r['cost'] nmappings = r['nmappings'] cmr = float(cost) / float(nmappings) - print(r) - print('CMR:{}'.format(cmr)) + + print('cost: {} nmappings: {} CMR:{}'.format(cost, nmappings, cmr)) else: logger.error('failed') diff --git a/cca/scripts/factextractor.py b/python/src/cca/ccautil/factextractor.py similarity index 81% rename from cca/scripts/factextractor.py rename to python/src/cca/ccautil/factextractor.py index f5391e8..7990dc1 100644 --- a/cca/scripts/factextractor.py +++ b/python/src/cca/ccautil/factextractor.py @@ -3,7 +3,7 @@ ''' factextractor.py - Copyright 2012-2020 Codinuum Software Lab + Copyright 2012-2021 Codinuum Software Lab Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,21 +20,20 @@ import logging -import pathsetup -import project +from . import project +from .ns import XSD_NS, RDF_NS -from factutils.const import SEP, SUB_SEP -from factutils.fileid import HashAlgo, FidEnc, Enc, FileDigest, FileDesc -from factutils.fileid import compute_hash, Version, ProjRelPath, VerKind -from factutils.rdf import Resource, Predicate, Literal, mkuri -from ns import XSD_NS, RDF_NS +from cca.factutil.const import SEP, SUB_SEP +from cca.factutil.fileid import HashAlgo, FidEnc, Enc, FileDigest, FileDesc +from cca.factutil.fileid import compute_hash, Version, ProjRelPath, VerKind +from cca.factutil.rdf import Resource, Predicate, Literal, make_literal logger = logging.getLogger() -DT_BOOLEAN = mkuri(XSD_NS + 'boolean') -DT_DOUBLE = mkuri(XSD_NS + 'double') -DT_NN_INT = mkuri(XSD_NS + 'nonNegativeInteger') -DT_INT = mkuri(XSD_NS + 'integer') +DT_BOOLEAN = XSD_NS + 'boolean' +DT_DOUBLE = XSD_NS + 'double' +DT_NN_INT = XSD_NS + 'nonNegativeInteger' +DT_INT = XSD_NS + 'integer' P_TYPE = Predicate(ns=RDF_NS, lname='type') diff --git a/cca/scripts/factloader.py b/python/src/cca/ccautil/factloader.py old mode 100755 new mode 100644 similarity index 97% rename from cca/scripts/factloader.py rename to python/src/cca/ccautil/factloader.py index 7233d39..b337658 --- a/cca/scripts/factloader.py +++ b/python/src/cca/ccautil/factloader.py @@ -4,7 +4,7 @@ ''' factloader.py - Copyright 2012-2020 Codinuum Software Lab + Copyright 2012-2021 Codinuum Software Lab Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,8 +26,7 @@ import shutil import logging -import pathsetup -import diffts +from . 
import diffts logger = logging.getLogger() @@ -36,7 +35,6 @@ def make_factbase_dir(basedir, proj_id): dname = 'factbase-' + proj_id -# path = os.path.join(pathsetup.CCA_HOME, dname) path = os.path.realpath(os.path.join(basedir, dname)) return path @@ -107,7 +105,7 @@ def load(self): logger.info('"{}" removed'.format(current)) self._current_temp_file = None - + def rotate(self): self._rotate_count += 1 self.load() @@ -159,8 +157,7 @@ def load(loader, cache_dir_base, temp_file_size=DEFAULT_TEMP_FILE_SIZE): scan_dir(cache_dir_base, diffts.cfgfact_file_name, merger3.merge) merger3.load() - -if __name__ == '__main__': +def main(): from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter parser = ArgumentParser(description='load Diff/TS generated fact', @@ -168,7 +165,7 @@ def load(loader, cache_dir_base, temp_file_size=DEFAULT_TEMP_FILE_SIZE): parser.add_argument('cache_dir_base', type=str, help='cache dir base') parser.add_argument('out_dir', type=str, help='output dir') - parser.add_argument('--temp-file-size', dest='temp_file_size', + parser.add_argument('--temp-file-size', dest='temp_file_size', default=DEFAULT_TEMP_FILE_SIZE, metavar='N', type=int, help='maximum temp file size (in bytes) for RDF loader') @@ -178,3 +175,7 @@ def load(loader, cache_dir_base, temp_file_size=DEFAULT_TEMP_FILE_SIZE): load(loader, args.cache_dir_base, args.temp_file_size) + + +if __name__ == '__main__': + main() diff --git a/python/src/cca/ccautil/find_change_patterns.py b/python/src/cca/ccautil/find_change_patterns.py new file mode 100644 index 0000000..44899bf --- /dev/null +++ b/python/src/cca/ccautil/find_change_patterns.py @@ -0,0 +1,2536 @@ +#!/usr/bin/env python3 + + +''' + find_change_patterns.py + + Copyright 2012-2020 Codinuum Software Lab + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +''' + +import os.path +import re +import time +import sys +import random +import datetime +import locale +import hashlib +from urllib.request import pathname2url +from datetime import datetime, timedelta, tzinfo +import json +from xml.sax.saxutils import escape +import logging + +from .siteconf import CCA_HOME +from . import cca_config as config +from . 
import project, sparql +from .factextractor import compo_join, P_TYPE, make_literal +from .ns import FB_NS, CHG_NS, NS_TBL, PREFIX_TBL +from .sparql import get_localname +from .virtuoso import VIRTUOSO_PW, VIRTUOSO_PORT +from .common import setup_logger + +from cca import factutil +from cca.factutil.entity import SourceCodeEntity +from cca.factutil.fact import Fact +from cca.factutil.rdf import Resource, Predicate, Literal +import cca.factutil.range + + +logger = logging.getLogger() + +DEFAULT_FACT_OUTPUT_DIR = '/opt/virtuoso/tmp/chgpat/' + + +ENT_PAIR_NS = NS_TBL['entpair_ns'] +CHGPAT_INST_NS = NS_TBL['chgpat_ns'] + +LOCALE = 'en_US' + +KEY_VAR_NAME = 'KEY' +GROUP_VAR_NAME = 'GROUP' +NAME_SEPARATOR = ';' +GROUP_SEPARATOR = '/' + +random.seed('DIFFTS-FIND-CHANGE-PATTERNS') + +TD_ZERO = timedelta(0) + +class JST(tzinfo): + def utcoffset(self, dt): + return timedelta(hours=9) + def dst(self, dt): + return TD_ZERO + def tzname(self, dt): + return 'JST' + +class TZ(tzinfo): + def __init__(self, ofs): + self.__utcoffset = timedelta(minutes=ofs) + def utcoffset(self, dt): + return self.__utcoffset + def dst(self, dt): + return TD_ZERO + +escape_tbl = { + '"' : '&quot;', + "'" : '&#39;', + '<' : '&lt;', + '>' : '&gt;', +} + +def html_escape(s): + return escape(s, escape_tbl) + +def jsondumps(obj): + return html_escape(json.dumps(obj, separators=(',',':'))) + +ENUM_QUERY_DIR = os.path.join(CCA_HOME, 'queries', 'enum') +Q_ENUM_MAPPING_CHG = 'enumerate_mapping_changes.rq' +Q_ENUM_ADDITION = 'enumerate_additions.rq' +Q_ENUM_REMOVAL = 'enumerate_removals.rq' + +#Q_ENUM_FILE = 'enumerate_files.rq' + + +HTML_HEAD = ''' +%(proj_id)s + + + + + + +
+''' + +HTML_TAIL = '''
+''' + +FRAME_PAGE = ''' +%(proj_id)s + + + +

%(proj_id)s

+ + + + +''' + +HTML_ITEM = '''
  • +[%(count)d] %(rname)s
    +%(others)s +%(applet)s +
  • +''' + +APPLET = ''' + +
    +%(var0)s -> %(var1)s (%(startl0)dL,%(startc0)dC - %(startl1)dL,%(startc1)dC) + +
    + + + + + + + + + + + + + + + + + + + + + +
    +''' + +HTML_TBL = ''' + %(proj_id)s + %(nversions)s + %(sloc)s(%(avg_sloc)s) + %(ntriples)s + %(npatterns)s + +''' + + +TRIPLE_COUNT_QUERY = ''' +SELECT DISTINCT (COUNT(*) AS ?count) +FROM <%(graph)s> +WHERE { + ?s ?p ?o . +} +''' + + +VER_QUERY = ''' +PREFIX ver: <%(ver_ns)s> +SELECT DISTINCT ?v +FROM <%%(graph)s> +WHERE { + <%%(fent)s> ver:version ?v . +} +''' % NS_TBL + +VER_PAIR_QUERY = ''' +PREFIX ver: <%(ver_ns)s> +SELECT DISTINCT ?v ?v_ +FROM <%%(graph)s> +WHERE { + <%%(fent0)s> ver:version ?v . + <%%(fent1)s> ver:version ?v_ . + ?v ver:next ?v_ . +} +''' % NS_TBL + +SRCTREE_PAIR_QUERY = ''' +PREFIX chg: <%(chg_ns)s> +PREFIX ver: <%(ver_ns)s> +SELECT DISTINCT ?srcpair +FROM <%%(graph)s> +WHERE { + ?srcpair a chg:SourceTreePair ; + chg:originalSourceTree/ver:version <%%(ver0)s> ; + chg:modifiedSourceTree/ver:version <%%(ver1)s> . +} +''' % NS_TBL + +SRCTREE_QUERY = ''' +PREFIX src: <%(src_ns)s> +PREFIX ver: <%(ver_ns)s> +SELECT DISTINCT ?src +FROM <%%(graph)s> +WHERE { + ?src a src:SourceTree ; + ver:version <%%(ver)s> . +} +''' % NS_TBL + +PATH_QUERY = ''' +PREFIX src: <%(src_ns)s> +PREFIX ver: <%(ver_ns)s> +PREFIX gsrc: <%(gsrc_ns)s> +SELECT DISTINCT ?loc +FROM <%%(graph)s> +WHERE { + ?file a src:File ; + ver:version ?ver ; + src:location ?loc . + + ?g gsrc:location ?file ; + gsrc:location ?loc ; + ver:version ?ver . + + FILTER (?file = <%%(fent)s>) + FILTER (?ver = <%%(ver)s>) +} +''' % NS_TBL + +ALL_PATH_QUERY = ''' +PREFIX src: <%(src_ns)s> +SELECT DISTINCT ?loc +FROM <%%(graph)s> +WHERE { + ?file a src:File ; + src:location ?loc . + + FILTER (?file = <%%(fent)s>) +} +''' % NS_TBL + +GIT_COMMIT_INFO_QUERY = ''' +PREFIX git: <%(git_ns)s> +SELECT DISTINCT ?cmtr ?date ?mes ?ofs +FROM <%%(graph)s> +WHERE { + <%%(ver)s> git:message ?mes ; + git:committer ?cmtr ; + git:commitDate ?date ; + git:commitDateOffset ?ofs . +} +''' % NS_TBL + +ENUM_FILES_QUERY = ''' +PREFIX src: <%(src_ns)s> +SELECT DISTINCT ?ent +FROM <%%(graph)s> +WHERE { + ?ent a src:File . 
+} +''' % NS_TBL + + +class QueryNotFound(Exception): + def __init__(self, q): + self.query = q + + +locale.setlocale(locale.LC_ALL, LOCALE) +def format_num(n): + s = '-' + if n: + s = locale.format('%d', n, grouping=True) + return s + + +def capitalize(s): + if s.isupper(): + return s + else: + return s.capitalize() + +def fname_to_title(fname): + (name, ext) = os.path.splitext(fname) + ws = name.split('_') + title = ' '.join([capitalize(w) for w in ws]) + return title + + +def mkfilepair(fent1, fent2): + id1 = fent1.get_local_name() + id2 = fent2.get_local_name() + if id1 == None: + id1 = 'null' + if id2 == None: + id2 = 'null' + uri = ENT_PAIR_NS + compo_join(id1, id2) + logger.debug('uri=%s' % uri) + return Resource(uri=uri) + +def mkchgpat(ent1, ent2, _chg): + chg = _chg.lower().replace('"', '').replace(' ', '_') + cid = compo_join(ent1.get_local_name(), + ent2.get_local_name(), + chg) + + uri = CHGPAT_INST_NS + cid + logger.debug('uri=%s' % uri) + return Resource(uri=uri) + + +def make_literals(str_set): + l = [] + for s in str_set: + for s0 in s.split(NAME_SEPARATOR): + l.append(make_literal(s0)) + return l + +def str_set_to_str(ss): + return NAME_SEPARATOR.join(sorted([html_escape(s) for s in ss])) + +zero_metric = {'copts':'', 'value':0} + + +class Predicates(object): + def __init__(self): + self.c_filepair = Resource(uri=CHG_NS+'FilePair') + self.p_orig_file = Predicate(CHG_NS, 'originalFile') + self.p_modi_file = Predicate(CHG_NS, 'modifiedFile') + + self.chgpat_ns = None + self.p_filepair = None + self.p_chgpat = None + + + +class FactExtractor(object): + def __init__(self, graph, conf, method='odbc', pw=VIRTUOSO_PW, port=VIRTUOSO_PORT): + self._graph = graph + self._conf = conf + self.metrics = "" + self.metrics_reverse = False + + self._sparql = sparql.get_driver(method, pw=pw, port=port) + + + def get_metric(self, x, y): + return zero_metric + + def extract_srctree_fact(self, fact, ent0, ent1, ver0, ver1): + pass + + def extract_file_fact(self, fact, ent0, ent1, ver0, ver1): + pass + + def get_other_info(self, ver0, ver1): + return {} + + def get_git_commit_info(self, ver0, ver1): + info = {} + if self._conf.is_vkind_gitrev(): + q = GIT_COMMIT_INFO_QUERY % { 'graph' : self._graph, + 'ver' : ver1 } + + for qvs, row in self._sparql.query(q): + info['Committer'] = row['cmtr'] + info['Message'] = '%s' % row['mes'] + try: + ts = int(row['date']) + ofs = int(row['ofs']) + dt = datetime.fromtimestamp(ts, TZ(ofs)) + + jst_dt = dt.astimezone(JST()) + + #info['Date'] = dt.isoformat(' ') + info['Date'] = jst_dt.isoformat(' ') + except Exception as e: + logger.warning(str(e)) + + return info + + + +class Finder(object): + def __init__(self, qdir, queries, base_dir, proj_id, + predicate_tbl=None, + limit=None, + lang=None, + extra_fact_extractor=None, + conf=None, + method='odbc', + pw=VIRTUOSO_PW, + port=VIRTUOSO_PORT): + + self._query_dir = qdir + self._queries = queries + self._base_dir = base_dir + self._proj_id = proj_id + self._graph_uri = FB_NS + proj_id + self._sparql = sparql.get_driver(method, pw=pw, port=port) + self._result = {} # lang -> (ver * ver) -> (fid * fid) -> change_pat_name -> (ent * ent) list + self._cache_base_dir = os.path.join(base_dir, 'work.diffts', proj_id+'.fact') + if conf == None: + self._conf = project.get_conf(proj_id) + else: + self._conf = conf + self._hash_algo = self._conf.hash_algo + + self._vi_tbl = {} + for i in range(self._conf.nversions): + self._vi_tbl[self._conf.versions[i]] = i + + if limit != None and limit > 0: + self._limit = limit + else: 
+ self._limit = None + + self._lang = lang + + self._latest_version_cache = {} + self._earliest_version_cache = {} + self._version_pair_cache = {} + self._version_pairs_cache = {} + + self._extra_ent_pairs_tbl = {} # ent * ent -> var * var -> (ent * ent) list + + self._others_tbl = {} # ent * ent -> var -> val + + self._ess_tbl_rm = {} # ent * ent -> removed -> ent list + self._ess_tbl_ad = {} # ent * ent -> added -> ent list + self._ess_tbl_mp = {} # ent * ent -> mapped -> ent list + + self._prim_chg_tbl = {} # ver * ver -> fid * fid -> (ent * ent * chg) list + self._prim_chg_count_tbl = {} # ver * ver -> fid * fid -> int + + self._coverage_cache_f = {} + self._coverage_cache_v = {} + + self._fent_tbl = {} # fid -> fent + + self._src_path_tbl = {} # (fid * verURI) -> path + + self._predicate_tbl = predicate_tbl + + self._extra_fact_extractor = None + if extra_fact_extractor: + self._extra_fact_extractor = extra_fact_extractor(self._graph_uri, self._conf, + method=method, pw=pw, port=port) + + self._metric_tbl = {} + + self._change_enumeration = False + + def get_ver_idx(self, v): + return self._vi_tbl[v] + + def get_ver_name(self, verURI): + return self._conf.vers[self.get_vindex(verURI)] + + def get_ver_dir(self, verURI): + return self._conf.get_ver_dir(self._conf.versions[self.get_vindex(verURI)]) + + def get_ver_dir_r(self, verURI): + v = self._conf.versions[self.get_vindex(verURI)] + vdn = self._conf.get_ver_dir_name(v) + return os.path.join(config.PROJECTS_DIR_NAME, self._proj_id, vdn) + + def get_abbrev_ver(self, lver): + av = lver + try: + i = self.get_ver_idx(lver) + av = self._conf.vers[i] + except: + #logger.warning('cannot get abbreviation for "%s"' % lver) + pass + return av + + def disable_change_enumeration(self): + self._change_enumeration = False + + def enable_change_enumeration(self): + self._change_enumeration = True + + + def get_predicates(self, lang): + ps = None + if self._predicate_tbl: + try: + ps = self._predicate_tbl[lang] + except KeyError: + pass + return ps + + def extract_fact(self, lang): + ps = self.get_predicates(lang) + b = ps != None + return b + + def mkchgpatpred(self, lang, s): + ps = self.get_predicates(lang) + p = Predicate(ps.chgpat_ns, s) + return p + + def mkchgtype(self, lang, _chg): + chg = _chg + qst = chg.find('"') + qed = chg.rfind('"') + if qst >= 0 and qed >= 0: + chg = _chg[qed+1:] + + tn = ''.join([s.capitalize() for s in chg.split(' ')]) + + ps = self.get_predicates(lang) + uri = ps.chgpat_ns + tn + + r = Resource(uri=uri) + + return r + + + def get_query(self, lang, name, force_per_ver=False): + query = None + per_ver = False + path = None + if lang: + path = os.path.join(self._query_dir, lang, name) + else: + path = os.path.join(ENUM_QUERY_DIR, name) + + try: + f = open(path, 'r') + q = f.read() + + if force_per_ver: + logger.debug('forcing the query to be executed for each version pairs') + q = re.sub(r'\?ver', '?VER', q) + + _query = re.sub(r'WHERE', ('WHERE {\nGRAPH <%s>' % self._graph_uri), q, count=1, flags=re.IGNORECASE).rstrip('\n ;') + query = '}}'.join(_query.rsplit('}', 1)) + + if q.find('?VER') > -1 and q.find('?VER_') > -1: + per_ver = True + + f.close() + except Exception as e: + raise QueryNotFound(path) + #logger.error(str(e)) + + return query, per_ver + + def add_ver_filter(self, q, ver0, ver1): + #q0 = q.replace('?VER_', '<%s>' % ver1) + #query = q0.replace('?VER', '<%s>' % ver0) + + query = q.replace('{', '{\nFILTER (?VER = <%s> && ?VER_ = <%s>)' % (ver0, ver1), 1) + + # query = 
re.sub(r'(\?VER\s+ver:version\s+\?VER_\s*\.)', + # r'\1\nFILTER (?VER = <%s> && ?VER_ = <%s>)\n' % (ver0, ver1), + # q) + return query + + def get_version_pair(self, ent0, ent1): + fent0 = ent0 + fent1 = ent1 + + if not ent0.is_file(): + fent0 = SourceCodeEntity(file_id=ent0.get_file_id()) + + if not ent1.is_file(): + fent1 = SourceCodeEntity(file_id=ent1.get_file_id()) + + try: + return self._version_pair_cache[(fent0, fent1)] + except KeyError: + pass + + q = VER_PAIR_QUERY % { 'fent0' : fent0.get_uri(), + 'fent1' : fent1.get_uri(), + 'graph' : self._graph_uri, + } + pair = None + idx = sys.maxsize + + for qvs, row in self._sparql.query(q): + v0 = row['v'] + v = get_localname(v0) + try: + i = self.get_ver_idx(v) + if i < idx: + idx = i + pair = (v0, row['v_']) + except: + logger.error('cannot get index for version "%s"' % v) + + if pair == None: + logger.warning('cannot get version pair for "%s" and "%s"' % (ent0, ent1)) + else: + self._version_pair_cache[(fent0, fent1)] = pair + + return pair + + def get_version_pairs(self, ent0, ent1): + fent0 = ent0 + fent1 = ent1 + + if not ent0.is_file(): + fent0 = SourceCodeEntity(file_id=ent0.get_file_id()) + + if not ent1.is_file(): + fent1 = SourceCodeEntity(file_id=ent1.get_file_id()) + + try: + return self._version_pairs_cache[(fent0, fent1)] + except KeyError: + pass + + q = VER_PAIR_QUERY % { 'fent0' : fent0.get_uri(), + 'fent1' : fent1.get_uri(), + 'graph' : self._graph_uri, + } + + pairs = [] + + for qvs, row in self._sparql.query(q): + pairs.append((row['v'], row['v_'])) + + if pairs == []: + logger.warning('cannot get version pair for "%s" and "%s"' % (ent0, ent1)) + else: + self._version_pairs_cache[(fent0, fent1)] = pairs + + return pairs + + def get_srctree_pair_ent(self, ver0, ver1): + q = SRCTREE_PAIR_QUERY % { 'ver0' : ver0, + 'ver1' : ver1, + 'graph' : self._graph_uri } + + l = [] + + for qvs, row in self._sparql.query(q): + r = Resource(uri=row['srcpair']) + l.append(r) + + n = len(l) + if n == 0: + logger.warning('SourceTreePair not found for "%s" and "%s"' % (ver0, ver1)) + elif n == 1: + return l[0] + else: + logger.warning('multiple SourceTreePair found for "%s" and "%s":\n%s' % (ver0, ver1, '\n'.join([str(x) for x in l]))) + return l[0] + + def get_srctree_ent(self, ver): + q = SRCTREE_QUERY % { 'ver' : ver, + 'graph' : self._graph_uri } + + l = [] + + for qvs, row in self._sparql.query(q): + r = Resource(uri=row['src']) + l.append(r) + + n = len(l) + if n == 0: + logger.warning('SourceTree not found for "%s"' % ver) + elif n == 1: + return l[0] + else: + logger.warning('multiple SourceTree found for "%s":\n%s' % (ver, '\n'.join([str(x) for x in l]))) + return l[0] + + def count_triples(self): + q = TRIPLE_COUNT_QUERY % { 'graph' : self._graph_uri } + count = None + for qvs, row in self._sparql.query(q): + count = row['count'] + return count + + def _get_version(self, ent, direction): + fent = SourceCodeEntity(file_id=ent.get_file_id()) + + try: + if direction >= 0: #latest + return self._latest_version_cache[fent] + elif direction < 0: #earliest + return self._earliest_version_cache[fent] + except KeyError: + pass + + q = VER_QUERY % { 'fent' : fent.get_uri(), 'graph' : self._graph_uri } + + ver = None + idx = None + for qvs, row in self._sparql.query(q): + v = get_localname(row['v']) + try: + i = self.get_ver_idx(v) + except: + logger.error('cannot get index for version "%s"' % v) + continue + + if ver == None: + ver = row['v'] + idx = i + elif direction >= 0: # latest + if i > idx: + ver = row['v'] + idx = i + else: + pass + elif direction < 0: # earliest + if i < 
idx: + ver = row['v'] + idx = i + else: + pass + + + if ver == None: + logger.warning('cannot get version of "%s"' % ent) + else: + if direction >= 0: #latest + self._latest_version_cache[fent] = ver + elif direction < 0: #earliest + self._earliest_version_cache[fent] = ver + + return ver + + def get_latest_version(self, ent): + return self._get_version(ent, 1) + + def get_earliest_version(self, ent): + return self._get_version(ent, -1) + + def get_cache_path(self, fd0, fd1): + p = os.path.join(self._cache_base_dir, fd0[0:2], fd0+'-'+fd1) + return p + + def get_diff_url(self, fid0, fid1): + fd0 = fid0.get_value() + fd1 = fid1.get_value() + p = os.path.join(self.get_cache_path(fd0, fd1), 'diff.json') + u = pathname2url(p) + return u + + def path_ok(self, path): + cond = True + if self._conf.include: + cond = any(path.startswith(x) for x in self._conf.include) + #logger.warning('%s --> %s' % (path, cond)) + return cond + + def get_source_path(self, fid, verURI): + path = None + + if fid == None: + return path + + try: + path = self._src_path_tbl[(fid, verURI)] + return path + except KeyError: + pass + + fent = SourceCodeEntity(file_id=fid) + q = PATH_QUERY % { 'fent' : fent.get_uri(), + 'ver' : verURI, + 'graph' : self._graph_uri, + } + locs = [] + for qvs, row in self._sparql.query(q): + loc = row['loc'] + if self.path_ok(loc): + locs.append(loc) + + n = len(locs) + if n == 1: + path = locs[0] + elif n > 1: + logger.warning('multiple paths found for "%s"@"%s":\n%s' % (fid.encode(), + get_localname(verURI), + '\n'.join(locs))) + path = locs[0] + else: + q = ALL_PATH_QUERY % { 'fent' : fent.get_uri(), + 'graph' : self._graph_uri, + } + locs = [] + for qvs, row in self._sparql.query(q): + loc = row['loc'] + if self.path_ok(loc): + locs.append(loc) + + n = len(locs) + if n == 1: + path = locs[0] + else: + if n > 1: + logger.warning('multiple paths found for "%s"@"%s":\n%s' % (fid.encode(), + get_localname(verURI), + '\n'.join(locs))) + + fd = fid.get_value() + vkey = self._conf.get_vkey(self.get_vindex(verURI)) + src = os.path.join(self._cache_base_dir, fd[0:2], fd, 'source.'+vkey) + try: + f = open(src, 'r') + if self._conf.is_vkind_gitrev(): + path = f.readline().rstrip() + else: + p = os.path.realpath(f.readline().rstrip()) + path = p.replace(self.get_ver_dir(verURI)+os.sep, '') + + except Exception as e: + #logger.warning(str(e)) + logger.debug(str(e)) + + logger.debug('path="%s"' % path) + if path == None and not self._conf.include: + logger.warning('path not found for "%s"@"%s"' % (fid.encode(), get_localname(verURI))) + + self._src_path_tbl[(fid, verURI)] = path + + return path + + def check_path(self, ent0, ent1, ver0, ver1, per_ver): + b = False + if self._conf.include: + ver_pairs = [(ver0, ver1)] + if not per_ver: + ver_pairs = self.get_version_pairs(ent0, ent1) + + if ver_pairs: + for (v0, v1) in ver_pairs: + path0 = self.get_source_path(ent0.get_file_id(), v0) + path1 = self.get_source_path(ent1.get_file_id(), v1) + if path0 or path1: + b = True + break + + else: + b = True + + return b + + + def get_source_url(self, fid, verURI, path=None): + url = None + + if fid == None: + return url + + p = path + if p == None: + p = self.get_source_path(fid, verURI) + + if self._conf.is_vkind_gitrev(): + fd = fid.get_value() + if self._conf.gitweb_proj_id: + url = '/gitweb/?p=%s;a=blob_plain;f=%s;h=%s;hb=%s' % (self._conf.gitweb_proj_id, + p, + fd, + get_localname(verURI)) + else: + url = pathname2url(p) + logger.warning('cannot get source url: fd="%s" verURI="%s"' % (fd, verURI)) + else: + 
try: + if p: + url = os.path.join(self.get_ver_dir_r(verURI), p) + except Exception as e: + logger.warning('cannot get source URL for fid=%s verURL=%s path=%s: %s' % (fid, verURI, path, e)) + return url + + def get_hilit(self, r): + return '%d:%d' % (r.get_start_offset(), r.get_end_offset()) + + def setup_metric_tbl(self, vpairs): + self._metric_tbl = {} + if self._extra_fact_extractor: + for (x, y) in vpairs: + m = self._extra_fact_extractor.get_metric(get_localname(x), get_localname(y)) + if m: + self._metric_tbl[(x, y)] = m + else: + self._metric_tbl[(x, y)] = zero_metric + + no_metric = True + for (k, m) in self._metric_tbl.items(): + if m != zero_metric: + no_metric = False + break + + if no_metric: + self._metric_tbl = {} + + + def sort_ver_pairs(self, vpairs): + if vpairs: + if self._metric_tbl: + r = self._extra_fact_extractor.metrics_reverse + vpairs.sort(key=lambda p: self._metric_tbl[p]['value'], reverse=r) + + else: + uris = self._conf.versionURIs + vpairs.sort(key=lambda x_y: (uris.index(x_y[0]), uris.index(x_y[1]))) + #vpairs.sort(key=lambda (x,y): self.get_ver_idx(x)) + + def _show_stat(self, f, count_tbl, cov): + change_pats = count_tbl.keys() + total = 0 + for r in change_pats: + total += count_tbl[r] + nchange_pats = len(change_pats) + + count_items = list(count_tbl.items()) + count_items.sort(key=lambda x: x[1], reverse=True) + + f.write('Found: %d change patterns (%d instances)
    \n' % (nchange_pats, total)) + + if cov: + f.write('Coverage: %d/%d=%3.2f
    \n' % cov) + + f.write('Instances:
    \n') + + f.write('\n') + for (change_pat, c) in count_items: + p = float(c) / float(total) * 100.0 + f.write('' % (change_pat, c)) + f.write('\n' % p) + f.write('
    %s%d(%3.2f%%)
    \n') + + + def make_applet_html(self, vpid, form_id, var0, var1, ent0, ent1, ver0, ver1, desc='{}', ess0='[]', ess1='[]', ess01='[]'): + fid0 = ent0.get_file_id() + fid1 = ent1.get_file_id() + + path0 = self.get_source_path(fid0, ver0) + path1 = self.get_source_path(fid1, ver1) + + if path0 == None and path1 == None: + return '' + + url0 = self.get_source_url(fid0, ver0, path=path0) + url1 = self.get_source_url(fid1, ver1, path=path1) + + if url0 == None or url1 == None: + return '' + + else: + aver0 = self.get_abbrev_ver(get_localname(ver0)) + aver1 = self.get_abbrev_ver(get_localname(ver1)) + + data = { + 'proj_id' : self._proj_id, + # 'cache' : urlname2url(self._cache_base_dir), + # 'diff' : self.get_diff_url(fid0, fid1), + 'algo' : self._hash_algo, + 'var0' : var0, + 'var1' : var1, + 'ver0' : aver0, + 'ver1' : aver1, + 'path0' : path0, + 'path1' : path1, + 'src0' : url0, + 'src1' : url1, + 'vpid' : vpid, + 'form_id' : form_id, + } + + r0 = ent0.get_range() + r1 = ent1.get_range() + + line0 = r0.get_start_line() + line1 = r1.get_start_line() + + col0 = r0.get_start_col() + col1 = r1.get_start_col() + + data['startl0'] = line0 + data['startl1'] = line1 + + data['startc0'] = col0 + data['startc1'] = col1 + + data['hilit0'] = self.get_hilit(r0) + data['hilit1'] = self.get_hilit(r1) + + data['desc'] = desc + data['ess0'] = ess0 + data['ess1'] = ess1 + data['ess01'] = ess01 + + applet = APPLET % data + + return applet + + + def get_total_coverage(self): + total = 0 + identified = 0 + + for (ver0, ver1) in self._prim_chg_count_tbl.keys(): + (i, t, c) = self.get_coverage_v(ver0, ver1) + identified += i + total += t + + coverage = None + + if total > 0: + coverage = float(identified) / float(total) + + return (identified, total, coverage) + + def get_coverage_v(self, ver0, ver1): + try: + return self._coverage_cache_v[(ver0, ver1)] + except KeyError: + + total = 0 + identified = 0 + + try: + fd_tbl = self._prim_chg_count_tbl[(ver0, ver1)] + for (fid0, fid1) in fd_tbl.keys(): + (i, t, c) = self.get_coverage_f(ver0, ver1, fid0, fid1) + identified += i + total += t + except KeyError: + pass + + coverage = None + + if total > 0: + coverage = float(identified) / float(total) + + res = (identified, total, coverage) + + self._coverage_cache_v[(ver0, ver1)] = res + + return res + + + def get_coverage_f(self, ver0, ver1, fid0, fid1): + try: + return self._coverage_cache_f[(ver0, ver1, fid0, fid1)] + except KeyError: + + total = 0 + unidentified = 0 + + try: + total = self._prim_chg_count_tbl[(ver0, ver1)][(fid0, fid1)] + except KeyError: + pass + + try: + unidentified = len(self._prim_chg_tbl[(ver0, ver1)][(fid0, fid1)]) + except KeyError: + pass + + coverage = None + + identified = total - unidentified + + if total > 0: + coverage = float(identified) / float(total) + + res = (identified, total, coverage) + + self._coverage_cache_f[(ver0, ver1, fid0, fid1)] = res + + return res + + def reduce_chgs(self, chgs): + prune = set() + graft = set() + for chg in chgs: + (ent0, ent1, ch, cat) = chg + if ch in ('grafted onto', 'weak addition'): + graft.add(chg) + + elif ch in ('pruned from', 'weak removal'): + prune.add(chg) + + logger.debug('|prune|=%d |graft|=%d' % (len(prune), len(graft))) + + reduced = set() + + for chg in chgs: + (ent0, ent1, ch, cat) = chg + b = True + for (e0, e1, c, ct) in prune: + if ch in ('deleted from', 'pruned from', 'weak removal') and e0.contains(ent0) and e0 != ent0: + b = False + break + if b: + for (e0, e1, c, ct) in graft: + if ch in ('inserted into', 'grafted onto', 'weak 
addition') and e1.contains(ent1) and e1 != ent1: + b = False + break + + # if c in ('weak addition', 'weak removal'): + # b = False + + if b: + reduced.add(chg) + else: + prune.discard(chg) + graft.discard(chg) + + logger.debug('|prune|=%d |graft|=%d |reduced|=%d' % (len(prune), len(graft), len(reduced))) + + + return (reduced | prune | graft) + + + + def dump(self, outdir, foutdir=None): + + if not os.path.exists(outdir): + logger.warning('creating "%s"...' % outdir) + os.makedirs(outdir) + + fact = Fact(PREFIX_TBL) + + html_dir = os.path.join(outdir, self._proj_id) + + if not os.path.exists(html_dir): + logger.warning('creating "%s"...' % html_dir) + os.makedirs(html_dir) + + html_path = os.path.join(html_dir, 'index.html') + + path_summary = os.path.join(html_dir, 'summary.html') + + f_summary = open(path_summary, 'w') + + sloc = '-' + try: + sloc = format_num(self._conf.sloc) + except: + pass + avg_sloc = '-' + try: + avg_sloc = format_num(self._conf.sloc / self._conf.nversions) + except: + pass + + html_tbl_data = { 'proj_id' : self._proj_id, + 'sloc' : sloc, + 'avg_sloc' : avg_sloc, + 'html' : html_path, + 'nversions' : format_num(self._conf.nversions), + } + + count_tbl = {} + + for lang in self._result.keys(): + ver_tbl = self._result[lang] + for ver_pair in ver_tbl.keys(): + fd_tbl = ver_tbl[ver_pair] + for fd_pair in fd_tbl.keys(): + g_tbl = fd_tbl[fd_pair] + for g in g_tbl.keys(): + r_tbl = g_tbl[g] + for change_pat in r_tbl.keys(): + count_tbl[change_pat] = count_tbl.get(change_pat, 0) + len(r_tbl[change_pat]) + + html_head = HTML_HEAD % {'proj_id': self._proj_id } + + f_summary.write(html_head) + + f_summary.write('

    Total

    \n') + + # cov = self.get_total_coverage() + cov = None + + html_tbl_data['npatterns'] = format_num(len(count_tbl.keys())) + + self._show_stat(f_summary, count_tbl, cov) + + ver_pair_count = 0 + file_count = 0 + count = 0 + + xkey_tbl = {} + + first_path_sub = None + + for lang in self._result.keys(): + + ver_tbl = self._result[lang] + ver_pairs = list(ver_tbl.keys()) + + self.setup_metric_tbl(ver_pairs) + self.sort_ver_pairs(ver_pairs) + + for (ver0, ver1) in ver_pairs: + + ### BEGIN EXTRACT FACT + if self.extract_fact(lang) and self._extra_fact_extractor: + sent0 = self.get_srctree_ent(ver0) + sent1 = self.get_srctree_ent(ver1) + self._extra_fact_extractor.extract_srctree_fact(fact, sent0, sent1, ver0, ver1) + ### END EXTRACT FACT + + lver0 = get_localname(ver0) + lver1 = get_localname(ver1) + + path_sub = os.path.join(html_dir, '%d.html' % ver_pair_count) + + vpid = str(ver_pair_count) + + ver_pair_count += 1 + + if first_path_sub == None: + first_path_sub = path_sub + + f_sub = open(path_sub, 'w') + f_sub.write(html_head) + + f_summary.write('
    \n') + + aver0 = self.get_abbrev_ver(lver0) + aver1 = self.get_abbrev_ver(lver1) + + f_summary.write('

    Between "%s" and "%s"

    \n' % (aver0, aver1)) + + if self._metric_tbl: + m = self._metric_tbl[(ver0, ver1)] + copts_str = m['copts'] + if copts_str: + copts_str = ' (%s)' % copts_str + met = '%s: %s%s
    \n' % (self._extra_fact_extractor.metrics, m['value'], copts_str) + f_summary.write(met) + + other_info = self._extra_fact_extractor.get_other_info(ver0, ver1) + if other_info: + for k in other_info.keys(): + f_summary.write('%s: %s
    \n' % (k, other_info[k])) + + + f_sub.write('

    Patterns found between "%s" and "%s"

    \n' % (aver0, aver1)) + + fd_tbl = ver_tbl[(ver0, ver1)] + + instances = [] + + for fd_pair in fd_tbl.keys(): + g_tbl = fd_tbl[fd_pair] + for g in g_tbl.keys(): + r_tbl = g_tbl[g] + for change_pat in r_tbl.keys(): + i_tbl = r_tbl[change_pat] + for k in i_tbl.keys(): + (v0, v1, e0, e1) = i_tbl[k] + iid = (change_pat, k) + instances.append(iid) + + ninstances = len(instances) + logger.info('%d instances found between "%s" and "%s"' % (ninstances, lver0, lver1)) + + if self._limit != None: + if ninstances > self._limit: + reduced_count = 0 + reduced_instances = random.sample(instances, self._limit) + logger.info('--> randomly sampled %d instances' % len(reduced_instances)) + + for fd_pair in fd_tbl.keys(): + g_tbl = fd_tbl[fd_pair] + for g in g_tbl.keys(): + r_tbl = g_tbl[g] + for change_pat in r_tbl.keys(): + rs = [] + i_tbl = r_tbl[change_pat] + for k in i_tbl.keys(): + (v0, v1, e0, e1) = i_tbl[k] + iid = (change_pat, k) + if iid in reduced_instances: + rs.append((v0, v1, e0, e1)) + reduced_count += 1 + r_tbl[change_pat] = rs + + logger.info('--> %d instances selected' % reduced_count) + + for fd_pair in fd_tbl.keys(): + g_tbl = fd_tbl[fd_pair] + for g in g_tbl.keys(): + r_tbl = g_tbl[g] + for change_pat in r_tbl.keys(): + if r_tbl[change_pat] == []: + del r_tbl[change_pat] + + for fd_pair in fd_tbl.keys(): + if fd_tbl[fd_pair] == {}: + del fd_tbl[fd_pair] + + + count_tbl = {} + for fd_pair in fd_tbl.keys(): + g_tbl = fd_tbl[fd_pair] + for g in g_tbl.keys(): + r_tbl = g_tbl[g] + for change_pat in r_tbl.keys(): + count_tbl[change_pat] = count_tbl.get(change_pat, 0) + len(r_tbl[change_pat].keys()) + + # cov = self.get_coverage_v(ver0, ver1) + cov = None + + self._show_stat(f_summary, count_tbl, cov) + + link = 'Show Detail\n' % (os.path.basename(path_sub)) + f_summary.write(link) + + + + for (fid0, fid1) in fd_tbl.keys(): + + path0 = self.get_source_path(fid0, ver0) + path1 = self.get_source_path(fid1, ver1) + + file_count += 1 + + file_pair_node = None + ### BEGIN EXTRACT FACT + if self.extract_fact(lang): + #fent0 = self._fent_tbl[fid0] + #fent1 = self._fent_tbl[fid1] + fent0 = self.get_fent(fid0) + fent1 = self.get_fent(fid1) + file_pair_node = mkfilepair(fent0, fent1) + ps = self.get_predicates(lang) + fact.add(file_pair_node, P_TYPE, ps.c_filepair) + fact.add(file_pair_node, ps.p_orig_file, fent0) + fact.add(file_pair_node, ps.p_modi_file, fent1) + #self._extra_fact_extractor.extract_file_fact(fact, fent0, fent1, ver0, ver1) + ### END EXTRACT FACT + + + url0 = self.get_source_url(fid0, ver0, path=path0) + url1 = self.get_source_url(fid1, ver1, path=path1) + + f_sub.write('

    [%d] %s
    - %s

    \n' % (file_count, path0, path1)) + f_sub.write('
      \n') + + applet_data = { + 'proj_id' : self._proj_id, + # 'cache' : pathname2url(self._cache_base_dir), + # 'diff' : self.get_diff_url(fid0, fid1), + 'algo' : self._hash_algo, + 'ver0' : aver0, + 'ver1' : aver1, + 'path0' : path0, + 'path1' : path1, + 'src0' : url0, + 'src1' : url1, + 'desc' : '{}', + 'ess0' : '[]', + 'ess1' : '[]', + } + item_data = {} + + g_tbl = fd_tbl.get((fid0, fid1), {}) + + for g in g_tbl.keys(): + + r_tbl = g_tbl[g] + + for change_pat in r_tbl.keys(): + + item_data['rname'] = change_pat + + i_tbl = r_tbl[change_pat] + + for k in i_tbl.keys(): + + (var0, var1, ent0, ent1) = i_tbl[k] + + others_tbl = self.get_others(change_pat, ent0, ent1) + + count += 1 + + ln0 = get_localname(str(ent0.get_uri())) + ln1 = get_localname(str(ent1.get_uri())) + xkey = '%s:%s:%s' % (change_pat, ln0, ln1) + xkey_tbl[xkey] = count + + ### BEGIN EXTRACT FACT + chgpat_node = None + if file_pair_node: + chgpat_node = mkchgpat(ent0, ent1, change_pat) + chg_ty_node = self.mkchgtype(lang, change_pat) + ps = self.get_predicates(lang) + fact.add(file_pair_node, ps.p_chgpat, chgpat_node) + fact.add(chgpat_node, ps.p_filepair, file_pair_node) + fact.add(chgpat_node, P_TYPE, chg_ty_node) + fact.add(chgpat_node, self.mkchgpatpred(lang, var0), ent0) + fact.add(chgpat_node, self.mkchgpatpred(lang, var1), ent1) + ### END EXTRACT FACT + + + item_data['count'] = count + + r0 = ent0.get_range() + r1 = ent1.get_range() + + line0 = r0.get_start_line() + line1 = r1.get_start_line() + + col0 = r0.get_start_col() + col1 = r1.get_start_col() + + applet_data['vpid'] = vpid + applet_data['form_id'] = str(count) + + applet_data['startl0'] = line0 + applet_data['startl1'] = line1 + + applet_data['startc0'] = col0 + applet_data['startc1'] = col1 + + applet_data['hilit0'] = self.get_hilit(r0) + applet_data['hilit1'] = self.get_hilit(r1) + + applet_data['var0'] = var0 + applet_data['var1'] = var1 + + otbl_keys = sorted(list(others_tbl.keys())) + + + ### BEGIN EXTRACT FACT + if chgpat_node: + for k in otbl_keys: + objs = [] + for s in others_tbl[k]: + if s.startswith('http'): + objs.append(Resource(uri=s)) + else: + objs.append(make_literal(s)) + + if len(objs) > 0: + pred = self.mkchgpatpred(lang, k) + + for obj in objs: + fact.add(chgpat_node, pred, obj) + + # for (pred, lits) in [(self.mkchgpatpred(lang, k), make_literals(others_tbl[k])) for k in otbl_keys]: + # for lit in lits: + # fact.add(chgpat_node, pred, lit) + ### END EXTRACT FACT + + other_vs = [] + for k in otbl_keys: + ss = list(filter(lambda x: not x.startswith('http'), others_tbl[k])) + if len(ss) > 0: + other_vs.append('%s: %s' % (k, str_set_to_str(ss))) + + # other_vs = ['%s: %s' % (k, str_set_to_str(others_tbl[k])) for k in otbl_keys] + others_str = ', '.join(other_vs) + item_data['others'] = others_str + + # + + desc = others_tbl.copy() + for (k, v) in desc.items(): + if isinstance(v, set): + if len(v) == 1: + desc[k] = v.pop() + else: + desc[k] = list(v) + + desc['$cid'] = count + desc['$change'] = change_pat + desc_str = jsondumps(desc) + + # + + ess0 = self.get_essentials_RM(change_pat, ent0, ent1) + ess1 = self.get_essentials_AD(change_pat, ent0, ent1) + ess01 = self.get_essentials_MP(change_pat, ent0, ent1) + + ess_tbl0 = {} # fid -> (so * eo * var) list + ess_tbl1 = {} # fid -> (so * eo * var) list + ess_tbl01 = {} # (fid * fid) -> ((so * eo * var) * (so * eo * var)) list + + def mkofs(e, v): + r = e.get_range() + so = r.get_start_offset() + eo = r.get_end_offset() + ofs = None + if so == eo: + ofs = (so, v) + elif so < eo: + ofs = (so, 
eo, v) + return ofs + + def mk_ess_tbl(ess, ess_tbl): + for (v, es) in ess.items(): + m = isinstance(v, tuple) + for e in es: + ofs = None + if m: + ofs = (mkofs(e[0], v[0]), mkofs(e[1], v[1])) + else: + ofs = mkofs(e, v) + + if ofs: + ofss = [] + if m: + fd = (e[0].get_file_id(), e[1].get_file_id()) + else: + fd = e.get_file_id() + try: + ofss = ess_tbl[fd] + except KeyError: + ess_tbl[fd] = ofss + + ofss.append(ofs) + + mk_ess_tbl(ess0, ess_tbl0) + mk_ess_tbl(ess1, ess_tbl1) + mk_ess_tbl(ess01, ess_tbl01) + + def finish_ess_tbl(tbl): + for (fd, ofss) in tbl.items(): + tbl[fd] = jsondumps(sorted(ofss)) + + finish_ess_tbl(ess_tbl0) + finish_ess_tbl(ess_tbl1) + finish_ess_tbl(ess_tbl01) + + fid0 = ent0.get_file_id() + fid1 = ent1.get_file_id() + + applet_data['desc'] = desc_str + applet_data['ess0'] = ess_tbl0.get(fid0, '[]') + applet_data['ess1'] = ess_tbl1.get(fid1, '[]') + applet_data['ess01'] = ess_tbl01.get((fid0, fid1), '[]') + + # + + applet = APPLET % applet_data + + extra_pairs = self.get_extra_ent_pairs(change_pat, ent0, ent1) + + aps = [] + + sub_count = 0 + + for (v0, v1, e0, e1) in extra_pairs: + f0 = e0.get_file_id() + f1 = e1.get_file_id() + ess0_str = ess_tbl0.get(f0, '[]') + ess1_str = ess_tbl1.get(f1, '[]') + ess01_str = ess_tbl01.get((f0, f1), '[]') + + form_id = '%d-%d' % (count, sub_count) + + a = self.make_applet_html(vpid, form_id, v0, v1, e0, e1, ver0, ver1, desc_str, ess0_str, ess1_str, ess01_str) + + if a: + aps.append(a) + sub_count += 1 + + extra_applets = ''.join(aps) + + ### BEGIN EXTRACT FACT + if chgpat_node: + for (v0, v1, e0, e1) in extra_pairs: + fact.add(chgpat_node, self.mkchgpatpred(lang, v0), e0) + fact.add(chgpat_node, self.mkchgpatpred(lang, v1), e1) + ### END EXTRACT FACT + + item_data['applet'] = applet + extra_applets + + f_sub.write(HTML_ITEM % item_data) + + f_sub.write('
    \n') + + # dump unidentified changes + + count_uc = 0 + + chgs = [] + + try: + chgs = self._prim_chg_tbl[(ver0, ver1)][(fid0, fid1)] + except KeyError: + pass + + reduced_chgs = self.reduce_chgs(chgs) + + if reduced_chgs: + f_sub.write('
    Other Changes
    \n') + + for (e0, e1, c, ct) in reduced_chgs: + + r0 = e0.get_range() + r1 = e1.get_range() + + ln0 = r0.get_start_line() + ln1 = r1.get_start_line() + + col0 = r0.get_start_col() + col1 = r1.get_start_col() + + applet_data['startl0'] = ln0 + applet_data['startl1'] = ln1 + + applet_data['startc0'] = col0 + applet_data['startc1'] = col1 + + applet_data['hilit0'] = self.get_hilit(r0) + applet_data['hilit1'] = self.get_hilit(r1) + + applet_data['var0'] = '_' + applet_data['var1'] = '_' + + applet_data['form_id'] = 'o%d' % count_uc + + applet = APPLET % applet_data + + count_uc += 1 + + item_data['rname'] = '%s (%s)' % (c, ct) + item_data['count'] = count_uc + item_data['others'] = '' + item_data['applet'] = applet + + f_sub.write('
      \n') + f_sub.write(HTML_ITEM % item_data) + f_sub.write('
    \n') + + + f_sub.write(HTML_TAIL) + f_sub.close() + + f_summary.write(HTML_TAIL) + f_summary.close() + + f = open(html_path, 'w') + + summary = '' + if path_summary: + summary = os.path.basename(path_summary) + first = '' + if first_path_sub: + first = os.path.basename(first_path_sub) + + f.write(FRAME_PAGE % { 'proj_id' : self._proj_id, + 'summary' : summary, + 'first' : first, + 'year' : datetime.today().year, + }) + + f.close() + + if self.extract_fact(lang): + fdir = outdir + if foutdir: + fdir = foutdir + fact.write(os.path.join(fdir, self._proj_id+'.ttl')) + + + ## html table for index.html + path_html_tbl = os.path.join(outdir, self._proj_id+'.tdata') + f_html_tbl = open(path_html_tbl, 'w') + html_tbl_data['ntriples'] = format_num(self.count_triples()) + + html_tbl_data0 = html_tbl_data.copy() + html_tbl_data0['html'] = os.path.join(self._proj_id, 'index.html') + + f_html_tbl.write(HTML_TBL % html_tbl_data0) + f_html_tbl.close() + ## + + with open(os.path.join(outdir, self._proj_id+'.json'), 'w') as f: + json.dump(xkey_tbl, f) + + + + def get_vindex(self, uri): + vi = None + try: + vi = self._conf.versionURIs.index(uri) + except: + pass + return vi + + def get_ent(self, uri): + try: + ent = SourceCodeEntity(uri=uri) + return ent + except Exception as e: + logger.error ('%s: uri=\"%s\"' % (e, uri)) + + def get_uri(self, row, var): + uri = row[var] + if not uri: + logger.warning('cannot get URI for "%s"' % var) + return uri + + def add_extra_ent_pair(self, chgpat, bent0_bent1, var0_var1, ent0_ent1): + bent0, bent1 = bent0_bent1 + var0, var1 = var0_var1 + ent0, ent1 = ent0_ent1 + vtbl = {} + bkey = (chgpat, bent0.get_uri(), bent1.get_uri()) + try: + vtbl = self._extra_ent_pairs_tbl[bkey] + except KeyError: + self._extra_ent_pairs_tbl[bkey] = vtbl + + eps = set() + try: + eps = vtbl[(var0, var1)] + except KeyError: + vtbl[(var0, var1)] = eps + + eps.add((ent0, ent1)) + + + def get_extra_ent_pairs(self, chgpat, bent0, bent1): + res = [] + bkey = (chgpat, bent0.get_uri(), bent1.get_uri()) + try: + vtbl = self._extra_ent_pairs_tbl[bkey] + for (v0, v1) in vtbl.keys(): + for (e0, e1) in vtbl[(v0, v1)]: + res.append((v0, v1, e0, e1)) + except KeyError: + pass + + res.sort(key=lambda x: factutil.range.Key(x[2].get_range())) + + return res + + + def add_other(self, chgpat, bent0_bent1, key, val): + bent0, bent1 = bent0_bent1 + vtbl = {} + bkey = (chgpat, bent0.get_uri(), bent1.get_uri()) + try: + vtbl = self._others_tbl[bkey] + except KeyError: + self._others_tbl[bkey] = vtbl + + vs = set() + try: + vs = vtbl[key] + except KeyError: + vtbl[key] = vs + + vs.add(val) + + + def get_others(self, chgpat, bent0, bent1): + vtbl = {} + bkey = (chgpat, bent0.get_uri(), bent1.get_uri()) + try: + vtbl = self._others_tbl[bkey] + except KeyError: + pass + + return vtbl + + + def add_essential(self, ess_tbl, chgpat, bent0_bent1, var, val): + bent0, bent1 = bent0_bent1 + vtbl = {} + bkey = (chgpat, bent0.get_uri(), bent1.get_uri()) + try: + vtbl = ess_tbl[bkey] + except KeyError: + ess_tbl[bkey] = vtbl + + s = set() + try: + s = vtbl[var] + except KeyError: + vtbl[var] = s + + s.add(val) + + def add_essential_RM(self, chgpat, bent01, var, val): + self.add_essential(self._ess_tbl_rm, chgpat, bent01, var, val) + + def add_essential_AD(self, chgpat, bent01, var, val): + self.add_essential(self._ess_tbl_ad, chgpat, bent01, var, val) + + def add_essential_MP(self, chgpat, bent01, var, val): + self.add_essential(self._ess_tbl_mp, chgpat, bent01, var, val) + + + def get_essentials(self, ess_tbl, chgpat, bent0, 
bent1): + tbl = {} + bkey = (chgpat, bent0.get_uri(), bent1.get_uri()) + try: + tbl = ess_tbl[bkey] + except KeyError: + pass + return tbl + + def get_essentials_RM(self, chgpat, bent0, bent1): + return self.get_essentials(self._ess_tbl_rm, chgpat, bent0, bent1) + + def get_essentials_AD(self, chgpat, bent0, bent1): + return self.get_essentials(self._ess_tbl_ad, chgpat, bent0, bent1) + + def get_essentials_MP(self, chgpat, bent0, bent1): + return self.get_essentials(self._ess_tbl_mp, chgpat, bent0, bent1) + + + + + def _enumerate_changes(self, lang, q): + + logger.info('enumerating changes (%s)...' % q), + + start = time.time() + + try: + _query, per_ver = self.get_query(lang, q) + + tbl = {} + + if per_ver: + for (ver0, ver1) in self._conf.vURIpairs: + logger.info(' "%s"-"%s"' % (get_localname(ver0), get_localname(ver1))) + + count = 0 + + query = self.add_ver_filter(_query, ver0, ver1) + + for qvs, row in self._sparql.query(query): + uri0 = self.get_uri(row, 'ent') + uri1 = self.get_uri(row, 'ent_') + chg = self.get_uri(row, 'chg') + cat = get_localname(self.get_uri(row, 'cat')) + + skip = chg == 'modified' + + if uri0 == None or uri1 == None or skip: + continue + + ent0 = self.get_ent(uri0) + ent1 = self.get_ent(uri1) + + if ent0.is_file() and ent1.is_file(): + continue + + try: + tbl[(ver0, ver1)].append((ent0, ent1, chg, cat)) + except KeyError: + tbl[(ver0, ver1)] = [(ent0, ent1, chg, cat)] + + count += 1 + + logger.debug('%d rows processed' % count) + + + else: + count = 0 + + for qvs, row in self._sparql.query(_query): + ver0 = self.get_uri(row, 'ver') + ver1 = self.get_uri(row, 'ver_') + uri0 = self.get_uri(row, 'ent') + uri1 = self.get_uri(row, 'ent_') + chg = self.get_uri(row, 'chg') + cat = get_localname(self.get_uri(row, 'cat')) + + skip = chg == 'modified' + + if uri0 == None or uri1 == None or skip: + continue + + ent0 = self.get_ent(uri0) + ent1 = self.get_ent(uri1) + + if ent0.is_file() and ent1.is_file(): + continue + + try: + tbl[(ver0, ver1)].append((ent0, ent1, chg, cat)) + except KeyError: + tbl[(ver0, ver1)] = [(ent0, ent1, chg, cat)] + + count += 1 + + logger.debug('%d rows processed' % count) + + + for (ver0, ver1) in tbl.keys(): + fd_tbl = {} + try: + fd_tbl = self._prim_chg_tbl[(ver0, ver1)] + except KeyError: + self._prim_chg_tbl[(ver0, ver1)] = fd_tbl + + for (ent0, ent1, chg, cat) in tbl[(ver0, ver1)]: + + fid0 = ent0.get_file_id() + fid1 = ent1.get_file_id() + + self.add_fent(fid0, SourceCodeEntity(file_id=fid0)) + self.add_fent(fid1, SourceCodeEntity(file_id=fid1)) + + chgs = set() + try: + chgs = fd_tbl[(fid0, fid1)] + except KeyError: + fd_tbl[(fid0, fid1)] = chgs + + chgs.add((ent0, ent1, chg, cat)) + + + + logger.debug('primitive changes:') + + self._prim_chg_count_tbl = {} + + for k0 in self._prim_chg_tbl.keys(): + + (ver0, ver1) = k0 + + logger.debug(' %s-%s:' % (get_localname(ver0), get_localname(ver1))) + + fd_tbl = {} + try: + fd_tbl = self._prim_chg_count_tbl[k0] + except KeyError: + self._prim_chg_count_tbl[k0] = fd_tbl + + for k1 in self._prim_chg_tbl[k0].keys(): + (fid0, fid1) = k1 + n = len(self._prim_chg_tbl[k0][k1]) + + logger.debug(' "%s"-"%s": %d' % (fid0.encode(), fid1.encode(), n)) + + self._prim_chg_count_tbl[k0][k1] = n + + t = time.time() - start + logger.info('done. (%ds)' % t) + + except QueryNotFound as e: + logger.warning('query not found: "%s"' % e.query) + + + + def enumerate_primitive_changes(self, lang): + logger.info('enumerating primitive changes for %s...' 
% lang) + self._prim_chg_tbl = {} + lang = None + self._enumerate_changes(lang, Q_ENUM_MAPPING_CHG) + self._enumerate_changes(lang, Q_ENUM_ADDITION) + self._enumerate_changes(lang, Q_ENUM_REMOVAL) + + def add_fent(self, fid, e): + if fid not in self._fent_tbl: + self._fent_tbl[fid] = e + + def get_fent(self, fid): + fent = None + try: + fent = self._fent_tbl[fid] + except KeyError: + fent = SourceCodeEntity(file_id=fid) + self.add_fent(fid, fent) + return fent + + + # def enumerate_files(self): + + # logger.info('enumerating files...') + + # query, per_ver = self.get_query(None, Q_ENUM_FILE) + + # for qvs, row in self._sparql.query(query): + # uri = self.get_uri(row, 'ent') + + # if uri == None: + # continue + + # ent = self.get_ent(uri) + + # if ent.is_file(): + # fid = ent.get_file_id() + # self.add_fent(fid, SourceCodeEntity(file_id=fid)) + + # logger.info('done.') + + def is_removal(self, s): + b = s in ('deleted or pruned', 'deleted from', 'pruned from', 'weak removal') + logger.debug('%s --> %s' % (s, b)) + return b + + def is_addition(self, s): + b = s in ('inserted or grafted', 'inserted into', 'grafted onto', 'weak addition') + logger.debug('%s --> %s' % (s, b)) + return b + + def is_mapping_chg(self, s): + b = s in ('changed to', 'modified', 'moved to', 'order changed', 'renamed') + logger.debug('%s --> %s' % (s, b)) + return b + + def remove_from_prim_chg_tbl(self, ver0, ver1, fid0, fid1, ent0, ent1): + try: + chgs = self._prim_chg_tbl[(ver0, ver1)][(fid0, fid1)] + to_be_removed = [] + for chg in chgs: + (e0, e1, c, ct) = chg + + cond = False + + if ent0 and not ent1: + cond = ent0.contains(e0) and self.is_removal(c) + + elif not ent0 and ent1: + cond = ent1.contains(e1) and self.is_addition(c) + + elif ent0 and ent1: + cond0 = ent0.contains(e0) and ent1.contains(e1) and self.is_mapping_chg(c) + cond1 = ent0.contains(e0) and self.is_removal(c) + cond2 = ent1.contains(e1) and self.is_addition(c) + cond = cond0 or cond1 or cond2 + + if cond: + to_be_removed.append(chg) + + for chg in to_be_removed: + (e0, e1, c, ct) = chg + logger.debug('removing (%s,%s,%s)' % (e0, e1, c)) + chgs.remove(chg) + + except KeyError: + pass + + + + + def find(self, force_per_ver=False, query_prec=False): + self._extra_ent_pairs_tbl = {} + self._result = {} + + if query_prec: + #self._inst_key_tbl = {} # (ver * ver) -> inst_key set + self._inst_elemsl_tbl = {} # (ver * ver) -> elem_set list + + #self.enumerate_files() + + for lang in self._queries.keys(): + if self._lang != None and lang != self._lang: + continue + + if self._change_enumeration: + self.enumerate_primitive_changes(lang) + + try: + ver_tbl = self._result[lang] # (ver * ver) -> (fid * fid) -> change_pat_name -> (ent * ent) list + except KeyError: + ver_tbl = {} + self._result[lang] = ver_tbl + + qdata = self._queries[lang] + + for (q, var0, var1, extra, others, essential, inst_key, inst_key_is_one_to_one, is_complex, min_extra) in qdata: + + if query_prec: + skip_count = 0 + + name = fname_to_title(q) + logger.info('finding \"%s\" for %s...' 
% (name, lang)), + sys.stdout.flush() + + ess0, ess1, ess01 = essential + + has_extra = len(extra) > 0 + + _query, per_ver = self.get_query(lang, q, force_per_ver=(force_per_ver and is_complex)) + + #logger.debug('query:\n%s' % _query) + + start = time.time() + + vpairs = [] + + if self._conf.vpairs: + if per_ver: + vpairs = self._conf.vURIpairs + else: + vpairs.append(self._conf.vURIpairs[0]) + else: + if per_ver: + for i in range(self._conf.nversions - 1): + v0 = self._conf.versionURIs[i] + v1 = self._conf.versionURIs[i+1] + vpairs.append((v0, v1)) + else: + vpairs.append((self._conf.versionURIs[0], self._conf.versionURIs[1])) + + nvpairs = len(vpairs) + vp_count = 0 + + if per_ver: + logger.info('') + + for (ver0, ver1) in vpairs: + vp_count += 1 + + logger.debug('%s vs %s' % (ver0, ver1)) + + st0 = time.time() + + if per_ver: + query = self.add_ver_filter(_query, ver0, ver1) + sys.stdout.write('<%s -> %s> [%d/%d]' % (self.get_ver_name(ver0), + self.get_ver_name(ver1), + vp_count, + nvpairs)) + sys.stdout.flush() + else: + query = _query + + #logger.debug('query:\n%s' % query) + + rows = [] + + for qvs, row in self._sparql.query(query): + rows.append(row) + + extra_count_tbl = {} + + for row in rows: + uri0 = self.get_uri(row, var0) + uri1 = self.get_uri(row, var1) + + if uri0 == None or uri1 == None: + continue + + ######################################## + if query_prec and inst_key: + def get(v): + u = row.get(v, None) + e = None + # if u: + # e = self.get_ent(u) + return u + + ikey = tuple([get(v) for v in inst_key]) + + vp = (ver0, ver1) + + elemsl = self._inst_elemsl_tbl.get(vp, []) + if elemsl == []: + self._inst_elemsl_tbl[vp] = [set([x]) for x in ikey] + else: + all_match_mode = False + any_match_mode = False + if inst_key_is_one_to_one: + all_match_mode = not any(inst_key_is_one_to_one) + any_match_mode = all(inst_key_is_one_to_one) + + def get_elems(i): + try: + elems = elemsl[i] + except IndexError: + elems = set() + elemsl.append(elems) + return elems + + if any_match_mode: + skip_flag = any([x in get_elems(i) for (i, x) in enumerate(ikey)]) + elif all_match_mode: + skip_flag = all([x in get_elems(i) for (i, x) in enumerate(ikey)]) + else: + vec = [] + for (i, x) in enumerate(ikey): + is_one_to_one = False + if inst_key_is_one_to_one: + try: + is_one_to_one = inst_key_is_one_to_one[i] + except IndexError: + pass + if not is_one_to_one: + vec.append(x in get_elems(i)) + skip_flag = all(vec) + + if skip_flag: + skip_count += 1 + continue + else: + for (i, x) in enumerate(ikey): + get_elems(i).add(x) + + # if ikey in self._inst_key_tbl.get(vp, []): + # skip_count += 1 + # continue + # else: + # try: + # s = self._inst_key_tbl[vp] + # except KeyError: + # s = set() + # self._inst_key_tbl[vp] = s + # s.add(ikey) + ######################################## + + k = (uri0, uri1) + + for (v0, v1) in extra: + u0 = row[v0] + u1 = row[v1] + + if u0 and u1: + e0 = self.get_ent(u0) + e1 = self.get_ent(u1) + + if not self.check_path(e0, e1, ver0, ver1, per_ver): + continue + + try: + extra_count_tbl[k] += 1 + except KeyError: + extra_count_tbl[k] = 1 + + for row in rows: + + uri0 = self.get_uri(row, var0) + uri1 = self.get_uri(row, var1) + + logger.debug('var0: %s --> uri0: %s' % (var0, uri0)) + logger.debug('var1: %s --> uri1: %s' % (var1, uri1)) + #print('var0: %s --> %s' % (var0, uri0)) + #print('var1: %s --> %s' % (var1, uri1)) + + if uri0 == None or uri1 == None: + continue + + if has_extra: + extra_count = extra_count_tbl.get((uri0, uri1), 0) + logger.debug('extra_count=%d 
min_extra=%d' % (extra_count, min_extra)) + if extra_count < min_extra: + continue + + ent0 = self.get_ent(uri0) + ent1 = self.get_ent(uri1) + + fid0 = ent0.get_file_id() + fid1 = ent1.get_file_id() + + if fid0 == fid1: + logger.warning('illegal URI pair: %s and %s' % (uri0, uri1)) + + version_pairs = [(ver0, ver1)] + + if not per_ver: + version_pairs = self.get_version_pairs(ent0, ent1) + if version_pairs == []: + logger.warning('%s' % name) + continue + + if self._conf.include: + ok = False + for (ve0, ve1) in version_pairs: + path0 = self.get_source_path(fid0, ve0) + path1 = self.get_source_path(fid1, ve1) + if path0 or path1: + ok = True + break + + if not ok: + continue + + + key_var = '' + try: + key_var = row[KEY_VAR_NAME] + except: + pass + + chg_name = name + if key_var: + chg_name = '"%s" %s' % (key_var, name) + + + for (v0, v1) in extra: + u0 = row[v0] + u1 = row[v1] + if u0 and u1: + e0 = self.get_ent(u0) + e1 = self.get_ent(u1) + + if not self.check_path(e0, e1, ver0, ver1, per_ver): + continue + + self.add_extra_ent_pair(chg_name, (ent0, ent1), (v0, v1), (e0, e1)) + + + ess_exists = False + + for v0 in ess0: + u0 = row[v0] + if u0: + ess_exists = True + e0 = self.get_ent(u0) + + self.add_essential_RM(chg_name, (ent0, ent1), v0, e0) + + if self._change_enumeration: + for (ver0_, ver1_) in version_pairs: + self.remove_from_prim_chg_tbl(ver0_, ver1_, fid0, fid1, e0, None) + + for v1 in ess1: + u1 = row[v1] + if u1: + ess_exists = True + e1 = self.get_ent(u1) + + self.add_essential_AD(chg_name, (ent0, ent1), v1, e1) + + if self._change_enumeration: + for (ver0_, ver1_) in version_pairs: + self.remove_from_prim_chg_tbl(ver0_, ver1_, fid0, fid1, None, e1) + + for (v0, v1) in ess01: + u0 = row[v0] + u1 = row[v1] + if u0 and u1: + ess_exists = True + e0 = self.get_ent(u0) + e1 = self.get_ent(u1) + + self.add_essential_MP(chg_name, (ent0, ent1), (v0, v1), (e0, e1)) + + if self._change_enumeration: + for (ver0_, ver1_) in version_pairs: + self.remove_from_prim_chg_tbl(ver0_, ver1_, fid0, fid1, e0, e1) + + if not ess_exists: + #logger.warning('no essential variables found') + continue + + + for v in others: + try: + x = row[v] + if x: + self.add_other(chg_name, (ent0, ent1), v, x) + + except Exception as e: + logger.warning(str(e)) + + data = (var0, var1, ent0, ent1) + + key = (ent0.get_uri(), ent1.get_uri()) + + + + + fd_tbls = [] + for (ver0_, ver1_) in version_pairs: + try: + fd_tbls.append(ver_tbl[(ver0_, ver1_)]) + except KeyError: + fd_tbl = {} + ver_tbl[(ver0_, ver1_)] = fd_tbl + fd_tbls.append(fd_tbl) + + grp = GROUP_SEPARATOR + try: + gstr = row[GROUP_VAR_NAME] + if gstr: + gs = [x for x in gstr.split(GROUP_SEPARATOR) if x != ''] + + ys = [] + for ns in gs: + xs = [x for x in ns.split(NAME_SEPARATOR) if x != ''] + xs.sort() + ys.append(NAME_SEPARATOR.join(xs)) + grp = GROUP_SEPARATOR.join(ys) + + except Exception as e: + logger.warning('group not set: %s' % e) + + + for fd_tbl in fd_tbls: + + g_tbl = {} + + try: + g_tbl = fd_tbl[(fid0, fid1)] + except KeyError: + g_tbl = {} + fd_tbl[(fid0, fid1)] = g_tbl + + try: + d = g_tbl[grp] + + try: + t = d[chg_name] + t[key] = data + except KeyError: + d[chg_name] = {key : data} + + except KeyError: + d = { chg_name : {key: data} } + g_tbl[grp] = d + + + if per_ver: + logger.info(' (%ds)' % (time.time() - st0)) + + + t = time.time() - start + logger.info('done. 
(%ds)' % t) + + if query_prec and skip_count: + logger.info('%d instances skipped' % skip_count) + + +def find(query_dir, queries, predicate_tbl, extra_fact_extractor, + base_dir, proj_id, foutdir, outdir, pw, port, + limit, lang, method, change_enumeration, per_ver, query_prec, conf=None): + + finder = Finder(query_dir, queries, base_dir, proj_id, + predicate_tbl=predicate_tbl, limit=limit, lang=lang, + extra_fact_extractor=extra_fact_extractor, conf=conf, + method=method, pw=pw, port=port) + + if change_enumeration: + finder.enable_change_enumeration() + + finder.find(force_per_ver=per_ver, query_prec=query_prec) + + fact_outdir = foutdir + if fact_outdir == DEFAULT_FACT_OUTPUT_DIR: + fact_outdir = fact_outdir.replace('', proj_id) + + finder.dump(outdir, fact_outdir) + + +def main(query_dir, queries, desc, predicate_tbl=None, extra_fact_extractor=None): + + from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter + + parser = ArgumentParser(description=desc, + formatter_class=ArgumentDefaultsHelpFormatter) + + parser.add_argument('work_dir', type=str, help="diffts's work directory for factbase construction") + parser.add_argument('proj_id', type=str, help='project id') + + parser.add_argument('--port', dest='port', default=VIRTUOSO_PORT, + metavar='PORT', type=int, help='set port number') + + parser.add_argument('--pw', dest='pw', metavar='PASSWORD', default=VIRTUOSO_PW, + help='set password to access DB') + + parser.add_argument('-m', '--method', dest='method', default='odbc', + metavar='METHOD', type=str, help='execute query via METHOD (http|odbc)') + + parser.add_argument('-d', '--debug', dest='debug', action='store_true', help='enable debug printing') + + parser.add_argument('-c', '--enable-change-enumeration', dest='change_enumeration', action='store_true', + help='enable change enumeration') + + parser.add_argument('-o', '--outdir', dest='outdir', default='.', + metavar='DIR', type=str, help='dump result into DIR') + + parser.add_argument('-f', '--fact-outdir', dest='foutdir', default=DEFAULT_FACT_OUTPUT_DIR, + metavar='DIR', type=str, help='dump fact into DIR') + + parser.add_argument('-l', '--limit', dest='limit', default=None, + metavar='N', type=int, help='at most N instances are reported for each version pair') + + parser.add_argument('--force-per-ver', dest='per_ver', action='store_true', + help='force complex queries to be executed per version pairs') + + parser.add_argument('--query-prec', dest='query_prec', action='store_true', help='recognize query precedence') + + parser.add_argument('--lang', dest='lang', default=None, + metavar='LANG', type=str, help='query only for LANG (c|java|fortran)') + + args = parser.parse_args() + + log_level = logging.INFO + if args.debug: + log_level = logging.DEBUG + setup_logger(logger, log_level) + + find(query_dir, queries, predicate_tbl, extra_fact_extractor, + args.work_dir, args.proj_id, args.foutdir, args.outdir, args.pw, args.port, + args.limit, args.lang, args.method, + args.change_enumeration, args.per_ver, args.query_prec) diff --git a/python/src/cca/ccautil/find_refactoring.py b/python/src/cca/ccautil/find_refactoring.py new file mode 100644 index 0000000..cd1b7e8 --- /dev/null +++ b/python/src/cca/ccautil/find_refactoring.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 + + +''' + find_refactoring.py + + Copyright 2018-2021 Chiba Institute of Technology + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
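The find() wrapper above is the programmatic counterpart of main(); a minimal sketch of calling it directly, assuming a language front end such as the find_refactoring module below supplies the query table, predicate table, and fact extractor. The work directory, project id, and output directories are placeholders, and VIRTUOSO_PW/VIRTUOSO_PORT are assumed importable from the sibling virtuoso module, as elsewhere in this package:

    # Sketch only: every path and the project id below are placeholders.
    from cca.ccautil import find_refactoring as fr
    from cca.ccautil.find_change_patterns import find
    from cca.ccautil.virtuoso import VIRTUOSO_PW, VIRTUOSO_PORT

    find(fr.QUERY_DIR, fr.QUERIES, fr.PREDICATE_TBL, fr.FactExtractor,
         '/work/diffts', 'myproj',      # base_dir, proj_id
         'fact-out', 'result-out',      # fact and result output directories
         VIRTUOSO_PW, VIRTUOSO_PORT,
         None, 'java', 'odbc',          # limit, lang, method
         False, False, False)           # change_enumeration, per_ver, query_prec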
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +''' + +__author__ = 'Masatomo Hashimoto ' + +import os.path +#import logging + +from .siteconf import CCA_HOME +from . import find_change_patterns, sparql +from .find_change_patterns import Predicates +from .ns import REF_NS, JREF_NS, CREF_NS + +from cca.factutil.rdf import Resource, Predicate + +#logger = logging.getLogger() + +QUERY_DIR = os.path.join(CCA_HOME, 'queries', 'refactoring') + + +class FactExtractor(find_change_patterns.FactExtractor): + + def get_other_info(self, ver0, ver1): + return self.get_git_commit_info(ver0, ver1) + +JAVA_PREDICATES = Predicates() +JAVA_PREDICATES.chgpat_ns = JREF_NS +JAVA_PREDICATES.p_filepair = Predicate(REF_NS, 'filePair') +JAVA_PREDICATES.p_chgpat = Predicate(REF_NS, 'refactoring') + +C_PREDICATES = Predicates() +C_PREDICATES.chgpat_ns = CREF_NS +C_PREDICATES.p_filepair = Predicate(REF_NS, 'filePair') +C_PREDICATES.p_chgpat = Predicate(REF_NS, 'refactoring') + + +PREDICATE_TBL = {'java':JAVA_PREDICATES,'c':C_PREDICATES} + + + +def get_queries(weak=False): + # push_down_method = 'push_down_method.rq' + # rename_method = 'rename_method.rq' + # move_method = 'move_method.rq' + + # if weak: + # push_down_method = 'weak_'+push_down_method + # rename_method = 'weak_'+rename_method + # move_method = 'weak_'+move_method + + queries = { 'java' : # (FILE, ENT_VAR0, ENT_VAR1, EXTRA_ENT_VAR_PAIRS, EXTRA_VAR_LIST, ESSENTIAL_VARS(RM,AD,MP), INST_KEY, INST_KEY_IS_ONE_TO_ONE, PER_VER, MIN_EXTRA_PAIRS) + [ + ('local_variable_rename.rq', 'originalDtor', 'modifiedDtor', + [('originalVariable','modifiedVariable')], ['originalVariableName','modifiedVariableName'], + ([],[],[('dtor','dtor_'),('v','v_')]), None, None, False, 1), + + ('add_parameter.rq', 'originalMethod', 'addedParameter', [], + #[('originalInvocation','modifiedInvocation')], + ['parameterName','methodName','modifiedMethod','className','class','class_'], + ([], ['param_'], []), None, None, False, 0), + + ('add_parameter_and_add_method.rq', 'originalMethod', 'addedParameter', [], + ['parameterName','methodName','modifiedMethod','className','class','class_','originalContext','addedMethod'], + ([], ['param_'], []), None, None, False, 0), + + ('remove_parameter.rq', 'removedParameter', 'modifiedMethod', [], + #[('originalInvocation','modifiedInvocation')], + ['parameterName','methodName','originalMethod','className','class','class_'], + (['param'], [], []), None, None, False, 0), + + ('remove_parameter_and_remove_method.rq', 'removedParameter', 'modifiedMethod', [], + ['parameterName','methodName','originalMethod','className','class','class_'], + (['param'], [], []), None, None, False, 0), + + ('rename_parameter.rq', 'originalParameter', 'modifiedParameter', + [('originalVariable','modifiedVariable')], + ['originalParameterName','modifiedParameterName','methodName','className','originalMethod','modifiedMethod','originalClass'], + ([],[],[('param','param_'),('v','v_')]), None, None, False, 1), + + ('change_bidirectional_association_to_unidirectional.rq', 'removedField', 'modifiedContext', + [('field0','field0_')], ['otherClassName','className','originalClass','modifiedClass'], + (['field1'], [], []), None, 
None, False, 1), + + ('change_unidirectional_association_to_bidirectional.rq', 'originalContext', 'addedField', + [('field0','field0_')], ['otherClassName','className','originalClass','modifiedClass'], + ([], ['field1_'], []), None, None, False, 1), + + ('consolidate_conditional_expression.rq', 'originalIf', 'modifiedIf', [], + ['methodName','originalMethod','modifiedMethod'], ([], [], [('if0','if_')]), None, None, False, 0), + + ('extract_class.rq', 'originalClass', 'extractedClass', [], + ['className', 'extractedClassName'], ([], ['class1_'], []), None, None, False, 0), + + ('inline_class.rq', 'inlinedClass', 'modifiedClass', [], + ['inlinedClassName','className'], (['class1'], [], []), None, None, False, 0), + + ('extract_interface.rq', 'originalContext', 'extractedInterface', [('originalClass','modifiedClass')], + ['interfaceName'], ([], ['interf_'], [('class','class_')]), None, None, False, 2), + + ('quasi_extract_interface.rq', 'originalContext', 'extractedInterface', [('originalClass','modifiedClass')], + ['interfaceName'], ([], ['interf_'], [('class','class_')]), None, None, False, 1), + + ('extract_method.rq', 'originalMethod', 'extractedMethod', [('originalContext', 'addedInvocation')], + ['originalMethodFQN','modifiedMethodFQN','extractedMethodFQN'], + ([], ['meth_'], [('ctx','ivk_')]), None, None, False, 1), + + ('inline_method.rq', 'inlinedMethod', 'modifiedMethod', [('removedInvocation','modifiedContext')], + ['originalMethodFQN','modifiedMethodFQN','inlinedMethodFQN'], + (['meth'], [], [('ivk','ctx_')]), None, None, False, 1), + + ('extract_superclass.rq', 'originalContext', 'extractedSuperclass', [('originalClass','modifiedClass')], + ['className','superclassName'], ([], ['SuperC_'], [('C0x','C0x_')]), None, None, False, 2), + + ('quasi_extract_superclass.rq', 'originalContext', 'extractedSuperclass', [('originalClass','modifiedClass')], + ['className','superclassName'], ([], ['SuperC_'], [('C0x','C0x_')]), None, None, False, 1), + + ('extract_superclass_and_move_field.rq', 'originalField', 'movedField', + [('originalField','context_'),('context','movedField')], + ['fieldTypeName','fieldName','fromClassName','toClassName','fromModifiers','toModifiers','fromClass','toClass'], + (['vdtor'], ['vdtor_'], []), ('vdtor','vdtor_'), (False, True), False, 2), + + ('extract_superclass_and_move_method.rq', 'originalMethod', 'movedMethod', + [('originalMethod','context_'),('context','movedMethod')], + ['signature','methodName','fromClassName','toClassName','fromClass','toClass'], + (['meth'], ['meth_'], []), ('meth','meth_'), (False, True), False, 2), + + ('form_template_method.rq', 'originalParentClass', 'modifiedParentClass', [], + ['templateMethodName','templateMethod','subMethod'], + ([], ['templM_','subM_'], []), None, None, False, 0), + + ('hide_delegate.rq', 'originalInvocation', 'modifiedInvocation', + [('originalContext', 'addedServerMethod')], + ['ClientClassName','ServerClassName','DelegateClassName','delegateMethodName','serverMethodName', + 'addedServerMethod','originalServerClass','modifiedServerClass', + 'originalClientClass','modifiedClientClass'], + ([], ['serverM_'], []), None, None, False, 1), + + ('remove_middle_man.rq', 'originalInvocation', 'modifiedInvocation', + [('removedServerMethod', 'modifiedContext')], + ['ClientClassName','ServerClassName','DelegateClassName','serverMethodName','delegateMethodName', + 'removedServerMethod','originalServerClass','modifiedServerClass', + 'originalClientClass','modifiedClientClass'], + (['serverM'], [], []), None, None, 
False, 1), + + ('hide_method.rq', 'originalMethod', 'modifiedMethod', [], + ['methodName'], ([], [], [('meth','meth_')]), None, None, False, 0), + + ('extract_variable.rq', 'originalExpr', 'movedExpr', + [('originalContext','modifiedContext'),('originalExpr','extractedVariable')], + ['extractedVariableName','originalMethodName','modifiedMethodName','originalMethod','modifiedMethod'], + ([], ['decl_','v_'], [('f','f_'),('a','rhs_')]), None, None, False, 0), + + ('inline_temp.rq', 'originalExpr', 'movedExpr', + [('originalContext','modifiedContext'),('eliminatedVariable','movedExpr')], + ['eliminatedVariableName','originalMethodName','modifiedMethodName','originalMethod','modifiedMethod'], + (['decl','v'], [], [('f','f_'),('rhs','a_')]), None, None, False, 0), + + ('introduce_assertion.rq', 'originalMethod', 'introducedAssertion', [], + ['methodName','modifiedMethod'], ([], ['assert_'], []), None, None, False, 0), + + ('introduce_local_extension.rq', 'originalClientMethod', 'modifiedClientMethod', [], + ['introducedClassName','introducedClass'], ([], ['ExtC_'], []), None, None, False, 0), + + ('introduce_null_object.rq', 'equation', 'modifiedContext', [], + ['className','introducedNullClass'], (['if'], ['null_class_'], []), None, None, False, 0), + + ('introduce_parameter_object.rq', 'originalMethod', 'modifiedMethod', [], + ['parameterNames','parameterClassName'], ([], ['param_'], []), None, None, False, 0), + + ('pull_up_constructor_body.rq', 'originalCtor', 'modifiedCtor', [], + ['className','originalClass','modifiedClass'], ([], [], [('ctor','ctor_')]), None, None, False, 0), + + ('pull_up_field.rq', 'originalContext', 'movedField', + [('originalField','modifiedContext'),('originalField','movedField')], + ['fieldName','className','superclassName','originalClass','modifiedClass','superclass'], + (['vdtor0x'], ['vdtor0_'], []), ('vdtor0x','vdtor0_'), (False, True), False, 1), + + ('quasi_pull_up_field.rq', 'originalContext', 'movedField', + [('originalField','modifiedContext'),('originalField','movedField')], + ['fieldName','className','superclassName','originalClass','modifiedClass','superclass'], + (['vdtor0x'], ['vdtor0_'], []), ('vdtor0x','vdtor0_'), (True, True), False, 1), + + ('push_down_field.rq', 'originalField', 'modifiedContext', + [('originalContext','movedField'),('originalField','movedField')], + ['fieldName','className','subclassName','originalClass','modifiedClass','subclass'], + (['vdtor0'], ['vdtor0_'], []), ('vdtor0','vdtor0_'), (True, False), False, 1), + + ('pull_up_method.rq', 'originalContext', 'movedMethod', + [('originalMethod','modifiedContext'),('originalMethod','movedMethod')], + ['methodName','signature','className','superclassName','originalClass','modifiedClass','superclass'], + (['methx'], ['meth_'], []), ('methx','meth_'), (False, True), False, 1), + + ('quasi_pull_up_method.rq', 'originalContext', 'movedMethod', + [('originalMethod','modifiedContext'),('originalMethod','movedMethod')], + ['methodName','signature','className','superclassName','originalClass','modifiedClass','superclass'], + (['methx'], ['meth_'], []), ('methx','meth_'), (True, True), False, 1), + + ('rename_and_pull_up_method.rq', 'originalContext', 'movedMethod', + [('originalMethod','modifiedContext'),('originalMethod','movedMethod')], + ['methodName','modifiedMethodName','signature','className','superclassName','originalClass','modifiedClass','superclass'], + (['meth'], ['meth_'], []), ('meth','meth_'), (False, True), False, 1), + + ('change_signature_and_pull_up_method.rq', 
'originalContext', 'movedMethod', + [('originalMethod','modifiedContext'),('originalMethod','movedMethod')], + ['methodName','signature','changedSignature','className','superclassName','originalClass','modifiedClass','superclass'], + (['meth'], ['meth_'], []), ('meth','meth_'), (False, True), False, 1), + + ('push_down_method.rq', 'originalMethod', 'modifiedContext', + [('originalContext','movedMethod'),('originalMethod','movedMethod')], + ['methodName','signature','className','subclassName','originalClass','modifiedClass','subclass'], + (['meth0'], ['meth0_'], []), ('meth0','meth0_'), (True, False), False, 1), + + ('quasi_push_down_method.rq', 'originalMethod', 'modifiedContext', + [('originalContext','movedMethod'),('originalMethod','movedMethod')], + ['methodName','signature','className','subclassName','originalClass','modifiedClass','subclass'], + (['meth0'], ['meth0_'], []), ('meth0','meth0_'), (True, True), False, 1), + + ('remove_assignments_to_parameters.rq', 'removedAssignment', 'addedDeclarator', [], + ['methodName','parameterName','originalMethod','modifiedMethod'], + (['assign'], ['dtor_'], []), None, None, False, 0), + + ('remove_control_flag.rq', 'assign', 'introducedControl', [], + ['methodName','flagName','originalMethod','modifiedMethod'], + (['dtor'], [], [('assign','break_or_continue_')]), None, None, False, 0), + + ('method_visibility_increased.rq', 'originalMethod', 'modifiedMethod', [], + ['originalMethodName','modifiedMethodName'], ([], [], [('meth','meth_')]), None, None, False, 0), + + ('method_visibility_decreased.rq', 'originalMethod', 'modifiedMethod', [], + ['originalMethodName','modifiedMethodName'], ([], [], [('meth','meth_')]), None, None, False, 0), + + ('replace_constructor_with_factory_method.rq', 'removedCtor', 'factoryMethod', [], + ['className','methodName','originalClass','modifiedClass'], (['ctor'], [], []), None, None, False, 0), + + ('replace_exception_with_test.rq', 'try', 'addedIf', [], + ['methodName','originalMethod','modifiedMethod'], (['try'], ['if_'], []), None, None, False, 0), + + ('replace_magic_number_with_symbolic_constant.rq', 'const', 'ident_', [], + ['constantName','methodName','originalMethod','modifiedMethod'], + ([], [], [('const','ident_')]), None, None, False, 0), + + ('replace_nested_conditional_with_guard_clauses.rq', 'removedIf', 'addedIf', [], + ['methodName','originalMethod','modifiedMethod'], (['if0'], ['if_'], []), None, None, False, 0), + + ('replace_parameter_with_method.rq', 'originalInvocation', 'modifiedInvocation', [], + ['methodName','originalMethod','modifiedMethod'], (['arg'], [], []), None, None, False, 0), + + ('replace_temp_with_query.rq', 'originalDeclarator', 'addedMethod', [], + ['variableName','addedMethodName','methodName','originalMethod','modifiedMethod'], + (['decl'], ['qmeth_'], [('v','invoke_')]), None, None, False, 0), + + ('separate_query_from_modifier.rq', 'originalMethod', 'addedMethod1', [], + ['methodName','addedMethodName1','addedMethodName2','originalMethod','addedMethod1','addedMethod2'], + (['meth0'], ['meth1_','meth2_'], []), None, None, False, 0), + + ('rename_package.rq', 'originalPackage', 'modifiedPackage', [], + ['originalPackageName','modifiedPackageName'], ([], [], [('pdecl','pdecl_')]), None, None, False, 0), + + ('rename_class.rq', 'originalClass', 'modifiedClass', [], + ['originalClassName','modifiedClassName'], ([], [], [('class','class_')]), None, None, False, 0), + + ('rename_field.rq', 'originalField', 'modifiedField', [], + ['originalFieldName','modifiedFieldName'], ([], [], 
[('vdtor','vdtor_')]), None, None, False, 0), + + ('rename_method.rq', 'originalMethod', 'modifiedMethod', [], + ['originalMethodName','modifiedMethodName'], ([], [], [('meth','meth_')]), None, None, False, 0), + + ('move_class.rq', 'originalClass', 'modifiedClass', [], + ['originalClassName','modifiedClassName'], ([], [], [('class','class_')]), None, None, False, 0), + + ('move_field.rq', 'originalField', 'movedField', + [('originalField','modifiedContext'),('originalContext','movedField')], + ['fieldTypeName','fieldName','fromClassName','toClassName','fromModifiers','toModifiers','fromClass','toClass'], + (['vdtor'], ['vdtor_'], []), ('vdtor','vdtor_'), (False, False), False, 1), + + ('local_move_field.rq', 'originalField', 'movedField', + [('originalField','modifiedContext'),('originalContext','movedField')], + ['fieldTypeName','fieldName','fromClassName','toClassName','fromModifiers','toModifiers','fromClass','toClass'], + ([], [], [('vdtor','vdtor_')]), ('vdtor','vdtor_'), (True, True), False, 1), + + ('move_method.rq', 'originalMethod', 'movedMethod', + [('originalMethod','context_'),('context','movedMethod')], + ['signature','methodName','fromClassName','toClassName','fromClass','toClass'], + (['meth'], ['meth_'], []), ('meth','meth_'), (False, False), False, 1), + + ('local_move_method.rq', 'originalMethod', 'movedMethod', [], + ['signature','methodName','fromClassName','toClassName','fromClass','toClass'], + ([], [], [('meth','meth_')]), ('meth','meth_'), (True, True), False, 1), + ], + 'c' : + [ # ('add_macro_parameter.rq', 'e', 'param_', [], ['name_'], ([], [], []), None, None, False, 0), + # ('remove_macro_parameter.rq', 'param', 'e_', [], ['name'], ([], [], []), None, None, False, 0), + # ('add_parameter.rq', 'e', 'param_', [], ['name_'], ([], [], []), None, None, False, 0), + # ('remove_parameter.rq', 'param', 'e_', [], ['name'], ([], [], []), None, None, False, 0), + # ('change_bidirectional_association_to_unidirectional.rq', 'mem0', 'struct0_', [] , ['sname0_'], ([], [], []), None, None, False, 0), + # ('change_unidirectional_association_to_bidirectional.rq', 'struct0', 'mem0_', [] , ['sname0'], ([], [], []), None, None, False, 0), + # ('consolidate_conditional_expression.rq', 'if0', 'if0_', [] , ['fname1_'], ([], [], []), None, None, False, 0), + # ('decompose_conditional.rq', 'if', 'func_', [], ['fname_'], ([], [], []), None, None, False, 0), + # ('extract_function.rq', 'ent0', 'func_', [], ['fname_'], ([], [], []), None, None, False, 0), + # ('extract_structure.rq', 'spec0', 'spec1_', [], ['name0', 'name1_'], ([], [], []), None, None, False, 0), + # ('inline_function.rq', 'func', 'ent0', [], ['fname'], ([], [], []), None, None, False, 0), + # ('inline_structure.rq', 'spec1', 'spec0_', [], ['name1', 'name0_'], ([], [], []), None, None, False, 0), + + # ('extract_variable.rq', 'originalExpr', 'movedExpr', + # [('originalContext', 'modifiedContext'),('originalExpr','extractedVariable')], + # ['extractedVariableName','functionName'], ([], ['decl_','v_'], [('f','f_'),('a','rhs_')]), + # None, None, False, 1), + + # ('inline_temp.rq', 'originalExpr', 'movedExpr', + # [('originalContext','modifiedContext'),('eliminatedVariable','movedExpr')], + # ['eliminatedVariableName', 'functionName'], (['decl', 'v'], [], [('f', 'f_'),('rhs','a_')]), + # None, None, False, 1), + + # ('introduce_parameter_structure.rq', 'func', 'func_', [], ['name_ty0', 'name_ty1', 'name_struct_'], ([], [], []), None, None, False, 0), + # ('move_member.rq', 'mem0', 'mem0_', [], ['name0', 'name0_'], ([], [], 
[]), None, None, False, 0), + # ('parameterize_function.rq', 'func0', 'func_', [], [], ([], [], []), None, None, False, 0), + # ('preserve_whole_structure.rq', 'call', 'call_', [], [], ([], [], []), None, None, False, 0), + # ('remove_assignments_to_parameters.rq', 'lhs0', 'lhs1_', [], [], ([], [], []), None, None, False, 0), + # ('rename_function.rq', 'func', 'func_', [], ['name', 'name_'], ([], [], []), None, None, False, 0), + # ('rename_macro.rq', 'macro', 'macro_', [], ['name', 'name_'], ([], [], []), None, None, False, 0), + # ('replace_data_value_with_structure.rq', 'decl', 'decl_', [], ['sname'], ([], [], []), None, None, False, 0), + # ('replace_magic_number_with_symbolic_constant.rq', 'const', 'ident_', [], ['name_'], ([], [], []), None, None, False, 0), + # ('replace_nested_conditional_with_guard_clauses.rq', 'if0', 'if0_', [], [], ([], [], []), None, None, False, 0), + # ('replace_parameter_with_explicit_functions.rq', 'func', 'func0_', [], [], ([], [], []), None, None, False, 0), + # ('replace_temp_with_query.rq', 'decl', 'func_', [], [], ([], [], []), None, None, False, 0), + # ('split_temporary_variable.rq', 'decl0', 'decl1_', [], [], ([], [], []), None, None, False, 0), + ] + } + return queries + +QUERIES = get_queries(weak=False) + +def find(base_dir, proj_id, foutdir, outdir, pw, port, + limit=None, lang=None, method='odbc', change_enumeration=False, per_ver=False, + query_prec=False, conf=None): + + find_change_patterns.find(QUERY_DIR, QUERIES, PREDICATE_TBL, FactExtractor, + base_dir, proj_id, foutdir, outdir, pw, port, + limit, lang, method, change_enumeration, per_ver, query_prec, conf=conf) + +def main(): + find_change_patterns.main(QUERY_DIR, + QUERIES, + 'find refactorings', + predicate_tbl=PREDICATE_TBL, + extra_fact_extractor=FactExtractor) + +if __name__ == '__main__': + main() diff --git a/cca/scripts/fragment.py b/python/src/cca/ccautil/fragment.py similarity index 98% rename from cca/scripts/fragment.py rename to python/src/cca/ccautil/fragment.py index dd54079..dc4a1a0 100644 --- a/cca/scripts/fragment.py +++ b/python/src/cca/ccautil/fragment.py @@ -23,8 +23,6 @@ import hashlib import logging -import pathsetup - logger = logging.getLogger() @@ -68,13 +66,13 @@ def __pop_elem(self): n0 = int(n0s) if n1s: n1 = int(n1s) - + result = (n0, n1) self.__ptr = m.end() else: raise StopIteration - + logger.debug(result) return result @@ -97,7 +95,7 @@ def __next__(self): self.__cur += 1 return result - + def __iter__(self): self.reset() @@ -125,7 +123,7 @@ def from_list(elems): if c > 0: result += '-' + str(elems[-1]) - + return Fragment(result) from_list = staticmethod(from_list) diff --git a/python/src/cca/ccautil/java_token_diff.py b/python/src/cca/ccautil/java_token_diff.py new file mode 100644 index 0000000..a807912 --- /dev/null +++ b/python/src/cca/ccautil/java_token_diff.py @@ -0,0 +1,351 @@ +#!/usr/bin/env python3 + +''' + java_token_diff.py + + Copyright 2018-2019 Chiba Institute of Technology + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
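Each entry of the query tables built by get_queries() is a ten-field tuple whose layout is spelled out only in the inline comment above; an illustrative sketch of how the finder unpacks it, with the entry copied verbatim from the Java table:

    # Illustration of the tuple layout; `entry` is taken from the table above.
    entry = ('rename_method.rq', 'originalMethod', 'modifiedMethod', [],
             ['originalMethodName', 'modifiedMethodName'],
             ([], [], [('meth', 'meth_')]), None, None, False, 0)

    (q, var0, var1, extra, others, essential,
     inst_key, inst_key_is_one_to_one, is_complex, min_extra) = entry

    ess_rm, ess_ad, ess_mp = essential  # removal/addition/mapping variables
    assert q.endswith('.rq') and ess_mp == [('meth', 'meth_')]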
+''' + +__author__ = 'Masatomo Hashimoto ' + +import sys +import os +import logging + +import filecmp +from difflib import SequenceMatcher +from javalang import tokenizer + +logger = logging.getLogger() + +def is_src(f): + return f.endswith('.java') + +def get_tokens(path): + toks = [] + try: + with open(path, 'r') as f: + for tok in tokenizer.tokenize(f.read()): + toks.append(tok.value) + except Exception as e: + pass + + seq = [] + + while True: + try: + tok = toks.pop(0) + + if tok == '.': + try: + nxt = toks.pop(0) + r = '.' + nxt + if seq: + if seq[-1] not in (',','('): + seq[-1] += r + else: + seq.append(r) + else: + seq.append(r) + + except IndexError: + seq.append(tok) + + elif tok == ',': + try: + nxt = toks.pop(0) + if nxt in ('}', ';'): + seq.append(nxt) + else: + seq.append(tok) + seq.append(nxt) + + except IndexError: + seq.append(tok) + + else: + seq.append(tok) + + except IndexError: + break + + return seq + +def count_tokens(path): + c = len(get_tokens(path)) + return c + +def get_files(x): + l = [] + for (d, dns, ns) in os.walk(x): + for n in ns: + p = os.path.join(d, n) + if is_src(p): + l.append(p) + return l + +def get_pre_context(toks, i): + if i > 2: + pre = ' '.join(toks[i-3:i]) + elif i == 2: + pre = ' '.join(toks[i-2:i]) + elif i == 1: + pre = ' '.join(toks[i-1:i]) + else: + pre = ' '.join(toks[0:i]) + return pre + +def get_post_context(toks, i): + post = ' '.join(toks[i:i+5]) + return post + +def get_context(toks, i): + return (get_pre_context(toks, i), get_post_context(toks, i)) + +def diff_to_str(d, toks1, toks2): + dels = d['delete'] + repls = d['replace'] + inss = d['insert'] + + lines = [] + + if dels: + for ((a, b), _) in dels: + pre, post = get_pre_context(toks1, a), get_post_context(toks1, b) + lines.append('[DELETE] {}-{} ({}):\n'.format(a, b-1, b-a)) + lines.append(' {}\n'.format(pre)) + lines.append('- ') + lines.append(' '.join(toks1[a:b])) + lines.append('\n') + lines.append(' {}\n'.format(post)) + if repls: + for ((a, b), (a2, b2)) in repls: + pre, post = get_pre_context(toks1, a), get_post_context(toks1, b) + lines.append('[REPLACE] {}-{} -> {}-{} ({}->{}):\n'.format(a, b-1, a2, b2-1, + b-a, b2-a2)) + lines.append(' {}\n'.format(pre)) + lines.append('- ') + lines.append(' '.join(toks1[a:b])) + lines.append('\n-----\n') + lines.append('+ ') + lines.append(' '.join(toks2[a2:b2])) + lines.append('\n') + lines.append(' {}\n'.format(post)) + if inss: + for ((i, _), (a, b)) in inss: + pre, post = get_context(toks1, i) + lines.append('[INSERT] {} -> {}-{} ({}):\n'.format(i, a, b-1, b-a)) + lines.append(' {}\n'.format(pre)) + lines.append('+ ') + lines.append(' '.join(toks2[a:b])) + lines.append('\n') + lines.append(' {}\n'.format(post)) + + s = ''.join(lines) + + return s + +def print_diff(d, toks1, toks2): + print(diff_to_str(d, toks1, toks2)) + +def size_of_diff(d): + sz = 0 + for ((i1, i2), _) in d['delete']: + sz += i2 - i1 + + for ((i1, i2), (j1, j2)) in d['replace']: + sz += i2 - i1 + j2 - j1 + + for (_, (j1, j2)) in d['insert']: + sz += j2 - j1 + + return sz + +def diff_tokens(toks1, toks2): + m = SequenceMatcher(isjunk=None, a=toks1, b=toks2) + d = {'replace':[],'delete':[],'insert':[]} + for (tag, i1, i2, j1, j2) in m.get_opcodes(): + if tag != 'equal': + d[tag].append(((i1, i2), (j1, j2))) + d['sim'] = m.ratio() + nm = 0 + for nt in m.get_matching_blocks(): + nm += nt.size + d['nmatches'] = nm + return d + +def is_equivalent_file(path1, path2): + if filecmp.cmp(path1, path2, shallow=False): + logger.info('same files') + return True + + 
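diff_tokens() above reduces a difflib.SequenceMatcher opcode stream to a small dictionary, and size_of_diff() totals the affected spans; a self-contained sketch of that flow, with literal token lists standing in for get_tokens() output:

    # Literal token lists stand in for get_tokens() output.
    from difflib import SequenceMatcher

    toks1 = ['int', 'x', '=', '0', ';']
    toks2 = ['int', 'x', '=', '1', ';', 'x', '++', ';']

    d = {'replace': [], 'delete': [], 'insert': []}
    m = SequenceMatcher(isjunk=None, a=toks1, b=toks2)
    for (tag, i1, i2, j1, j2) in m.get_opcodes():
        if tag != 'equal':
            d[tag].append(((i1, i2), (j1, j2)))

    # one replacement ('0' -> '1') and one insertion ('x ++ ;')
    print(d['replace'], d['insert'], m.ratio())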
logger.info('comparing {} with {}'.format(path1, path2)) + + toks1 = get_tokens(path1) + toks2 = get_tokens(path2) + b = toks1 == toks2 + return b + +def all_different(paths): + n = len(paths) + for i in range(n-1): + for j in range(i+1, n): + if filecmp.cmp(paths[i], paths[j], shallow=False): + logger.info('same files: {} {}'.format(paths[i], paths[j])) + return False + + toks_list = [None for _ in paths] + + for i in range(n-1): + for j in range(i+1, n): + if toks_list[i] == None: + toks_list[i] = get_tokens(paths[i]) + if toks_list[j] == None: + toks_list[j] = get_tokens(paths[j]) + if toks_list[i] == toks_list[j]: + logger.info('equivalent files: {} {}'.format(paths[i], paths[j])) + return False + + return True + +def compare_files(path1, path2, simple=False): + if filecmp.cmp(path1, path2, shallow=False): + logger.info('same files') + return {'count':0,'diff':'','sim':1.0} + elif simple: + logger.info('different files') + return {} + + logger.info('comparing {} with {}'.format(path1, path2)) + + toks1 = get_tokens(path1) + toks2 = get_tokens(path2) + d = diff_tokens(toks1, toks2) + c = size_of_diff(d) + s = diff_to_str(d, toks1, toks2) + sim = d['sim'] + nm = d['nmatches'] + dist = float(c) / (float(nm) if nm > 0 else 1.0) + ret = {'count':c,'diff':s,'sim':sim,'dist':dist} + return ret + +def compare_dirs(d1, d2, simple=False): + #print('comparing {} with {}'.format(d1, d2)) + + dcmp = filecmp.dircmp(d1, d2) + removed_files = [] + added_files = [] + modified_files = [] + + removed_dirs = [] + added_dirs = [] + + def scan(dc): + for f in dc.left_only: + p = os.path.join(dc.left, f) + if is_src(f): + removed_files.append(p) + + elif os.path.isdir(p): + removed_dirs.append(p) + + for f in dc.right_only: + p = os.path.join(dc.right, f) + if is_src(f): + added_files.append(p) + + elif os.path.isdir(p): + added_dirs.append(p) + + for f in dc.diff_files: + if is_src(f): + p1 = os.path.join(dc.left, f) + p2 = os.path.join(dc.right, f) + modified_files.append((p1, p2)) + + for subd in dc.subdirs.values(): + scan(subd) + + scan(dcmp) + + count = 0 + + for f in removed_files: + count += count_tokens(f) + + for f in added_files: + count += count_tokens(f) + + for d in removed_dirs: + for f in get_files(d): + count += count_tokens(f) + + for d in added_dirs: + for f in get_files(d): + count += count_tokens(f) + + for (f1, f2) in modified_files: + r = compare_files(f1, f2, simple=simple) + if r: + count += r['count'] + + return count + + +def main(): + from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter + + parser = ArgumentParser(description='compute size of token sequence delta', + formatter_class=ArgumentDefaultsHelpFormatter) + + parser.add_argument('path1', type=str) + parser.add_argument('path2', type=str) + + parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', + help='enable verbose printing') + + parser.add_argument('-d', '--debug', dest='debug', action='store_true', + help='enable debug printing') + + parser.add_argument('-s', '--simple', dest='simple', action='store_true', + help='only checks if file1 is equivalent to file2') + + args = parser.parse_args() + + log_level = logging.WARNING + if args.verbose: + log_level = logging.INFO + if args.debug: + log_level = logging.DEBUG + logging.basicConfig(format='[%(levelname)s][%(funcName)s] %(message)s', level=log_level) + + c = None + + if os.path.isfile(args.path1) and os.path.isfile(args.path2): + r = compare_files(args.path1, args.path2, simple=args.simple) + if r: + d = r['diff'] + if d: + 
logger.debug('differences:\n{}'.format(d)) + c = r['count'] + + elif os.path.isdir(args.path1) and os.path.isdir(args.path2): + c = compare_dirs(args.path1, args.path2, simple=args.simple) + + print(c) + + +if __name__ == '__main__': + main() diff --git a/cca/scripts/load_into_virtuoso.py b/python/src/cca/ccautil/load_into_virtuoso.py old mode 100755 new mode 100644 similarity index 88% rename from cca/scripts/load_into_virtuoso.py rename to python/src/cca/ccautil/load_into_virtuoso.py index 2f8fa86..a8e98b3 --- a/cca/scripts/load_into_virtuoso.py +++ b/python/src/cca/ccautil/load_into_virtuoso.py @@ -4,7 +4,7 @@ ''' A fact loader - Copyright 2012-2020 Codinuum Software Lab + Copyright 2012-2021 Codinuum Software Lab Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,19 +21,20 @@ import os.path import sys +import logging -import pathsetup -from pathsetup import CCA_HOME, LOG_DIR +from .siteconf import CCA_HOME, LOG_DIR -import virtuoso -from virtuoso import (VTMP_DIR, - DB_DIR, - GRAPH_URI_BASE, - VIRTUOSO_PORT, - DEFAULT_MAX_FILES, - VIRTUOSO_PW) +from . import virtuoso +from .virtuoso import (VTMP_DIR, + DB_DIR, + GRAPH_URI_BASE, + VIRTUOSO_PORT, + DEFAULT_MAX_FILES, + VIRTUOSO_PW) +logger = logging.getLogger() def load(proj_id, db_dir, fact_dir, exts, port=VIRTUOSO_PORT, pw=VIRTUOSO_PW): graph_uri = GRAPH_URI_BASE+proj_id @@ -42,7 +43,7 @@ def load(proj_id, db_dir, fact_dir, exts, port=VIRTUOSO_PORT, pw=VIRTUOSO_PW): rc = loader.disable_checkpoint() if rc != 0: - loader.message('starting server...') + logger.info('starting server...') loader.start_server() loader.disable_checkpoint() @@ -51,7 +52,7 @@ def load(proj_id, db_dir, fact_dir, exts, port=VIRTUOSO_PORT, pw=VIRTUOSO_PW): return rc -if __name__ == '__main__': +def main(): from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter default_fdir = os.path.join(VTMP_DIR, '') @@ -97,12 +98,9 @@ def load(proj_id, db_dir, fact_dir, exts, port=VIRTUOSO_PORT, pw=VIRTUOSO_PW): def doit(): loader = virtuoso.Loader(args.dbdir, daemonize=args.daemon) - if args.debug: - loader.set_debug_flag() - rc = loader.disable_checkpoint() if rc != 0: - loader.message('starting server...') + logger.info('starting server...') loader.start_server() loader.disable_checkpoint() @@ -131,3 +129,6 @@ def doit(): else: doit() + +if __name__ == '__main__': + main() diff --git a/cca/scripts/load_ont_into_virtuoso.py b/python/src/cca/ccautil/load_ont_into_virtuoso.py similarity index 91% rename from cca/scripts/load_ont_into_virtuoso.py rename to python/src/cca/ccautil/load_ont_into_virtuoso.py index cda6dba..6d33b0c 100644 --- a/cca/scripts/load_ont_into_virtuoso.py +++ b/python/src/cca/ccautil/load_ont_into_virtuoso.py @@ -4,7 +4,7 @@ ''' An ontology loader - Copyright 2012-2020 Codinuum Software Lab + Copyright 2012-2021 Codinuum Software Lab Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ import os -import virtuoso +from . 
import virtuoso GRAPH_URI = 'http://codinuum.com/ont/cpi' RULE_NAME = 'ont.cpi' @@ -43,5 +43,8 @@ def load(db_dir, fact_dir, rc = loader.checkpoint() return rc -if __name__ == '__main__': +def main(): load(virtuoso.DB_DIR, FACT_DIR) + +if __name__ == '__main__': + main() diff --git a/cca/scripts/materialize_fact.py b/python/src/cca/ccautil/materialize_fact.py similarity index 96% rename from cca/scripts/materialize_fact.py rename to python/src/cca/ccautil/materialize_fact.py index 35626cc..d5b284f 100644 --- a/cca/scripts/materialize_fact.py +++ b/python/src/cca/ccautil/materialize_fact.py @@ -24,13 +24,12 @@ import re import logging -import pathsetup -import project -import sparql -from ns import VER_NS, FB_NS -import virtuoso -from virtuoso import VIRTUOSO_PW, VIRTUOSO_PORT -from common import setup_logger +from . import project +from . import sparql +from .ns import VER_NS, FB_NS +from . import virtuoso +from .virtuoso import VIRTUOSO_PW, VIRTUOSO_PORT +from .common import setup_logger logger = logging.getLogger() diff --git a/python/src/cca/ccautil/materialize_fact_for_refactoring.py b/python/src/cca/ccautil/materialize_fact_for_refactoring.py new file mode 100644 index 0000000..602d519 --- /dev/null +++ b/python/src/cca/ccautil/materialize_fact_for_refactoring.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 + +import os.path + +from .materialize_fact import main, Materializer, VIRTUOSO_PORT, VIRTUOSO_PW +from .siteconf import CCA_HOME + +QUERY_DIR = os.path.join(CCA_HOME, 'queries', 'refactoring') + +JAVA_ITER_QUERIES = [ + 'materialize_resolved_name.rq', + 'materialize_resolved_facc.rq', + + 'materialize_reftype_of_this.rq', + 'materialize_reftype_of_ivk.rq', + 'materialize_reftype_of_new.rq', + 'materialize_reftype_of_expr0.rq', + 'materialize_reftype_of_expr1.rq', + 'materialize_reftype_of_expr2.rq', + 'materialize_reftype_of_field_access.rq', + + 'materialize_type_of_this.rq', + 'materialize_type_of_uop.rq', + 'materialize_type_of_bop.rq', + 'materialize_type_of_array_access.rq', + 'materialize_type_of_ivk.rq', + 'materialize_type_of_new.rq', + 'materialize_type_of_field_access.rq', + + #'materialize_simple_ivk0.rq', + 'materialize_simple_ivkx.rq', + + #'materialize_primary_ivk.rq', + 'materialize_primary_ivk0.rq', + 'materialize_primary_ivkx.rq', + + 'materialize_type_ivk0.rq', + 'materialize_type_ivkx.rq', + + 'materialize_new_ivkx0.rq', + 'materialize_new_ivkx1.rq', + 'materialize_new_ivkx2.rq', + + 'materialize_super_ivkx.rq', + 'materialize_this_ivkx.rq', +] + +QUERIES = { 'java' : + [ 'materialize_tdecl_in_srctree.rq', + 'materialize_file_mapping.rq', + 'materialize_stmt_level0.rq', + 'materialize_stmt_level.rq', + + 'materialize_pruned_tdecl.rq', + 'materialize_pruned_super_type.rq', + 'materialize_pruned_method.rq', + 'materialize_pruned_field.rq', + 'materialize_pruned_field_access.rq', + 'materialize_pruned_name.rq', + 'materialize_pruned_enum_const.rq', + 'materialize_pruned_invocation.rq', + 'materialize_pruned_import.rq', + 'materialize_pruned_param.rq', + + 'materialize_grafted_tdecl.rq', + 'materialize_grafted_super_type.rq', + 'materialize_grafted_method.rq', + 'materialize_grafted_field.rq', + 'materialize_grafted_field_access.rq', + 'materialize_grafted_name.rq', + 'materialize_grafted_enum_const.rq', + 'materialize_grafted_invocation.rq', + 'materialize_grafted_import.rq', + 'materialize_grafted_param.rq', + + 'materialize_resolved_reftype.rq', + 'materialize_resolved_tyvar.rq', + 'materialize_resolved_type_ivk_pe.rq', + 'materialize_resolved_type_ivk_ps.rq', + 
'materialize_resolved_type_ivk.rq',
+        'materialize_resolved_type_ivk_static.rq',
+
+        'materialize_class_hierarchy.rq',
+        'materialize_interface_hierarchy.rq',
+        'materialize_class_name_hierarchy.rq',
+
+        'materialize_resolved_enum_const.rq',
+        'materialize_resolved_facc0.rq',
+
+        'materialize_refers_to_decl.rq',
+
+        'materialize_tdecl_mapped_eq.rq',
+        'materialize_tdecl_modified.rq',
+        'materialize_stable_mapping.rq',
+
+        'materialize_return_reftype.rq',
+        'materialize_return_type.rq',
+
+        'materialize_reftype_of_new0.rq',
+
+        'materialize_declared_by_field0-0-0.rq',
+        'materialize_declared_by_field0-0-1.rq',
+        'materialize_declared_by_field0-0-2.rq',
+        'materialize_declared_by_field0-1.rq',
+        'materialize_declared_by_catch_param.rq',
+
+        'materialize_reftype_of_enum_const.rq',
+        'materialize_reftype_of_cast.rq',
+        'materialize_reftype_of_declared_var.rq',
+        'materialize_reftype_of_var_declared_by_param.rq',
+        'materialize_reftype_of_local_field_access.rq',
+
+        'materialize_type_of_enum_const.rq',
+        'materialize_type_of_cast.rq',
+        'materialize_type_of_literal.rq',
+        'materialize_type_of_declared_var.rq',
+        'materialize_type_of_var_declared_by_param.rq',
+        'materialize_type_of_local_field_access.rq',
+
+        'materialize_param_ty.rq',
+
+        'materialize_simple_ivk0.rq',
+        'materialize_super_ivk0.rq',
+        'materialize_this_ivk0.rq',
+        #'materialize_type_ivk.rq',
+        'materialize_new_ivk0.rq',
+    ]
+    + JAVA_ITER_QUERIES
+    + JAVA_ITER_QUERIES
+    + JAVA_ITER_QUERIES
+    + [
+        #'materialize_declared_by_field1.rq',
+        'materialize_declared_by_field2.rq',
+    ],
+    # 'c' :
+    # [ 'materialize_pruned_functions.rq',
+    #   'materialize_pruned_declarations.rq',
+    #   'materialize_grafted_functions.rq',
+    #   'materialize_grafted_declarations.rq',
+    #   'materialize_functions_in_srctree.rq',
+    #   'materialize_declarations_in_srctree.rq',
+    # ],
+    }
+
+def materialize(proj_id, pw=VIRTUOSO_PW, port=VIRTUOSO_PORT, conf=None):
+    m = Materializer(QUERY_DIR, QUERIES, proj_id, pw=pw, port=port, conf=conf)
+    rc = m.materialize()
+    return rc
+
+
+def main():
+    from . import materialize_fact  # the `main` imported above is shadowed by this function
+    materialize_fact.main(QUERY_DIR, QUERIES, 'materialize facts for refactoring')
+
+if __name__ == '__main__':
+    main()
diff --git a/cca/scripts/ns.py b/python/src/cca/ccautil/ns.py
similarity index 100%
rename from cca/scripts/ns.py
rename to python/src/cca/ccautil/ns.py
diff --git a/python/src/cca/ccautil/patchast.py b/python/src/cca/ccautil/patchast.py
new file mode 100644
index 0000000..7656621
--- /dev/null
+++ b/python/src/cca/ccautil/patchast.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+
+'''
+  patchast.py
+
+  Copyright 2018 Chiba Institute of Technology
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+'''
+
+__author__ = 'Masatomo Hashimoto '
+
+import logging
+
+from . 
import proc +from .diffts import patchast_cmd +from .common import setup_logger + +logger = logging.getLogger() + +def patchast(src_path, delta_path, quiet=True): + cmd = patchast_cmd + cmd += ' %s %s' % (src_path, delta_path) + stat = proc.system(cmd, quiet=quiet) + return stat + + +def main(): + from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter + + parser = ArgumentParser(description='apply AST delta', + formatter_class=ArgumentDefaultsHelpFormatter) + + parser.add_argument('src_path', type=str, help='source directory') + parser.add_argument('delta_path', type=str, help='delta bundle') + + parser.add_argument('-d', '--debug', dest='debug', action='store_true', help='enable debug printing') + parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help='enable verbose printing') + + args = parser.parse_args() + + log_level = logging.WARNING + if args.verbose: + log_level = logging.INFO + if args.debug: + log_level = logging.DEBUG + setup_logger(logger, log_level) + + proc.logger = logger + + patchast(args.src_path, args.delta_path, quiet=(not args.debug)) + +if __name__ == '__main__': + main() diff --git a/python/src/cca/ccautil/plain_patch.py b/python/src/cca/ccautil/plain_patch.py new file mode 100644 index 0000000..163f71a --- /dev/null +++ b/python/src/cca/ccautil/plain_patch.py @@ -0,0 +1,396 @@ +#!/usr/bin/env python3 + +''' + plain_patch.py + + Copyright 2018-2020 Chiba Institute of Technology + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
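A short sketch of driving the materializer defined in materialize_fact_for_refactoring.py above; 'myproj' is a placeholder for a project whose factbase has already been loaded, and the password/port fall back to the module defaults:

    # Hypothetical call; 'myproj' is a placeholder project id.
    from cca.ccautil.materialize_fact_for_refactoring import materialize

    rc = materialize('myproj')  # uses the VIRTUOSO_PW/VIRTUOSO_PORT defaults
    print('materialization finished with rc=%s' % rc)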
+'''
+
+__author__ = 'Masatomo Hashimoto '
+
+import os
+import sys
+import re
+import filecmp
+import difflib
+import time
+import random
+import logging
+
+from .common import setup_logger
+
+logger = logging.getLogger()
+
+EXTS = ['.java','.jj','.jjt','.properties']
+
+HEAD_PAT0 = re.compile(r'^--- (?P<path>[A-Za-z0-9._/-]+).*$')
+HEAD_PAT1 = re.compile(r'^\+\+\+ (?P<path>[A-Za-z0-9._/-]+).*$')
+HUNK_HEAD_PAT = re.compile(r'^@@ -(?P<sl0>[0-9]+),(?P<c0>[0-9]+) \+(?P<sl1>[0-9]+),(?P<c1>[0-9]+) @@$')
+
+
+def get_head0(s):
+    h = None
+    m = HEAD_PAT0.match(s)
+    if m:
+        path = m.group('path')
+        h = (s, path)
+    return h
+
+def get_head1(s):
+    h = None
+    m = HEAD_PAT1.match(s)
+    if m:
+        path = m.group('path')
+        h = (s, path)
+    return h
+
+def get_hunk_head(s):
+    rs = None
+    m = HUNK_HEAD_PAT.match(s)
+    if m:
+        sl0 = int(m.group('sl0'))
+        c0 = int(m.group('c0'))
+        sl1 = int(m.group('sl1'))
+        c1 = int(m.group('c1'))
+        rs = (s, (sl0, c0, sl1, c1))
+    return rs
+
+def is_src(f):
+    return any([f.endswith(ext) for ext in EXTS])
+
+class IdGenerator(object):
+    def __init__(self):
+        self._count = 0
+
+    def gen(self):
+        i = self._count
+        self._count += 1
+        return i
+
+class Hunk(object):
+    def __init__(self, head_ranges):
+        head, ranges = head_ranges
+        self.head = head
+        self.ranges = ranges
+        self._lines = []
+
+    def __str__(self):
+        return '<hunk %s,%s -> %s,%s>' % self.ranges  # format string reconstructed; the original was garbled
+
+    def add_line(self, l):
+        self._lines.append(l)
+
+    def dump(self, out):
+        out.write(self.head)
+        for l in self._lines:
+            out.write(l)
+
+class Header(object):
+    def __init__(self, head1_path1, head2_path2):
+        head1, path1 = head1_path1
+        head2, path2 = head2_path2
+        self.head1 = head1
+        self.head2 = head2
+        self.path1 = path1
+        self.path2 = path2
+
+    def __str__(self):
+        return '<header %s>' % self.path1  # format string reconstructed; the original was garbled
+
+    def dump(self, out):
+        out.write(self.head1)
+        out.write(self.head2)
+
+
+class Patch(object):
+    def __init__(self, dpath1, dpath2, filt=None, shuffle=0):
+        self._idgen = IdGenerator()
+        self._hunk_tbl = {} # hid -> hunk
+        self._header_tbl = {} # hunk -> header
+        self._dpath1 = dpath1
+        self._dpath2 = dpath2
+
+        self._filt = lambda x: True
+        if filt:
+            self._filt = filt
+
+        self.compare_dirs(dpath1, dpath2)
+
+        if shuffle:
+            permutation = list(range(len(self)))  # random.shuffle needs a mutable sequence
+            for i in range(shuffle):
+                random.shuffle(permutation)
+            logger.info('permutation=%s' % permutation)
+            tbl = {}
+            for (hid, hunk) in self._hunk_tbl.items():
+                tbl[permutation[hid]] = hunk
+            self._hunk_tbl = tbl
+
+    def __len__(self):
+        return len(self._hunk_tbl.keys())
+
+    def __str__(self):
+        return '<patch: %d hunks>' % len(self._hunk_tbl.keys())  # format string reconstructed; the original was garbled
+
+    def get_hunk_ids(self):
+        return self._hunk_tbl.keys()
+
+    def dump(self, hids=None, out=sys.stdout):
+        tbl = {} # header -> hunk list
+
+        if hids is None:
+            hids = self._hunk_tbl.keys()
+
+        for hid in hids:
+            hunk = self._hunk_tbl[hid]
+            header = self._header_tbl[hunk]
+            try:
+                l = tbl[header]
+            except KeyError:
+                l = []
+                tbl[header] = l
+            l.append(hunk)
+
+        for (header, hunks) in tbl.items():
+            header.dump(out)
+            for hunk in sorted(hunks, key=lambda h: h.ranges):
+                hunk.dump(out)
+
+    def gen_id(self):
+        return self._idgen.gen()
+
+    def get_hunk(self, hid):
+        return self._hunk_tbl[hid]
+
+    def get_header(self, hunk):
+        return self._header_tbl[hunk]
+
+    def reg_hunk(self, header, hunk):
+        hid = self.gen_id()
+        self._hunk_tbl[hid] = hunk
+        self._header_tbl[hunk] = header
+
+    def compare_dirs(self, d1, d2):
+        logger.info('comparing %s with %s' % (d1, d2))
+        dcmp = filecmp.dircmp(d1, d2)
+        removed_files = []
+        added_files = []
+        modified_files = []
+
+        removed_dirs = []
+        added_dirs = []
+
+        
def scan(dc): + for f in dc.left_only: + p = os.path.join(dc.left, f) + if is_src(f): + if self._filt(p): + logger.debug('R %s' % p) + removed_files.append(p) + + elif os.path.isdir(p): + logger.debug('R %s' % p) + removed_dirs.append(p) + + for f in dc.right_only: + p = os.path.join(dc.right, f) + if is_src(f): + if self._filt(p): + logger.debug('A %s' % p) + added_files.append(p) + + elif os.path.isdir(p): + logger.debug('A %s' % p) + added_dirs.append(p) + + for f in dc.diff_files: + if is_src(f): + p1 = os.path.join(dc.left, f) + p2 = os.path.join(dc.right, f) + if self._filt(p1) and self._filt(p2): + logger.debug('M %s' % p1) + modified_files.append((p1, p2)) + + for subd in dc.subdirs.values(): + scan(subd) + + scan(dcmp) + + for f1 in removed_files: + self.compare_files(f1, None) + + for f2 in added_files: + self.compare_files(None, f2) + + for d1 in removed_dirs: + self.scan_files(d1, self.reg_file_del_patch) + + for d2 in added_dirs: + self.scan_files(d2, self.reg_file_ins_patch) + + for (f1, f2) in modified_files: + self.compare_files(f1, f2) + + def scan_files(self, x, f): + for (d, dns, ns) in os.walk(x): + for n in ns: + p = os.path.join(d, n) + if self._filt(p): + f(p) + + def reg_file_del_patch(self, path): + date = time.ctime()#time.ctime(os.stat(path).st_mtime) + with open(path, 'U') as f: + lines = f.readlines() + count = len(lines) + p = os.path.relpath(path, self._dpath1) + header = Header(('--- %s %s\n' % (p, date), path), + ('+++ /dev/null %s\n' % date, '/dev/null')) + hunk = Hunk(('@@ -1,%d +0,0 @@\n' % count, (1, count, 0, 0))) + last_line = None + for line in lines: + last_line = line + hunk.add_line('-'+line) + + if last_line and not last_line.endswith('\n'): + hunk.add_line('\n\\ No newline at end of file\n') + + hid = self.gen_id() + self._hunk_tbl[hid] = hunk + self._header_tbl[hunk] = header + + def reg_file_ins_patch(self, path): + date = time.ctime()#time.ctime(os.stat(path).st_mtime) + with open(path, 'U') as f: + lines = f.readlines() + count = len(lines) + p = os.path.relpath(path, self._dpath2) + header = Header(('--- /dev/null %s\n' % date, '/dev/null'), + ('+++ %s %s\n' % (p, date), path)) + hunk = Hunk(('@@ -0,0 +1,%d @@\n' % count, (0, 0, 1, count))) + last_line = None + for line in lines: + last_line = line + hunk.add_line('+'+line) + + if last_line and not last_line.endswith('\n'): + hunk.add_line('\n\\ No newline at end of file\n') + + hid = self.gen_id() + self._hunk_tbl[hid] = hunk + self._header_tbl[hunk] = header + + def compare_files(self, file1, file2): + logger.info('comparing %s with %s' % (file1, file2)) + + if file1 and file2 == None: + self.reg_file_del_patch(file1) + + elif file1 == None and file2: + self.reg_file_ins_patch(file2) + + elif file1 and file2: + date1 = time.ctime()#time.ctime(os.stat(file1).st_mtime) + date2 = time.ctime()#time.ctime(os.stat(file2).st_mtime) + + with open(file1, 'U') as f1: + lines1 = f1.readlines() + + with open(file2, 'U') as f2: + lines2 = f2.readlines() + + p1 = os.path.relpath(file1, self._dpath1) + p2 = os.path.relpath(file2, self._dpath2) + + dls = difflib.unified_diff(lines1, lines2, p1, p2, date1, date2) + + head0 = None + head1 = None + hunk_head = None + + header = None + hunk = None + + for dl in dls: + if head0 == None: + head0 = get_head0(dl) + + if head1 == None: + head1 = get_head1(dl) + + hunk_head = get_hunk_head(dl) + + if head0: + logger.debug('HEAD0:%s' % (head0,)) + if head1: + logger.debug('HEAD1:%s' % (head1,)) + if hunk_head: + logger.debug('HUNK_HEAD:%s' % (hunk_head,)) + + if 
hunk and not hunk_head: + if not dl.endswith('\n'): + dl += '\n\\ No newline at end of file\n' + hunk.add_line(dl) + + if head0 and head1: + header = Header(head0, head1) + head0 = None + head1 = None + + if header and hunk_head: + hunk = Hunk(hunk_head) + hunk_head = None + self.reg_hunk(header, hunk) + + logger.debug(dl) + + logger.debug(header) + + + +def main(): + from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter + + parser = ArgumentParser(description='decompose patch', + formatter_class=ArgumentDefaultsHelpFormatter) + + parser.add_argument('-d', '--debug', dest='debug', action='store_true', + help='enable debug printing') + parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', + help='enable verbose printing') + + parser.add_argument('dir1', type=str, help='base directory') + parser.add_argument('dir2', type=str, help='modified directory') + + args = parser.parse_args() + + log_level = logging.WARNING + if args.verbose: + log_level = logging.INFO + if args.debug: + log_level = logging.DEBUG + setup_logger(logger, log_level) + + patch = Patch(args.dir1, args.dir2) + + hids = patch.get_hunk_ids() + print('%d hunks generated' % (len(hids))) + + for hid in hids: + print('*** Hunk ID: %s' % hid) + patch.dump([hid]) + +if __name__ == '__main__': + main() diff --git a/cca/scripts/proc.py b/python/src/cca/ccautil/proc.py similarity index 98% rename from cca/scripts/proc.py rename to python/src/cca/ccautil/proc.py index 73c744a..7edfe4f 100644 --- a/cca/scripts/proc.py +++ b/python/src/cca/ccautil/proc.py @@ -24,8 +24,7 @@ import subprocess import logging -import pathsetup -from common import setup_logger +from .common import setup_logger logger = logging.getLogger() diff --git a/cca/scripts/project.py b/python/src/cca/ccautil/project.py similarity index 95% rename from cca/scripts/project.py rename to python/src/cca/ccautil/project.py index dd595ca..497bc59 100644 --- a/cca/scripts/project.py +++ b/python/src/cca/ccautil/project.py @@ -22,13 +22,13 @@ import os import logging -import pathsetup +from .siteconf import CONFIGS_DIR logger = logging.getLogger() def get_confs(): - _confs = filter(lambda x: x.endswith('.py'), os.listdir(pathsetup.CONFIGS_DIR)) + _confs = filter(lambda x: x.endswith('.py'), os.listdir(CONFIGS_DIR)) confs = [os.path.splitext(x)[0] for x in _confs] confs.sort() return confs diff --git a/cca/scripts/run_workers.py b/python/src/cca/ccautil/run_workers.py similarity index 99% rename from cca/scripts/run_workers.py rename to python/src/cca/ccautil/run_workers.py index eac6673..61f5968 100644 --- a/cca/scripts/run_workers.py +++ b/python/src/cca/ccautil/run_workers.py @@ -27,8 +27,7 @@ import time import logging -import pathsetup -from common import setup_logger +from .common import setup_logger logger = logging.getLogger() diff --git a/python/src/cca/ccautil/sim.py b/python/src/cca/ccautil/sim.py new file mode 100644 index 0000000..d7ea80f --- /dev/null +++ b/python/src/cca/ccautil/sim.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + + +''' + sim.py + + Copyright 2012-2021 Codinuum Software Lab + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+  See the License for the specific language governing permissions and
+  limitations under the License.
+'''
+
+import filecmp
+import difflib
+
+from . import java_token_diff as java
+
+
+def line_sim(f1, f2):
+    if filecmp.cmp(f1, f2):
+        return 1.0
+    lines1 = open(f1).readlines()
+    lines2 = open(f2).readlines()
+    matcher = difflib.SequenceMatcher(None, lines1, lines2)
+    similarity = matcher.quick_ratio()
+    return similarity
+
+
+def java_sim(f1, f2):
+    if filecmp.cmp(f1, f2):
+        return 1.0
+    toks1 = java.get_tokens(f1)
+    toks2 = java.get_tokens(f2)
+    matcher = difflib.SequenceMatcher(isjunk=None, a=toks1, b=toks2)
+    similarity = matcher.quick_ratio()
+    return similarity
+
+
+def sim(f1, f2, plain=False):
+    similarity = 1.0
+    if not filecmp.cmp(f1, f2):
+        if plain:
+            similarity = line_sim(f1, f2)
+        elif java.is_src(f1) and java.is_src(f2):
+            similarity = java_sim(f1, f2)
+        else:
+            similarity = line_sim(f1, f2)
+
+    return similarity
+
+
+def main():
+    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+
+    parser = ArgumentParser(description='compute similarity between files',
+                            formatter_class=ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument('path1', type=str)
+    parser.add_argument('path2', type=str)
+
+    parser.add_argument('--plain', dest='plain', action='store_true',
+                        help='perform language agnostic differencing')
+
+    args = parser.parse_args()
+
+    try:
+        s = sim(args.path1, args.path2, plain=args.plain)
+        print(s)
+    except IOError as e:
+        print('ERROR: {}'.format(str(e)))
+
+if __name__ == '__main__':
+    main()
diff --git a/cca/scripts/siteconf.py b/python/src/cca/ccautil/siteconf.py
similarity index 67%
rename from cca/scripts/siteconf.py
rename to python/src/cca/ccautil/siteconf.py
index e1f2b93..99809e3 100644
--- a/cca/scripts/siteconf.py
+++ b/python/src/cca/ccautil/siteconf.py
@@ -3,13 +3,24 @@
 import sys
 import os
 
-#CCA_HOME = '/opt/cca'
-CCA_HOME = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
+CCA_HOME = os.getenv('CCA_HOME', '/opt/cca')
 
-VIRTUOSO_DIR = '/opt/virtuoso'
+LOG_DIR = os.getenv('CCA_LOG_DIR', '/var/log/cca')
+
+PROJECTS_DIR_NAME = 'projects'
+
+_PROJECTS_DIR = os.path.join(CCA_HOME, PROJECTS_DIR_NAME)
+PROJECTS_DIR = os.getenv('CCA_PROJECTS_DIR', _PROJECTS_DIR)
+
+_CONFIGS_DIR = os.path.join(CCA_HOME, 'configs')
+CONFIGS_DIR = os.getenv('CCA_CONFIGS_DIR', _CONFIGS_DIR)
+
+GIT_REPO_BASE = ''
 
 #
+VIRTUOSO_DIR = '/opt/virtuoso'
+
 SPARQL_ENDPOINT = 'http://localhost:8890/sparql'
 
 VIRTUOSO_DSN = 'local-virtuoso'
@@ -17,11 +28,4 @@
 VIRTUOSO_HOST = 'localhost'
 VIRTUOSO_PORT = 1111
 VIRTUOSO_USER = 'dba'
-VIRTUOSO_PW = 'xxx'
-
-GIT_REPO_BASE = ''
-
-PROJECTS_DIR_NAME = 'projects'
-
-_PROJECTS_DIR = os.path.join(CCA_HOME, PROJECTS_DIR_NAME)
-PROJECTS_DIR = os.getenv('CCA_PROJECTS_DIR', _PROJECTS_DIR)
+VIRTUOSO_PW = 'cca'
diff --git a/python/src/cca/ccautil/sparql.py b/python/src/cca/ccautil/sparql.py
new file mode 100644
index 0000000..dfe81c0
--- /dev/null
+++ b/python/src/cca/ccautil/sparql.py
@@ -0,0 +1,290 @@
+#!/usr/bin/env python3
+
+
+'''
+  A SPARQL driver
+
+  Copyright 2012-2020 Codinuum Software Lab
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +''' + +# Fortran namespaces added by Masatomo Hashimoto + +import logging + +from .siteconf import SPARQL_ENDPOINT +from .virtuoso import ODBCDriver, VIRTUOSO_PW, VIRTUOSO_PORT, get_odbc_connect_string +from . import ns +from cca.factutil.const import ENTITY_NS, VARIANT_NS, SVNREV_NS, GITREV_NS, RELEASE_NS +from .common import setup_logger + +logger = logging.getLogger() + + +NAMESPACES = { 'xsd' : ns.XSD_NS, + 'owl' : ns.OWL_NS, + 'rdf' : ns.RDF_NS, + 'fb' : ns.FB_NS, + 'src' : ns.SRC_NS, + 'ver' : ns.VER_NS, + 'chg' : ns.CHG_NS, + 'git' : ns.GIT_NS, + + 'ent' : ENTITY_NS, + 'variant' : VARIANT_NS, + 'svnrev' : SVNREV_NS, + 'gitrev' : GITREV_NS, + 'rel' : RELEASE_NS, + + 'f' : ns.F_NS, + 'pa' : ns.PA_NS, + 'fjpa' : ns.FJPA_NS, + 'fpt' : ns.FPT_NS, + + 'fjpadata' : ns.PREFIX_TBL['fjpadata'], + 'entpair' : ns.PREFIX_TBL['entpair'], + 'chgpat' : ns.PREFIX_TBL['chgpat'], + 'chginst' : ns.PREFIX_TBL['chginst'], + } + + + +def get_localname(s): + res = s + if s: + try: + if s.startswith('http://'): + res = (s.split('/'))[-1].split('#')[-1] + except Exception as e: + logger.warning(str(e)) + + return res + + + +class Driver(object): + def __init__(self): + self._ns_tbl = {} + for (n, p) in NAMESPACES.items(): + self._ns_tbl[p] = n + + def to_prefixed_form(self, v): + r = v + if v: + try: + for p in self._ns_tbl.keys(): + if str(v).startswith(p): + r = '%s:%s' % (self._ns_tbl[p], v[len(p):]) + break + except Exception as e: + logger.warning('"%s": %s' % (v, e)) + + return r + + + def execute(self, q): + pass + + def query(self, q, abbrev=False): + return None + + def fetchone(self, q, abbrev=False): + return None + + +class VirtuosoODBCDriver(ODBCDriver, Driver): + def __init__(self, pw=VIRTUOSO_PW, port=VIRTUOSO_PORT): + connect_string = get_odbc_connect_string(pwd=pw, port=port) + ODBCDriver.__init__(self, connect_string) + Driver.__init__(self) + + def conv_row(self, row, abbrev=False): + if row and abbrev: + for (k, v) in row.items(): + row[k] = self.to_prefixed_form(v) + + return row + + def query(self, q, abbrev=False): + #logger.debug('query:\n{}'.format(q)) + for qvs, row in ODBCDriver.query(self, 'SPARQL\n'+q): + yield qvs, self.conv_row(row, abbrev) + + def execute(self, q): + ODBCDriver.execute(self, 'SPARQL\n'+q) + + def fetchone(self, q, abbrev=False): + r = ODBCDriver.fetchone(self, 'SPARQL\n'+q) + if r: + r = self.conv_row(r, abbrev) + return r + + + +class VirtuosoHTTPDriver(Driver): + def __init__(self, endpoint=SPARQL_ENDPOINT): + self._endpoint = endpoint + + def conv_binding(self, b, abbrev=False): + d = {} + for k in b.keys(): + data = b[k] + v = str(data['value']) + ty = data['type'] + if ty == 'typed-literal': + dty = self.to_prefixed_form(data['datatype']) + logger.debug('%s (%s)' % (v, dty)) + if dty == 'xsd:decimal': + v = float(v) + elif dty == 'xsd:integer': + v = int(v) + + if abbrev: + if ty == 'uri': + v = self.to_prefixed_form(v) + + d[k] = v + return d + + def _exec(self, q, limit=-1): + import json + from urllib.parse import urlencode + from urllib.request import Request, urlopen + + format = 'application/json' + + if limit < 0: + maxrows = '' + else: + maxrows = 
str(limit)
+
+        params = {
+            'query'   : q,
+            'format'  : format,
+            'maxrows' : maxrows,
+        }
+
+        qpart = urlencode(params)
+
+        req = Request(self._endpoint, qpart.encode('utf-8'))
+
+        response = urlopen(req).read()
+
+        result = json.loads(response)
+
+        return result
+
+    def execute(self, q):
+        self._exec(q)
+
+    def fetchone(self, q, abbrev=False):
+        row = None
+        try:
+            r = self._exec(q, limit=1)
+            b = r['results']['bindings'][0]
+            row = self.conv_binding(b, abbrev)
+        except Exception:
+            pass
+
+        return row
+
+    def query(self, q, abbrev=False, limit=-1):
+        result = self._exec(q, limit)
+        for b in result['results']['bindings']:
+            qvs = [str(v) for v in result['head']['vars']]
+            yield qvs, self.conv_binding(b, abbrev)
+
+
+
+
+def get_driver(method='http', pw=VIRTUOSO_PW, port=VIRTUOSO_PORT):
+    driver = None
+    if method == 'http':
+        driver = VirtuosoHTTPDriver()
+    elif method == 'odbc':
+        driver = VirtuosoODBCDriver(pw=pw, port=port)
+    else:
+        logger.error('unknown method: "%s"' % method)
+    return driver
+
+
+def main():
+    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+
+    parser = ArgumentParser(description='Execute SPARQL Query',
+                            formatter_class=ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument('query_file', type=str, help='query file')
+
+    parser.add_argument('-d', '--debug', dest='debug', action='store_true', help='enable debug printing')
+
+    parser.add_argument('--port', dest='port', default=VIRTUOSO_PORT,
+                        metavar='PORT', type=int, help='set port number')
+
+    parser.add_argument('--pw', dest='pw', metavar='PASSWORD', default=VIRTUOSO_PW,
+                        help='set password to access DB')
+
+    parser.add_argument('-m', '--method', dest='method', default='odbc',
+                        metavar='METHOD', type=str, help='execute query via METHOD (http|odbc)')
+
+
+    args = parser.parse_args()
+
+    log_level = logging.INFO
+    if args.debug:
+        log_level = logging.DEBUG
+    setup_logger(logger, log_level)
+
+    qfile = args.query_file
+
+    logger.info('method: "%s"' % args.method)
+    logger.info('query: "%s"' % qfile)
+
+
+    driver = get_driver(args.method, pw=args.pw, port=args.port)
+
+    count = 0
+
+    try:
+        f = open(qfile, 'r')
+        q = f.read()
+        f.close()
+
+        for vs, r in driver.query(q, abbrev=True):
+            row = []
+            for v in vs:
+                row.append(' %s="%s"' % (v, r[v]))
+            print('* row[%d]' % count)
+            print('\n'.join(row))
+            count += 1
+
+    except Exception as e:
+        #logger.error(str(e))
+        raise
+
+    print('%d rows' % count)
+
+
+def test():
+    #sparql = VirtuosoODBCDriver()
+    sparql = VirtuosoHTTPDriver()
+
+    q = 'DEFINE input:inference "ont.cpi" SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 10'
+
+    for r in sparql.query(q):
+        print(r)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cca/scripts/srcdiff.py b/python/src/cca/ccautil/srcdiff.py
old mode 100755
new mode 100644
similarity index 93%
rename from cca/scripts/srcdiff.py
rename to python/src/cca/ccautil/srcdiff.py
index a265b4a..5b65563
--- a/cca/scripts/srcdiff.py
+++ b/python/src/cca/ccautil/srcdiff.py
@@ -27,23 +27,16 @@
 import csv
 import xml.parsers.expat as expat
 import random
-import json
 import logging
 
-import pathsetup
-import diffts
-import diffinfo
-import sim
-import proc
-from factextractor import Enc, HashAlgo, compute_hash
-from common import setup_logger, normpath
+from . 
import diffts, diffinfo, sim, proc +from .factextractor import Enc, HashAlgo, compute_hash +from .common import setup_logger ##### logger = logging.getLogger() -STATS_JSON = 'stats.json' - AUXFILE_EXTS = ['.jj', '.jjt', '.properties'] dirdiff_fact_file_name = 'fact.nt' @@ -90,10 +83,10 @@ def read_info(info_paths): return i['nnodes'] -def count_nodes(files, cache_dir_base=None, - load_fact=False, - fact_dir=None, - fact_versions=[], +def count_nodes(files, cache_dir_base=None, + load_fact=False, + fact_dir=None, + fact_versions=[], fact_proj='', fact_proj_roots=[], fact_for_ast=False, @@ -172,11 +165,12 @@ def count_nodes(files, cache_dir_base=None, if not load_fact: incomplete_opt = ' -incompleteinfo' - cmd = '{}{} -parseonly{}{} {}'.format(diffts.diffast_cmd, incomplete_opt, cache_opt, fact_opt, normpath(f)) + cmd = '{}{} -parseonly{}{} {}'.format(diffts.diffast_cmd, incomplete_opt, cache_opt, fact_opt, f) logger.info('cmd="{}"'.format(cmd)) - with proc.PopenContext(cmd, stderr=None) as p: + pc = proc.PopenContext(cmd) + with pc as p: (stdout_data, stderr_data) = p.communicate() for line in stdout_data.split('\n'): m = nodes_pat.search(line) @@ -190,6 +184,7 @@ def count_nodes(files, cache_dir_base=None, except: logger.warning('not an integer: "{}"'.format(g[0])) + return c get_cache_dir1_by_diffts = diffts.diffast_get_cache_dir1 @@ -211,7 +206,7 @@ def read_stat2(fname, roots=[]): try: f = open(fname) reader = csv.reader(f) - + for row in reader: logger.debug('row={}'.format(row)) if len(roots) > 1: @@ -257,7 +252,7 @@ def read_stat_except_first(fname, root=None): try: f = open(fname) reader = csv.reader(f) - + for row in reader: logger.debug('row={}'.format(row)) @@ -283,7 +278,7 @@ def read_stat_except_last(fname, root=None): try: f = open(fname) reader = csv.reader(f) - + for row in reader: logger.debug('row={}'.format(row)) @@ -432,7 +427,6 @@ def get_info(dir1, dir2, usecache=True, cache_dir_base=None, 'moved' : moved, 'copied' : copied, 'glued' : glued, - 'cache_dir' : cache_dir, } return result @@ -468,7 +462,7 @@ def start_element(name, attrs): xmlparser.StartElementHandler = start_element xmlparser.ParseFile(f) f.close() - + except NotNull: return False @@ -479,7 +473,7 @@ def start_element(name, attrs): return b - + def has_AST(f): b = False for astml_ext in astml_exts: @@ -526,7 +520,6 @@ def filter_pairs(pairs, ignore1=[], ignore2=[], def diff_dirs(diff, dir1, dir2, usecache=True, cache_dir_base=None, include=[], - exclude=[], ignore1=[], ignore2=[], load_fact=False, fact_dir=None, @@ -562,12 +555,6 @@ def diff_dirs(diff, dir1, dir2, usecache=True, cache_dir_base=None, if include: filt = lambda x: any(x.startswith(p) for p in include) - if exclude: - if include: - filt = lambda x: any(x.startswith(p) for p in include) and all(not x.startswith(p) for p in exclude) - else: - filt = lambda x: all(not x.startswith(p) for p in exclude) - logger.info('"{}" - "{}" cache_dir_base="{}"'.format(dir1, dir2, cache_dir_base)) cost = 0 @@ -575,11 +562,7 @@ def diff_dirs(diff, dir1, dir2, usecache=True, cache_dir_base=None, nnodes = 0 nnodes1 = 0 nnodes2 = 0 - ndeletes = 0 - ninserts = 0 nrelabels = 0 - nmovrels = 0 - nmoves = 0 line_sim_sum = 0.0 line_sim_count = 0 @@ -604,16 +587,6 @@ def diff_dirs(diff, dir1, dir2, usecache=True, cache_dir_base=None, logger.info('"{}" - "{}" get_info finished'.format(dir1, dir2)) - stats_json = os.path.join(info['cache_dir'], STATS_JSON) - - if os.path.exists(stats_json): - logger.info('cached stats found: "{}"'.format(stats_json)) - try: - with 
open(stats_json) as f: - return json.load(f) - except Exception as e: - logger.warning('{}'.format(str(e))) - get_rel1 = lambda x: x get_rel2 = lambda x: x if len(fact_proj_roots) == 2: @@ -764,8 +737,6 @@ def diff_dirs(diff, dir1, dir2, usecache=True, cache_dir_base=None, nmappings += nunmodified0 + nmoved0 + nrenamed0 cost += nadded + ncopied + nremoved + nglued - ndeletes += nremoved + nglued - ninserts += nadded + ncopied logger.info('nnodes={}, nmappings={}, cost={}'.format(nnodes, nmappings, cost)) @@ -799,8 +770,8 @@ def diff_dirs(diff, dir1, dir2, usecache=True, cache_dir_base=None, if line_sim: line_sim_sum += sim.line_sim(file1, file2) line_sim_count += 1 - - r = diff(file1, file2, + + r = diff(file1, file2, cache_dir_base=cache_dir_base, load_fact=load_fact, fact_dir=fact_dir, @@ -826,10 +797,8 @@ def diff_dirs(diff, dir1, dir2, usecache=True, cache_dir_base=None, ) c = r['cost'] - #c = r['ndeletes'] + r['ninserts'] + r['nrelabels'] + r['nmoves'] - m = r['nmappings'] - + logger.info('"{}" - "{}": CMR=({}/{})'.format(file1, file2, c, m)) @@ -861,13 +830,9 @@ def diff_dirs(diff, dir1, dir2, usecache=True, cache_dir_base=None, cost += c nmappings += m - ndeletes += r['ndeletes'] - ninserts += r['ninserts'] nrelabels += r['nrelabels'] - nmovrels += r['nmovrels'] - nmoves += r['nmoves'] - + except Exception as e: logger.warning('{}'.format(str(e))) @@ -880,28 +845,18 @@ def diff_dirs(diff, dir1, dir2, usecache=True, cache_dir_base=None, logger.info('"{}" - "{}" --> {} comparisons ({} min.)'.format(dir1, dir2, ncomp, m)) - res = {'cost' : cost, - 'ncomparisons' : ncomp, - 'nmappings' : nmappings, - 'nnodes1' : nnodes1, - 'nnodes2' : nnodes2, - 'nnodes' : nnodes, - 'ndeletes' : ndeletes, - 'ninserts' : ninserts, + res = {'cost' : cost, + 'ncomparisons' : ncomp, + 'nmappings' : nmappings, + 'nnodes1' : nnodes1, + 'nnodes2' : nnodes2, + 'nnodes' : nnodes, 'nrelabels' : nrelabels, - 'nmovrels' : nmovrels, - 'nmoves' : nmoves, } if line_sim and line_sim_count > 0: res['line_sim'] = line_sim_sum / line_sim_count - try: - with open(stats_json, 'w') as f: - json.dump(res, f) - except Exception as e: - logger.warning('{}'.format(str(e))) - return res @@ -937,19 +892,19 @@ def test_diff_dirs(): if args.debug: log_level = logging.DEBUG setup_logger(logger, log_level) - + mode = args.mode logger.info('mode: "{}"'.format(mode)) - + diff = None if mode == 'diffast': diff = diffast else: logger.error('illegal mode: "{}"'.format(mode)) - + res = diff_dirs(diff, args.dir1, args.dir2, use_sim=args.use_sim, diff --git a/cca/scripts/tp.py b/python/src/cca/ccautil/tp.py similarity index 98% rename from cca/scripts/tp.py rename to python/src/cca/ccautil/tp.py index a81e3f0..777b99f 100644 --- a/cca/scripts/tp.py +++ b/python/src/cca/ccautil/tp.py @@ -2,7 +2,7 @@ ''' - A naive implementation of task pool + A naive implementation of task pool Copyright 2012-2020 Codinuum Software Lab @@ -29,8 +29,6 @@ import math import logging -import pathsetup - logger = logging.getLogger() ##### @@ -181,7 +179,7 @@ def lock_task(self, taskid): f = open(lock_path, 'w') try: fcntl.lockf(f, fcntl.LOCK_EX | fcntl.LOCK_NB) - self.__task_tbl[taskid] = f + self.__task_tbl[taskid] = f logger.debug('task set "%s" is locked' % taskid) except IOError as e: if e.errno == errno.EACCES or e.errno == errno.EAGAIN: @@ -209,7 +207,7 @@ def unlock_task(self, taskid): except IOError as e: f.close() logger.warning(str(e)) - + except KeyError: logger.warning('task set "%s" is not locked' % taskid) except Exception as e: @@ -227,7 +225,7 @@ 
def do_task(self, taskid): else: self.__LOCK_OK = False return - + self.__LOCK_OK = True self.__consecutive_lock_failure_count = 0 @@ -274,7 +272,7 @@ def do_task(self, taskid): logger.warning(str(e)) self.unlock_task(taskid) return - + def pick_up_task(self, wid=''): l = os.listdir(self.__task_dir) @@ -283,7 +281,7 @@ def pick_up_task(self, wid=''): w = '' if wid: w = ' ' % wid - + logger.info('%spicking up "%s"' % (w, t)) self.do_task(t) @@ -349,7 +347,7 @@ def watch_results(self): self.pick_up_results() logger.info('finished.') - + ##### def test(): diff --git a/cca/scripts/verdiff.py b/python/src/cca/ccautil/verdiff.py similarity index 96% rename from cca/scripts/verdiff.py rename to python/src/cca/ccautil/verdiff.py index 655e41e..1143693 100644 --- a/cca/scripts/verdiff.py +++ b/python/src/cca/ccautil/verdiff.py @@ -21,22 +21,15 @@ import os import re -import gzip +import gzip import tempfile import logging -import pathsetup -import tp -import diffts -import srcdiff -import project -import cca_options -import AST -from factloader import make_factbase_dir, scan_dir, FactMerger -from factloader import DEFAULT_TEMP_FILE_SIZE, DefaultFactLoader -from factextractor import Enc, HashAlgo -import cca_factextractor -from common import setup_logger +from . import tp, diffts, srcdiff, project, cca_options, AST, cca_factextractor +from .factloader import make_factbase_dir, scan_dir, FactMerger +from .factloader import DEFAULT_TEMP_FILE_SIZE, DefaultFactLoader +from .factextractor import Enc, HashAlgo +from .common import setup_logger logger = logging.getLogger() @@ -367,9 +360,9 @@ def compute(load_fact=False, if args.command == 'work': logger.info('worker id: "{}"'.format(args.wid)) - pool = TaskPool(args.basedir, working_dir, conf, False, - load_fact, fact_dir, ignore_unmodified, restrict_fact, - fact_for_changes, fact_for_mapping, fact_for_ast, + pool = TaskPool(args.basedir, working_dir, conf, False, + load_fact, fact_dir, ignore_unmodified, restrict_fact, + fact_for_changes, fact_for_mapping, fact_for_ast, fact_into_virtuoso, fact_into_directory, fact_size_thresh, fact_for_cfg, fact_encoding, fact_hash_algo, line_sim, #local_cache_name=args.wid, @@ -382,9 +375,9 @@ def compute(load_fact=False, pool.watch_tasks(args.wid) elif args.command == 'generate': - pool = TaskPool(args.basedir, working_dir, conf, True, - load_fact, fact_dir, ignore_unmodified, restrict_fact, - fact_for_changes, fact_for_mapping, fact_for_ast, + pool = TaskPool(args.basedir, working_dir, conf, True, + load_fact, fact_dir, ignore_unmodified, restrict_fact, + fact_for_changes, fact_for_mapping, fact_for_ast, fact_into_virtuoso, fact_into_directory, fact_size_thresh, fact_for_cfg, fact_encoding, fact_hash_algo, line_sim, @@ -402,9 +395,9 @@ def compute(load_fact=False, if load_fact: fact_dir = make_factbase_dir(args.basedir, proj_id) logger.info('fact dir: "{}"'.format(fact_dir)) - pool = TaskPool(args.basedir, working_dir, conf, False, - load_fact, fact_dir, ignore_unmodified, restrict_fact, - fact_for_changes, fact_for_mapping, fact_for_ast, + pool = TaskPool(args.basedir, working_dir, conf, False, + load_fact, fact_dir, ignore_unmodified, restrict_fact, + fact_for_changes, fact_for_mapping, fact_for_ast, fact_into_virtuoso, fact_into_directory, fact_size_thresh, fact_for_cfg, fact_encoding, fact_hash_algo, line_sim) diff --git a/cca/scripts/virtuoso.py b/python/src/cca/ccautil/virtuoso.py similarity index 92% rename from cca/scripts/virtuoso.py rename to python/src/cca/ccautil/virtuoso.py index 5f98645..4dc7fa3 100644 
--- a/cca/scripts/virtuoso.py +++ b/python/src/cca/ccautil/virtuoso.py @@ -3,7 +3,7 @@ ''' A Virtuoso driver - Copyright 2012-2020 Codinuum Software Lab + Copyright 2012-2021 Codinuum Software Lab Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,19 +26,19 @@ import sys import logging -import pathsetup -from pathsetup import LOG_DIR -import proc -from siteconf import (VIRTUOSO_HOST, - VIRTUOSO_PORT, - VIRTUOSO_USER, - VIRTUOSO_PW, - VIRTUOSO_DRIVER, - VIRTUOSO_DSN, - VIRTUOSO_DIR) -import ns -from run_workers import spawn, dump_log -from common import setup_logger +from . import proc + +from .siteconf import (VIRTUOSO_HOST, + VIRTUOSO_PORT, + VIRTUOSO_USER, + VIRTUOSO_PW, + VIRTUOSO_DRIVER, + VIRTUOSO_DSN, + VIRTUOSO_DIR, + LOG_DIR) +from . import ns +from .run_workers import spawn, dump_log +from .common import setup_logger logger = logging.getLogger() @@ -90,9 +90,9 @@ def __init__(self, connect_string=ODBC_CONNECT_STRING): except Exception as e: logger.warning(str(e)) logger.warning('using pypyodbc') - import pypyodbc as pyodbc + from . import pypyodbc as pyodbc pyodbc.lowercase = False - self._db = pyodbc.connect(connect_string, ansi=True, autocommit=True) + self._db = pyodbc.connect(connect_string.encode('utf-8'), ansi=True, autocommit=True) def conv_row(self, row): d = {} @@ -132,7 +132,7 @@ def exec_cmd(cmd): return proc.system(cmd, quiet=True) -def exec_cmd_n(cmd, n, logdir='.'): +def exec_cmd_n(cmd, n, logdir=os.curdir): logger.debug('cmd: "%s"' % cmd) ps = [] @@ -173,7 +173,7 @@ def __init__(self, prog_name = os.path.splitext(os.path.basename(sys.argv[0]))[0] logger_name = prog_name+'.'+__name__ - self.log_dir = None + self.log_dir = os.curdir if daemonize: self.log_dir = LOG_DIR @@ -241,7 +241,7 @@ def exec_cmd(self, _cmd): def exec_cmd_n(self, _cmd, n): cmd = '%s %s EXEC="%s"' % (self._isql_cmd, self._pw, _cmd) - rc = exec_cmd_n(cmd, n, logdir=LOG_DIR) + rc = exec_cmd_n(cmd, n, logdir=self.log_dir) time.sleep(1) return rc diff --git a/python/src/cca/factutil/__init__.py b/python/src/cca/factutil/__init__.py new file mode 100644 index 0000000..feef89d --- /dev/null +++ b/python/src/cca/factutil/__init__.py @@ -0,0 +1,2 @@ +__version__ = '0.1' +__all__ = ['common'] diff --git a/cca/factutils/python/factutils/common.py b/python/src/cca/factutil/common.py similarity index 100% rename from cca/factutils/python/factutils/common.py rename to python/src/cca/factutil/common.py diff --git a/cca/factutils/python/factutils/const.py b/python/src/cca/factutil/const.py similarity index 89% rename from cca/factutils/python/factutils/const.py rename to python/src/cca/factutil/const.py index a3973ac..1a8ff52 100644 --- a/cca/factutils/python/factutils/const.py +++ b/python/src/cca/factutil/const.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 ''' - Factutils: helper scripts for source code entities + Factutil: helper scripts for source code entities - Copyright 2012-2020 Codinuum Software Lab + Copyright 2012-2021 Codinuum Software Lab Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
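
A minimal usage sketch (not part of the patch series) of how the relocated modules are
imported after this refactoring, assuming the new package under python/ has been installed,
e.g. with `pip install ./python`; the install command, the endpoint contents, and the query
are illustrative assumptions, while the module and function names come from the files above.

    # Usage sketch only; get_driver() and ENTITY_NS are defined in this series.
    from cca.ccautil.sparql import get_driver    # was cca/scripts/sparql.py
    from cca.factutil.const import ENTITY_NS     # was cca/factutils/python/factutils/const.py

    driver = get_driver('http')  # VirtuosoHTTPDriver with the default SPARQL_ENDPOINT
    q = 'SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 3'
    for vs, row in driver.query(q, abbrev=True):  # yields (variable names, binding dict)
        print(row)
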
diff --git a/cca/factutils/python/factutils/entity.py b/python/src/cca/factutil/entity.py similarity index 94% rename from cca/factutils/python/factutils/entity.py rename to python/src/cca/factutil/entity.py index ea3536c..6a12f50 100644 --- a/cca/factutils/python/factutils/entity.py +++ b/python/src/cca/factutil/entity.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 ''' - Factutils: helper scripts for source code entities + Factutil: helper scripts for source code entities Copyright 2012-2020 Codinuum Software Lab @@ -18,8 +18,6 @@ limitations under the License. ''' -import RDF - from .const import ENTITY_NS, EXTERNAL_NS, SEP from .exn import Invalid_argument from .rdf import Resource @@ -109,7 +107,7 @@ def __init__(self, **args): pass try: - self._node = RDF.Node(uri_string=args['uri']) + self._node = Resource(uri=args['uri']).as_node() except KeyError: pass @@ -126,11 +124,11 @@ def __init__(self, **args): compos = [self._enc, self._file_id.encode(), self._range.encode()] self._local_name = SEP.join(compos) - uri = RDF.Uri(ENTITY_NS + self._local_name) - self._node = RDF.Node(uri) + uri = ENTITY_NS + self._local_name + self._node = Resource(uri=uri).as_node() elif self._node: - uri_str = str(self._node.uri) + uri_str = self.get_uri() if uri_str.startswith(ENTITY_NS): self._local_name = uri_str.replace(ENTITY_NS, '') @@ -170,9 +168,6 @@ def get_range(self): def get_file_id(self): return self._file_id - def get_uri(self): - return self._node.uri - def get_local_name(self): return self._local_name diff --git a/cca/factutils/python/factutils/exn.py b/python/src/cca/factutil/exn.py similarity index 85% rename from cca/factutils/python/factutils/exn.py rename to python/src/cca/factutil/exn.py index f5609c6..8c3b65d 100644 --- a/cca/factutils/python/factutils/exn.py +++ b/python/src/cca/factutil/exn.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 ''' - Factutils: helper scripts for source code entities + Factutil: helper scripts for source code entities - Copyright 2012-2020 Codinuum Software Lab + Copyright 2012-2021 Codinuum Software Lab Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/python/src/cca/factutil/fact.py b/python/src/cca/factutil/fact.py new file mode 100644 index 0000000..eced315 --- /dev/null +++ b/python/src/cca/factutil/fact.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 + +''' + Factutil: helper scripts for source code entities + + Copyright 2012-2021 Codinuum Software Lab + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+'''
+
+import logging
+
+from functools import reduce
+
+from .const import SPEC_NS, PREDICATE_NS, RELEASE_NS, SVNREV_NS, GITREV_NS, GUARD_NS
+from .rdf import Graph, Resource, Predicate
+
+
+logger = logging.getLogger()
+
+
+class Fact(Graph):
+
+    def create_release_version(self, rel):
+        return Resource(RELEASE_NS + rel)
+
+    def create_svn_revision(self, rev):
+        s = '%s%s' % (SVNREV_NS, rev)
+        return Resource(s)
+
+    def create_git_revision(self, rev):
+        s = '%s%s' % (GITREV_NS, rev)
+        return Resource(s)
+
+    def get_guard_pred(self, pred):
+        g_pred = None
+        try:
+            g_pred = self._g_pred_map[pred]
+
+        except KeyError:
+            g_pred = Predicate(GUARD_NS+'?orig='+pred.get_namespace(),
+                               pred.get_local_name())
+            self._g_pred_map[pred] = g_pred
+
+        return g_pred
+
+    def list_guards(self, s, p, o):
+        guards = []
+        gp = self.get_guard_pred(p)
+        q = self._create_statement(None, gp, s)
+        for stmt in self.find_statements(q):
+            g = Resource(node=stmt[0])  # find_statements yields (s, p, o) triples
+            if self.contains(g, gp, o):
+                guards.append(g)
+        return guards
+
+    def add(self, subj, pred, obj, attr=None, value=None):
+        self._add(subj, pred, obj)
+        if attr and value:
+            blk = None
+            guards = self.list_guards(subj, pred, obj)
+
+            if len(guards) == 0:
+                blk = Resource()
+                g_pred = self.get_guard_pred(pred)
+                self._add(blk, g_pred, subj)
+                self._add(blk, g_pred, obj)
+
+            else:
+                blk = guards[0]
+
+            if blk == None:
+                blk = Resource()
+
+            self._add(blk, attr, value)
+
+    def addStatement(self, stmt, attr=None, value=None):
+        self.add(stmt.subject, stmt.predicate, stmt.object, attr, value)
+
diff --git a/cca/factutils/python/factutils/fileid.py b/python/src/cca/factutil/fileid.py
similarity index 98%
rename from cca/factutils/python/factutils/fileid.py
rename to python/src/cca/factutil/fileid.py
index 12840da..29c5024 100644
--- a/cca/factutils/python/factutils/fileid.py
+++ b/python/src/cca/factutil/fileid.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 
 '''
-  Factutils: helper scripts for source code entities
+  Factutil: helper scripts for source code entities
 
-  Copyright 2012-2020 Codinuum Software Lab
+  Copyright 2012-2021 Codinuum Software Lab
 
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
diff --git a/cca/factutils/python/factutils/range.py b/python/src/cca/factutil/range.py
similarity index 99%
rename from cca/factutils/python/factutils/range.py
rename to python/src/cca/factutil/range.py
index 9b663a7..04dc75c 100644
--- a/cca/factutils/python/factutils/range.py
+++ b/python/src/cca/factutil/range.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 
 '''
-  Factutils: helper scripts for source code entities
+  Factutil: helper scripts for source code entities
 
-  Copyright 2012-2020 Codinuum Software Lab
+  Copyright 2012-2021 Codinuum Software Lab
 
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
diff --git a/python/src/cca/factutil/rdf.py b/python/src/cca/factutil/rdf.py
new file mode 100644
index 0000000..2374ba5
--- /dev/null
+++ b/python/src/cca/factutil/rdf.py
@@ -0,0 +1,363 @@
+#!/usr/bin/env python3
+
+'''
+  Factutil: helper scripts for source code entities
+
+  Copyright 2012-2021 Codinuum Software Lab
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+'''
+
+import os
+import gzip
+import tempfile
+from functools import reduce
+
+import rdflib
+from rdflib.namespace import XSD
+
+import logging
+
+logger = logging.getLogger()
+
+
+def uri_split(uri):
+    lname = uri.split('/')[-1].split('#')[-1]
+    ns = uri[:len(uri)-len(lname)]
+    return ns, lname
+
+
+class RDFNode(object):
+    def __init__(self, nd):
+        self._valid = True
+        self._node = nd
+
+    def __eq__(self, other):
+        res = False
+        if isinstance(other, RDFNode):
+            res = self._node == other._node
+
+        return res
+
+    def is_valid(self):
+        return self._valid
+
+    def as_node(self):
+        return self._node
+
+
+class Resource(RDFNode):
+    def __init__(self, uri=None, **args):
+        nd = args.get('node', None)
+        if nd != None:
+            RDFNode.__init__(self, nd)
+        else:
+            if uri != None:
+                try:
+                    RDFNode.__init__(self, rdflib.term.URIRef(uri))
+                except:
+                    logger.warning('uri="%s"(%s)' % (uri, str(type(uri))))
+                    raise
+            else:
+                RDFNode.__init__(self, rdflib.term.BNode()) # blank node
+
+    def __eq__(self, other):
+        res = False
+        if isinstance(other, Resource):
+            if isinstance(self._node, rdflib.term.URIRef) and isinstance(other._node, rdflib.term.URIRef):
+                res = self.get_uri() == other.get_uri()
+            else:
+                res = self._node == other._node
+
+        return res
+
+    def __lt__(self, other):
+        return str(self.get_uri()) < str(other.get_uri())
+
+    def __gt__(self, other):
+        return str(self.get_uri()) > str(other.get_uri())
+
+    def __le__(self, other):
+        return self.__eq__(other) or self.__lt__(other)
+
+    def __ge__(self, other):
+        return self.__eq__(other) or self.__gt__(other)
+
+    def __hash__(self):
+        return str(self.get_uri()).__hash__()
+
+
+    def __str__(self):
+        return '<%s>' % self.get_uri()
+
+    def get_uri(self):
+        return str(str(self.as_node()))
+
+    def get_namespace(self):
+        ns, ln = uri_split(self.get_uri())
+        return ns
+
+    def get_local_name(self):
+        ns, ln = uri_split(self.get_uri())
+        return ln
+
+
+class Literal(RDFNode):
+    def __init__(self, literal="", **args):
+        nd = args.get('node', None)
+        if nd != None:
+            RDFNode.__init__(self, nd)
+        else:
+            RDFNode.__init__(self, rdflib.Literal(literal, **args))
+
+    def __eq__(self, other):
+        res = False
+        if isinstance(other, Literal):
+            res = self._node.eq(other._node)
+        return res
+
+    def __str__(self):
+        return '"%s"' % self.get_content()
+
+    def get_content(self):
+        return self._node.value
+
+
+def make_literal(x):
+    lit = None
+    if isinstance(x, bool):
+        lit = Literal(literal=str(x).lower(), datatype=XSD.boolean)
+    elif isinstance(x, int):
+        if x >= 0:
+            lit = Literal(literal=str(x), datatype=XSD.nonNegativeInteger)
+        else:
+            lit = Literal(literal=str(x), datatype=XSD.integer)
+    elif isinstance(x, float):
+        lit = Literal(literal=str(x), datatype=XSD.double)
+    # elif isinstance(x, str):
+    #     lit = Literal(literal=x.encode('utf-8'))
+    else:
+        lit = Literal(literal=str(x))
+
+    return lit
+
+
+
+class Predicate(Resource):
+    def __init__(self, ns=None, lname=None, **args):
+        self._lname = None
+        self._ns = None
+
+        uri = None
+        node = args.get('node', None)
+
+        if ns == None or lname == None:
+            uri = args.get('uri', None)
+
+            if uri == None:
+                if node != None:
+                    if isinstance(node, 
rdflib.term.URIRef):
+                        uri = str(str(node))
+            if uri != None:
+                self._ns, self._lname = uri_split(uri)
+
+        else:
+            self._ns = ns
+            self._lname = lname
+            uri = ns + lname
+
+        Resource.__init__(self, uri, **args)
+
+    def __str__(self):
+        return '<%s>' % self.get_uri()
+
+    def get_namespace(self):
+        return self._ns
+
+    def get_local_name(self):
+        return self._lname
+
+
+class Statement(object):
+    def __init__(self, subject=None, predicate=None, object=None, **args):
+        try:
+            stmt = args['statement']
+            self.subject = stmt.subject
+            self.predicate = stmt.predicate
+            self.object = stmt.object
+            self._stmt = stmt._stmt
+
+        except KeyError:
+            self.subject = subject
+            self.predicate = predicate
+            self.object = object
+            s = None
+            p = None
+            o = None
+            if isinstance(subject, Resource):
+                s = subject.as_node()
+            if isinstance(predicate, Predicate):
+                p = predicate.as_node()
+            if isinstance(object, RDFNode):
+                o = object.as_node()
+
+            self._stmt = (s, p, o)
+
+
+    def __eq__(self, other):
+        res = False
+        if isinstance(other, Statement):
+            res = reduce(lambda x,y: x and y, [self.subject == other.subject,
+                                               self.predicate == other.predicate,
+                                               self.object == other.object])
+        return res
+
+
+
+class Graph(object):
+    def __init__(self, ns_tbl, large=False):
+
+        if large:
+            self._model = rdflib.graph.Graph('Sleepycat')
+        else:
+            self._model = rdflib.graph.Graph('IOMemory')
+
+        self._g_pred_map = {}
+        self._pred_tbl = {}
+
+        self.l_true = Literal('true')
+        self.l_false = Literal('false')
+
+        self.namespace_tbl = ns_tbl
+
+
+    def set_namespace(self, prefix, uri):
+        self.namespace_tbl[prefix] = uri
+
+    def contains(self, s, p, o):
+        stmt = self._create_statement(s, p, o)
+        return (stmt in self._model)
+
+    def find_statements(self, t):
+        return self._model.triples(t)
+
+    def size(self):
+        return len(self._model)
+
+    def _add(self, subj, pred, obj):
+        self._model.add((subj.as_node(), pred.as_node(), obj.as_node()))
+
+    def _create_statement(self, subj, pred, obj):
+        s = None
+        p = None
+        o = None
+        if subj:
+            s = subj.as_node()
+        if pred:
+            p = pred.as_node()
+        if obj:
+            o = obj.as_node()
+        return (s, p, o)
+
+    def _guess_fmt(self, path):
+        fmt = ''
+        if path.endswith('.nt'):
+            fmt = 'nt'
+        elif path.endswith('.ttl'):
+            fmt = 'turtle'
+        elif path.endswith('.rdf'):
+            fmt = 'xml'
+        if path.endswith('.nt.gz'):
+            fmt = 'nt'
+        elif path.endswith('.ttl.gz'):
+            fmt = 'turtle'
+        elif path.endswith('.rdf.gz'):
+            fmt = 'xml'
+        return fmt
+
+    def _mktemp(self):
+        (fd, path) = tempfile.mkstemp()
+        os.close(fd)
+        return path
+
+    def _gzipped(self, path):
+        return path.endswith('.gz')
+
+    def _gzip(self, from_file, to_file):
+        f_from = open(from_file, 'rb')
+        f_to = gzip.open(to_file, 'wb')
+        f_to.writelines(f_from)
+        f_to.close()
+        f_from.close()
+
+    def _gunzip(self, from_file, to_file):
+        f_from = gzip.open(from_file, 'rb')
+        f_to = open(to_file, 'wb')
+        f_to.writelines(f_from)
+        f_to.close()
+        f_from.close()
+
+    def write(self, path, fmt='', base_uri=None):
+        if fmt == '':
+            fmt = self._guess_fmt(path)
+
+        gzipped_path = None
+
+        if self._gzipped(path):
+            gzipped_path = path
+            tmp = self._mktemp()
+            path = tmp
+
+        for (prefix, uri) in self.namespace_tbl.items():
+            self._model.bind(prefix, uri)
+
+        logger.info('writing to "%s"...' % path)
+        d = os.path.dirname(path)
+        if d != '' and not os.path.exists(d):
+            logger.warning('No such directory: "%s"' % d)
+            logger.info('creating "%s"...' 
% d)
+            os.makedirs(d)
+        self._model.serialize(path, format=fmt, base=base_uri)
+        logger.info('done.')
+
+        if gzipped_path:
+            self._gzip(path, gzipped_path)
+            os.unlink(path)
+
+    def read(self, path, fmt='', base_uri=None):
+        if fmt == '':
+            fmt = self._guess_fmt(path)
+
+        gzipped = False
+
+        if self._gzipped(path):
+            gzipped = True
+            tmp = self._mktemp()
+            self._gunzip(path, tmp)
+            path = tmp
+
+        logger.info('reading "%s"...' % path)
+        self._model.parse(location=path, format=fmt, base=base_uri)
+        logger.info('done.')
+
+        if gzipped:
+            os.unlink(tmp)
+
+
+    def query(self, qstr, base_uri=None):
+        results = self._model.query(qstr, initNs=self.namespace_tbl)
+        return results
+
+
+if __name__ == '__main__':
+    pass

From e76aa75395a238189a2c904afd556eb0a484dc56 Mon Sep 17 00:00:00 2001
From: codinuum
Date: Sun, 30 May 2021 20:13:36 +0900
Subject: [PATCH 03/11] minor

---
 README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/README.md b/README.md
index 9ab37bc..9374e60 100644
--- a/README.md
+++ b/README.md
@@ -52,8 +52,23 @@ You can run both Diff/AST and DiffViewer by the following line.
 
     $ ./cca.py diffast -c results --view samples/java/0/Test.java samples/java/1/Test.java
 
+## Installing parsers and Diff/AST
+
+### Requirements
+
+* [OCaml](http://ocaml.org/) (>=4.11.1)
+* [OPAM](https://opam.ocaml.org/) (for installing camlzip, cryptokit, csv, git-unix, menhir, ocamlnet, pxp, ulex, uuidm, and volt.)
+
+### Installation
+
+The following will install `parsesrc` and `diffast`.
+
+    $ opam install cca
+
 ## Building parsers and Diff/AST
 
+You can also build the parsers and Diff/AST on your own.
+
 ### Requirements
 
 * GNU make

From 7d93c53d9d1c54bca658550a81a53e07b57b71bf Mon Sep 17 00:00:00 2001
From: codinuum
Date: Sun, 30 May 2021 20:15:33 +0900
Subject: [PATCH 04/11] oops

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9374e60..016e63a 100644
--- a/README.md
+++ b/README.md
@@ -57,7 +57,7 @@ You can run both Diff/AST and DiffViewer by the following line.
 ### Requirements
 
 * [OCaml](http://ocaml.org/) (>=4.11.1)
-* [OPAM](https://opam.ocaml.org/) (for installing camlzip, cryptokit, csv, git-unix, menhir, ocamlnet, pxp, ulex, uuidm, and volt.)
+* [OPAM](https://opam.ocaml.org/) ### Installation From cc0b572bd23dcd7d8903ed322f78e9ddce942726 Mon Sep 17 00:00:00 2001 From: codinuum Date: Tue, 1 Jun 2021 11:36:16 +0900 Subject: [PATCH 05/11] [java parser] fix: avoiding exponential increase of the number of calls --- src/ast/analyzing/langs/java/java_tree.ml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ast/analyzing/langs/java/java_tree.ml b/src/ast/analyzing/langs/java/java_tree.ml index f56ea3c..dc61bd5 100644 --- a/src/ast/analyzing/langs/java/java_tree.ml +++ b/src/ast/analyzing/langs/java/java_tree.ml @@ -1281,7 +1281,7 @@ class translator options = let bid_gen = new BID.generator in object (self) | Ast.Pparen expr -> let e_nd = self#of_expression expr in let tid = self#mktid e_nd in - self#mknode (L.Primary (L.Primary.Paren tid)) [self#of_expression expr] + self#mknode (L.Primary (L.Primary.Paren tid)) [e_nd] | Ast.PclassInstanceCreation class_instance_creation -> self#of_class_instance_creation class_instance_creation @@ -1616,7 +1616,7 @@ class translator options = let bid_gen = new BID.generator in object (self) (*let tid = L.null_tid in*) let lab = L.Statement (L.Statement.If tid) in self#mknode lab - [self#of_expression e; + [e_; self#of_statement ~block_context:"if" s] (* order sensitive s -> e *) | Ast.SifThenElse(e, s1, s2) -> @@ -1625,7 +1625,7 @@ class translator options = let bid_gen = new BID.generator in object (self) (*let tid = L.null_tid in*) let lab = L.Statement (L.Statement.If tid) in self#mknode lab - [self#of_expression e; + [e_; self#of_statement ~block_context:"if" s1; self#of_statement ~block_context:"if" s2] (* order sensitive s2 -> s1 -> e *) From 64e35fe12fe5b3facd24299988db62a03ff7b85f Mon Sep 17 00:00:00 2001 From: codinuum Date: Tue, 1 Jun 2021 18:57:19 +0900 Subject: [PATCH 06/11] fix --- src/ast/analyzing/common/edit_base.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ast/analyzing/common/edit_base.ml b/src/ast/analyzing/common/edit_base.ml index e88a9e3..63fd137 100644 --- a/src/ast/analyzing/common/edit_base.ml +++ b/src/ast/analyzing/common/edit_base.ml @@ -1229,7 +1229,7 @@ class ['node_t, 'tree_t] seq_base options = object (self : 'edits) let segs_to_json idx ?(st=(-1)) ?(ed=(-1)) _segs = match _segs with - | [s, e] when st = e && ed = s -> "" + | [s, e] when st >= 0 && ed >= 0 && st <> ed && st = e && ed = s -> "" | _ -> let segs = List.filter (fun (s, e) -> s <= e) _segs in let seg_to_json (s, e) = sprintf "{\"start\":%d,\"end\":%d}" s e in From 7a9053542e59feae117f71916ed53fe373dc0edd Mon Sep 17 00:00:00 2001 From: codinuum Date: Tue, 1 Jun 2021 19:56:14 +0900 Subject: [PATCH 07/11] for increasing relabels --- src/ast/analyzing/common/edit_base.ml | 3 ++- src/ast/analyzing/engine/postprocessing.ml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ast/analyzing/common/edit_base.ml b/src/ast/analyzing/common/edit_base.ml index 63fd137..970fb8b 100644 --- a/src/ast/analyzing/common/edit_base.ml +++ b/src/ast/analyzing/common/edit_base.ml @@ -3905,9 +3905,10 @@ class ['node_t, 'tree_t] seq_base options = object (self : 'edits) raise Exit end ); + DEBUG_MSG "false"; false with - Exit -> true + Exit -> DEBUG_MSG "true"; true method dump_delta ?(extra_ns_decls=([] : (string * string) list)) diff --git a/src/ast/analyzing/engine/postprocessing.ml b/src/ast/analyzing/engine/postprocessing.ml index e7a2719..376a1b9 100644 --- a/src/ast/analyzing/engine/postprocessing.ml +++ b/src/ast/analyzing/engine/postprocessing.ml @@ 
-5949,6 +5949,7 @@ end; let glue_filt u1 u2 = let n1 = tree1#search_node_by_uid u1 in let n2 = tree2#search_node_by_uid u2 in + n1#data#relabel_allowed n2#data || not n1#data#is_named_orig && not n2#data#is_named_orig && n1#data#anonymized_label = n2#data#anonymized_label in From ea79d14ffe339bad16c37367801c529758ebce29 Mon Sep 17 00:00:00 2001 From: codinuum Date: Thu, 3 Jun 2021 17:32:12 +0900 Subject: [PATCH 08/11] fix and more --- src/ast/analyzing/common/UIDmapping.ml | 47 +++++++++++++++++++ src/ast/analyzing/common/edit_base.ml | 8 ++-- src/ast/analyzing/common/engine_options.ml | 5 ++ src/ast/analyzing/common/sourcecode.ml | 1 + src/ast/analyzing/common/spec.ml | 7 ++- src/ast/analyzing/diffast.ml | 2 + src/ast/analyzing/engine/analyzing.ml | 1 + src/ast/analyzing/engine/postprocessing.ml | 4 +- src/ast/analyzing/langs/astml/label.ml | 1 + src/ast/analyzing/langs/cpp/cpp_label.ml | 2 + .../langs/fortran/parsing/src/label.ml | 2 + src/ast/analyzing/langs/java/java_label.ml | 6 ++- src/ast/analyzing/langs/python/py_label.ml | 6 +++ src/ast/analyzing/langs/verilog/v_label.ml | 2 + 14 files changed, 84 insertions(+), 10 deletions(-) diff --git a/src/ast/analyzing/common/UIDmapping.ml b/src/ast/analyzing/common/UIDmapping.ml index 846849e..a95227f 100644 --- a/src/ast/analyzing/common/UIDmapping.ml +++ b/src/ast/analyzing/common/UIDmapping.ml @@ -868,6 +868,53 @@ class ['node_t] c cenv = object (self : 'self) with Sys_error s -> WARN_MSG s + method dump_json ?(comp=Comp.none) fname = + let tree1 = cenv#tree1 in + let tree2 = cenv#tree2 in + let _fprintf ch fmt = + Printf.ksprintf (fun s -> ignore (ch#output_ s 0 (String.length s))) fmt + in + let dump_node ch nd = + let loc = nd#data#src_loc in + let so = loc.Loc.start_offset in + let eo = loc.Loc.end_offset in + let sl = loc.Loc.start_line in + let el = loc.Loc.end_line in + let lab = nd#data#get_category in + _fprintf ch "{"; + _fprintf ch "\"label\":\"%s\"" lab; + _fprintf ch ",\"start_offset\":%d,\"end_offset\":%d,\"start_line\":%d,\"end_line\":%d" so eo sl el; + _fprintf ch "}"; + in + let dump_map ?(comma=false) ch map = + let comma_flag = ref comma in + Hashtbl.iter + (fun u1 u2 -> + let n1 = tree1#search_node_by_uid u1 in + let n2 = tree2#search_node_by_uid u2 in + + if !comma_flag then + _fprintf ch ","; + _fprintf ch "["; + dump_node ch n1; + _fprintf ch ","; + dump_node ch n2; + _fprintf ch "]"; + comma_flag := true; + + ) map; + !comma_flag + in + try + let ch = new Xchannel.out_channel ~comp (Xchannel.Destination.of_file fname) in + _fprintf ch "["; + let comma = dump_map ch map in + dump_map ~comma ch s_map; + _fprintf ch "]"; + ch#close + with + | Xchannel.Error s -> WARN_MSG s + method dump_with_info ?(comp=Comp.none) fname = let _fprintf ch fmt = diff --git a/src/ast/analyzing/common/edit_base.ml b/src/ast/analyzing/common/edit_base.ml index 970fb8b..b26cb6e 100644 --- a/src/ast/analyzing/common/edit_base.ml +++ b/src/ast/analyzing/common/edit_base.ml @@ -1228,13 +1228,10 @@ class ['node_t, 'tree_t] seq_base options = object (self : 'edits) method dump_diff_json_ch ?(line_align=[]) (tree1 : 'tree_t) (tree2 : 'tree_t) = let segs_to_json idx ?(st=(-1)) ?(ed=(-1)) _segs = - match _segs with - | [s, e] when st >= 0 && ed >= 0 && st <> ed && st = e && ed = s -> "" - | _ -> let segs = List.filter (fun (s, e) -> s <= e) _segs in let seg_to_json (s, e) = sprintf "{\"start\":%d,\"end\":%d}" s e in let extra = - if st >= 0 && ed >= 0 && st <= ed then + if st >= 0 && st <= ed then sprintf "\"start%d\":%d,\"end%d\":%d" idx st idx 
ed else "" @@ -1414,7 +1411,8 @@ class ['node_t, 'tree_t] seq_base options = object (self : 'edits) n1#data#is_named_orig && n2#data#is_named_orig || (not n1#data#is_named && n2#data#is_named || n1#data#is_named && not n2#data#is_named) || (not (n1#data#is_compatible_with n2#data) && - n1#data#more_anonymized_label <> n2#data#more_anonymized_label) + n1#data#more_anonymized_label <> n2#data#more_anonymized_label) || + n1#data#has_value && n2#data#has_value && n1#data#get_value <> n2#data#get_value in if ok then begin let loc1 = Info.get_loc info1 in diff --git a/src/ast/analyzing/common/engine_options.ml b/src/ast/analyzing/common/engine_options.ml index c4a070d..8d7327c 100644 --- a/src/ast/analyzing/common/engine_options.ml +++ b/src/ast/analyzing/common/engine_options.ml @@ -45,6 +45,7 @@ class c = object (self) val mutable trust_tree_matcher_flag = true val mutable use_adjacency_matches_flag = true val mutable no_unnamed_node_move_flag = false + val mutable conservative_flag = true (* *) @@ -157,6 +158,10 @@ class c = object (self) method set_no_unnamed_node_move_flag = no_unnamed_node_move_flag <- true method clear_no_unnamed_node_move_flag = no_unnamed_node_move_flag <- false + method conservative_flag = conservative_flag + method set_conservative_flag = conservative_flag <- true + method clear_conservative_flag = conservative_flag <- false + (* *) method dump_size_threshold = dump_size_threshold diff --git a/src/ast/analyzing/common/sourcecode.ml b/src/ast/analyzing/common/sourcecode.ml index 9a34492..9be41af 100644 --- a/src/ast/analyzing/common/sourcecode.ml +++ b/src/ast/analyzing/common/sourcecode.ml @@ -460,6 +460,7 @@ module Tree (L : Spec.LABEL_T) = struct method is_string_literal = L.is_string_literal lab method is_int_literal = L.is_int_literal lab method is_real_literal = L.is_real_literal lab + method is_statement = L.is_statement lab val mutable move_id = MID.unknown method set_mid mid = move_id <- mid diff --git a/src/ast/analyzing/common/spec.ml b/src/ast/analyzing/common/spec.ml index ba54d39..69becb9 100644 --- a/src/ast/analyzing/common/spec.ml +++ b/src/ast/analyzing/common/spec.ml @@ -65,8 +65,9 @@ class type node_data_t = object ('self) method has_value : bool method has_non_trivial_value : bool method is_string_literal : bool - method is_int_literal : bool - method is_real_literal : bool + method is_int_literal : bool + method is_real_literal : bool + method is_statement : bool method move_disallowed : bool method is_common : bool @@ -392,6 +393,8 @@ module type LABEL_T = sig val is_int_literal : t -> bool val is_real_literal : t -> bool + val is_statement : t -> bool + val to_tag : t -> string * (string * string) list end (* of module type LABEL_T *) diff --git a/src/ast/analyzing/diffast.ml b/src/ast/analyzing/diffast.ml index 5c9558d..e1bcc9b 100644 --- a/src/ast/analyzing/diffast.ml +++ b/src/ast/analyzing/diffast.ml @@ -277,6 +277,8 @@ let speclist = "-weak", Arg.Unit (fun () -> options#set_weak_flag), "\tweaken node equation and node permutation detection"; + "-aggressive", Arg.Unit (fun () -> options#clear_conservative_flag), "\tbe aggressive"; + (* mode *) "-searchonly", Arg.Set_string keyword, "\tKEYWORD\tsearch keyword only"; "-parseonly", Arg.Set parseonly_flag, "\tparse only"; diff --git a/src/ast/analyzing/engine/analyzing.ml b/src/ast/analyzing/engine/analyzing.ml index d1dcddd..274bf2e 100644 --- a/src/ast/analyzing/engine/analyzing.ml +++ b/src/ast/analyzing/engine/analyzing.ml @@ -1046,6 +1046,7 @@ module F (Label : Spec.LABEL_T) = struct 
Stat.File.dump_diff_stat dstat diff_stat; uidmapping#dump_with_info ~comp:Compression.gzip (dmap^".gz"); + uidmapping#dump_json ~comp:Compression.gzip (dmap^".json.gz"); if options#fact_for_mapping_flag then Lang.extract_mapping_fact options lang uidmapping dmapfact tree1 tree2; diff --git a/src/ast/analyzing/engine/postprocessing.ml b/src/ast/analyzing/engine/postprocessing.ml index 376a1b9..c5f1a89 100644 --- a/src/ast/analyzing/engine/postprocessing.ml +++ b/src/ast/analyzing/engine/postprocessing.ml @@ -3516,7 +3516,7 @@ module F (Label : Spec.LABEL_T) = struct if (get_count tbl1 u1) = 0 && (get_count tbl2 u2) = 0 then begin if not (Xset.mem _set1 u1) && not (Xset.mem _set2 u2) || - not options#dump_delta_flag && _are_named_orig u1 u2 || + not (options#dump_delta_flag || options#conservative_flag) && _are_named_orig u1 u2 || _is_mapped u1 u2 then begin tbl_add tbl1 u1; @@ -5917,7 +5917,7 @@ end; end; (* *) - if options#dump_delta_flag then begin + if options#dump_delta_flag || options#conservative_flag then begin elaborate_edits_for_delta options tree1 tree2 uidmapping edits; let dels, inss = diff --git a/src/ast/analyzing/langs/astml/label.ml b/src/ast/analyzing/langs/astml/label.ml index e2b1e6f..0493a99 100644 --- a/src/ast/analyzing/langs/astml/label.ml +++ b/src/ast/analyzing/langs/astml/label.ml @@ -153,6 +153,7 @@ let is_string_literal { elem_name=name; elem_attrs=attrs; } = let is_int_literal lab = false (* not yet *) let is_real_literal lab = false (* not yet *) +let is_statement lab = false (* not yet *) let to_string { elem_name=elem; elem_attrs=attrs; elem_parser=p; elem_ast_ns=ns; } = let attrs_s = diff --git a/src/ast/analyzing/langs/cpp/cpp_label.ml b/src/ast/analyzing/langs/cpp/cpp_label.ml index 26aae78..8ce9db3 100644 --- a/src/ast/analyzing/langs/cpp/cpp_label.ml +++ b/src/ast/analyzing/langs/cpp/cpp_label.ml @@ -1721,6 +1721,8 @@ let is_stmt = function -> true | _ -> false +let is_statement = is_stmt + let get_ident_use = function | _ -> "" diff --git a/src/ast/analyzing/langs/fortran/parsing/src/label.ml b/src/ast/analyzing/langs/fortran/parsing/src/label.ml index b0cedd3..6aea6a4 100644 --- a/src/ast/analyzing/langs/fortran/parsing/src/label.ml +++ b/src/ast/analyzing/langs/fortran/parsing/src/label.ml @@ -1974,6 +1974,8 @@ let is_stmt = function | Stmt _ -> true | _ -> false +let is_statement = is_stmt + let get_stmt_label = function | Stmt stmt -> Stmt.get_stmt_label stmt | _ -> raise Not_found diff --git a/src/ast/analyzing/langs/java/java_label.ml b/src/ast/analyzing/langs/java/java_label.ml index 28cd6b2..0522744 100644 --- a/src/ast/analyzing/langs/java/java_label.ml +++ b/src/ast/analyzing/langs/java/java_label.ml @@ -62,7 +62,7 @@ module type T = sig val is_typeparameter : t -> bool val is_typeparameters : t -> bool val is_typearguments : ?nth:int -> t -> bool - val is_statement : t -> bool + (*val is_statement : t -> bool*) val is_field : t -> bool val is_type : t -> bool val is_class : t -> bool @@ -2727,6 +2727,8 @@ let relabel_allowed (lab1, lab2) = | CatchClause _, CatchClause _ + | LocalVariableDeclaration _, Resource _ | Resource _, LocalVariableDeclaration _ + (* | VariableDeclarator _, Primary (Primary.Name _|Primary.FieldAccess _) | Primary (Primary.Name _|Primary.FieldAccess _), VariableDeclarator _*) @@ -2889,6 +2891,7 @@ let is_boundary = function | Class _ | Interface _ | Method _ + | Constructor _ | ImportDeclarations | TypeDeclarations | CompilationUnit -> true @@ -2970,6 +2973,7 @@ let is_typearguments ?(nth=1) = function let is_statement 
From 411680ff67757839072e293c27ddb65b4922a980 Mon Sep 17 00:00:00 2001
From: codinuum
Date: Sat, 5 Jun 2021 09:27:51 +0900
Subject: [PATCH 09/11] cpp parser: refactoring

---
 .../analyzing/langs/cpp/parsing/src/lib.ml    | 31 +++++++++++++++++++
 .../langs/cpp/parsing/src/parser.mly          | 16 +++++-----
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/src/ast/analyzing/langs/cpp/parsing/src/lib.ml b/src/ast/analyzing/langs/cpp/parsing/src/lib.ml
index 00c3eff..3f4cbe3 100644
--- a/src/ast/analyzing/langs/cpp/parsing/src/lib.ml
+++ b/src/ast/analyzing/langs/cpp/parsing/src/lib.ml
@@ -529,6 +529,8 @@ class parser_c = object (self)
       | N__init_statement -> "_init_statement"
       | N__initializer_list -> "_initializer_list"
       | N__namespace_alias_definition -> "_namespace_alias_definition"
+      | N__lambda_expression -> "_lambda_expression"
+      | N__odd_stmt -> "_odd_stmt"
       | N__opaque_enum_declaration -> "_opaque_enum_declaration"
       | N__pp_define -> "_pp_define"
       | N__pp_dinit_if_section_list -> "_pp_dinit_if_section_list"
@@ -569,6 +571,7 @@ class parser_c = object (self)
       | N_alignment_specifier -> "alignment_specifier"
       | N_and_expression -> "and_expression"
      | N_asm_block -> "asm_block"
+      | N_asm_token -> "asm_token"
       | N_assignment_expression -> "assignment_expression"
       | N_assignment_operator -> "assignment_operator"
       | N_attr_macro_call -> "attr_macro_call"
@@ -688,9 +691,11 @@ class parser_c = object (self)
       | N_function_try_block -> "function_try_block"
       | N_gnu_asm_attr -> "gnu_asm_attr"
       | N_gnu_asm_frag_seq -> "gnu_asm_frag_seq"
+      | N_gnu_asm_token -> "gnu_asm_token"
       | N_gnu_attribute -> "gnu_attribute"
       | N_gnu_attribute_seq -> "gnu_attribute_seq"
       | N_handler -> "handler"
+      | N_header_name_token -> "header_name_token"
       | N_handler_seq -> "handler_seq"
       | N_has_attribute_expression -> "has_attribute_expression"
       | N_has_include_expression -> "has_include_expression"
@@ -904,7 +909,9 @@ class parser_c = object (self)
       | N_objc_try -> "objc_try"
       | N_objc_visibility_spec -> "objc_visibility_spec"
       | N_odd_decl -> "odd_decl"
+      | N_odd_else_stmt -> "odd_else_stmt"
       | N_odd_expr -> "odd_expr"
+      | N_odd_if_stmt_open -> "odd_if_stmt_open"
       | N_odd_mult_expr -> "odd_mult_expr"
       | N_odd_mem_decl -> "odd_mem_decl"
       | N_odd_stmt -> "odd_stmt"
@@ -1178,6 +1185,7 @@ class parser_c = object (self)
       | N_ptr_declarator -> "ptr_declarator"
       | N_ptr_operator -> "ptr_operator"
       | N_pure_specifier -> "pure_specifier"
+      | N_q_prop_token -> "q_prop_token"
       | N_qualified_id -> "qualified_id"
       | N_quasi_keyword -> "quasi_keyword"
       | N_ref_qualifier -> "ref_qualifier"
@@ -3979,6 +3987,29 @@
             end
           | _ -> ()
       end
+      | I.X (I.N N_odd_if_stmt_open), _, I.X (I.T T_LBRACE) -> begin
+          match scanner#peek_rawtoken() with
+          | PP_ELIF _ | PP_ELSE _ -> begin
+              scanner#enter_block();
+              DEBUG_MSG "!!! info=%s" (Pinfo.pp_if_section_info_to_string env#pp_if_section_top_info);
+              iter_items_w ~from_ith:7 ~to_ith:7 menv_
+                (function
+                  | sn, I.X (I.N N_pp_stmt_if_group), _, _, _ -> ()
+                  | _ -> env#set_broken_info();
+                );
+              raise Exit
+          end
+          | _ -> begin
+              iter_items_w ~from_ith:7 ~to_ith:7 menv_
+                (function
+                  | sn, I.X (I.N N_pp_stmt_if_group), _, _, _ -> begin
+                      ctx_start_of_stmt sn;
+                      raise Exit
+                  end
+                  | _ -> ()
+                )
+          end
+      end
       | I.X (I.N N_pp_stmt_if_group_broken), _, I.X (I.T T_COMMA_BROKEN) -> begin
           match scanner#peek_rawtoken() with
           | PP_ELIF _ | PP_ELSE _ -> begin
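[NOTE] The new N_odd_if_stmt_open branch above stops the scan over parser
items by raising Exit from inside the iter_items_w callback, a standard
OCaml early-exit idiom: the exception unwinds out of the iteration and is
caught by the surrounding handler. A self-contained sketch of the idiom;
find_first is a hypothetical helper, not part of this codebase:

    (* Stop iterating as soon as a matching element is seen. *)
    let find_first p items =
      let found = ref None in
      (try
        List.iter
          (fun x -> if p x then begin found := Some x; raise Exit end)
          items
      with Exit -> ());
      !found

    let () =
      assert (find_first (fun x -> x > 2) [1; 2; 3; 4] = Some 3)
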
"quasi_keyword" | N_ref_qualifier -> "ref_qualifier" @@ -3979,6 +3987,29 @@ class parser_c = object (self) end | _ -> () end + | I.X (I.N N_odd_if_stmt_open), _, I.X (I.T T_LBRACE) -> begin + match scanner#peek_rawtoken() with + | PP_ELIF _ | PP_ELSE _ -> begin + scanner#enter_block(); + DEBUG_MSG "!!! info=%s" (Pinfo.pp_if_section_info_to_string env#pp_if_section_top_info); + iter_items_w ~from_ith:7 ~to_ith:7 menv_ + (function + | sn, I.X (I.N N_pp_stmt_if_group), _, _, _ -> () + | _ -> env#set_broken_info(); + ); + raise Exit + end + | _ -> begin + iter_items_w ~from_ith:7 ~to_ith:7 menv_ + (function + | sn, I.X (I.N N_pp_stmt_if_group), _, _, _ -> begin + ctx_start_of_stmt sn; + raise Exit + end + | _ -> () + ) + end + end | I.X (I.N N_pp_stmt_if_group_broken), _, I.X (I.T T_COMMA_BROKEN) -> begin match scanner#peek_rawtoken() with | PP_ELIF _ | PP_ELSE _ -> begin diff --git a/src/ast/analyzing/langs/cpp/parsing/src/parser.mly b/src/ast/analyzing/langs/cpp/parsing/src/parser.mly index b98fc8a..f6e11f5 100644 --- a/src/ast/analyzing/langs/cpp/parsing/src/parser.mly +++ b/src/ast/analyzing/langs/cpp/parsing/src/parser.mly @@ -825,7 +825,7 @@ odd_stmt: mknode ~pvec:[List.length ll; 1] $startpos $endpos L.LabeledStatement (ll @ [s]) } ; -%inline +(*%inline*) _odd_stmt: | e=expression { e } | e=expression ODD_RBRACE { e } @@ -1110,7 +1110,7 @@ odd_if_stmt: mknode ~pvec $startpos(i) $endpos L.IfStatement (cl @ il @ [c]) } ; -%inline +(*%inline*) odd_else_stmt: | RBRACE ELSE o=odd_if_stmt_open { @@ -1158,7 +1158,7 @@ pp_stmt_else_group: | p=pp_else sl=statement_seq o=odd_stmt { mknode ~pvec:[1; List.length sl + 1] $startpos $endpos (_pp_else_group p) (p::sl@[o]) } ; -%inline +(*%inline*) odd_if_stmt_open: | IF c_opt=ioption(constexpr) LPAREN i_opt=ioption(init_statement) c=condition RPAREN l=LBRACE sl=stmt_seq0 @@ -1979,7 +1979,7 @@ pp_gnu_asm_else_group: { mknode ~pvec:[1; 1; 0] $startpos $endpos (_pp_else_group p) [p; c] } ; -%inline +(*%inline*) gnu_asm_token: | i=IDENT { mktok $startpos $endpos (T.IDENT i) } | i=IDENT_V { mktok $startpos $endpos (T.IDENT i) } @@ -2057,7 +2057,7 @@ gnu_asm_token: | s=OBJC_UNKNOWN { mktok $startpos $endpos (T.OBJC_UNKNOWN s) } ; -%inline +(*%inline*) asm_token: | LPAREN { mktok $startpos $endpos T.LPAREN } | TY_LPAREN { mktok $startpos $endpos T.TY_LPAREN } @@ -6654,7 +6654,7 @@ has_include_expression: | HAS_INCLUDE LPAREN s=STR_LITERAL RPAREN { mkleaf $startpos $endpos (L.HasIncludeExpression s) } ; -%inline +(*%inline*) header_name_token: | i=IDENT { i } | i=IDENT_V { i } @@ -8579,7 +8579,7 @@ lambda_expression: | lh=_lambda_expression DUMMY_STMT { lh } ; -%inline +(*%inline*) _lambda_expression: | l=lambda_introducer { @@ -9850,7 +9850,7 @@ restricted_decl: | d=DECL_MACRO { mkleaf $startpos $endpos (L.DeclarationMacro d) } ; -%inline +(*%inline*) q_prop_token: | i=IDENT { mkleaf $startpos $endpos (L.Identifier i) } | i=IDENT_V { mkleaf $startpos $endpos (L.Identifier i) } From e239bab98f3b47c8c50a1c2a21853797cbe4f2ef Mon Sep 17 00:00:00 2001 From: codinuum Date: Mon, 7 Jun 2021 22:38:20 +0900 Subject: [PATCH 10/11] java: another AST reduction pattern --- src/ast/analyzing/langs/java/java_tree.ml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ast/analyzing/langs/java/java_tree.ml b/src/ast/analyzing/langs/java/java_tree.ml index dc61bd5..1da695b 100644 --- a/src/ast/analyzing/langs/java/java_tree.ml +++ b/src/ast/analyzing/langs/java/java_tree.ml @@ -1278,6 +1278,8 @@ class translator options = let bid_gen = new BID.generator in object (self) in 
From e239bab98f3b47c8c50a1c2a21853797cbe4f2ef Mon Sep 17 00:00:00 2001
From: codinuum
Date: Mon, 7 Jun 2021 22:38:20 +0900
Subject: [PATCH 10/11] java: another AST reduction pattern

---
 src/ast/analyzing/langs/java/java_tree.ml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ast/analyzing/langs/java/java_tree.ml b/src/ast/analyzing/langs/java/java_tree.ml
index dc61bd5..1da695b 100644
--- a/src/ast/analyzing/langs/java/java_tree.ml
+++ b/src/ast/analyzing/langs/java/java_tree.ml
@@ -1278,6 +1278,8 @@ class translator options = let bid_gen = new BID.generator in object (self)
           in
           self#mkleaf ~orig_lab_opt (L.Primary (L.Primary.QualifiedThis (L.conv_name name)))

+      | Ast.Pparen expr when options#ast_reduction_flag -> self#of_expression expr
+
       | Ast.Pparen expr ->
           let e_nd = self#of_expression expr in
           let tid = self#mktid e_nd in
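[NOTE] The added case implements the reduction: when
options#ast_reduction_flag is set, a parenthesized expression contributes
no Pparen node at all, so "(e)" and "e" build identical subtrees and no
longer diff against each other. A minimal sketch of the idea over a
hypothetical expr type, not the real Java AST:

    type expr =
      | Name of string
      | Paren of expr
      | Call of string * expr list

    (* Paren nodes vanish when the flag is on; otherwise they are kept. *)
    let rec reduce ~ast_reduction_flag e =
      match e with
      | Paren e when ast_reduction_flag -> reduce ~ast_reduction_flag e
      | Paren e -> Paren (reduce ~ast_reduction_flag e)
      | Call (f, args) -> Call (f, List.map (reduce ~ast_reduction_flag) args)
      | Name _ as n -> n

    let () =
      assert (reduce ~ast_reduction_flag:true (Paren (Paren (Name "x"))) = Name "x")
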
From 88ecf8b166eeee45b50b76c5f35b28a5685444d5 Mon Sep 17 00:00:00 2001
From: codinuum
Date: Fri, 11 Jun 2021 08:27:16 +0900
Subject: [PATCH 11/11] [java] fixes: src location, label normalization and
 label compatibility

---
 src/ast/analyzing/common/edit_base.ml      | 10 +++++-----
 src/ast/analyzing/common/sourcecode.ml     | 12 ++++++------
 src/ast/analyzing/common/spec.ml           |  2 +-
 src/ast/analyzing/common/spec_base.ml      |  2 +-
 src/ast/analyzing/langs/astml/label.ml     |  2 +-
 src/ast/analyzing/langs/cpp/cpp_label.ml   |  2 +-
 src/ast/analyzing/langs/fortran/f_label.ml |  2 +-
 src/ast/analyzing/langs/java/java_label.ml | 12 +++++++-----
 src/ast/analyzing/langs/java/java_tree.ml  |  2 +-
 src/ast/analyzing/langs/python/py_label.ml |  2 +-
 src/ast/analyzing/langs/verilog/v_label.ml |  2 +-
 11 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/src/ast/analyzing/common/edit_base.ml b/src/ast/analyzing/common/edit_base.ml
index b26cb6e..720f90a 100644
--- a/src/ast/analyzing/common/edit_base.ml
+++ b/src/ast/analyzing/common/edit_base.ml
@@ -1332,15 +1332,15 @@ class ['node_t, 'tree_t] seq_base options = object (self : 'edits)

     DEBUG_MSG "* DUMPING DIFF DATA (%d edit(s))\n" self#get_nedits;

-    let mkpath tree =
+    (*let mkpath tree =
       if Storage.kind_is_fs tree#source_kind then
         tree#source_fullpath
       else
         tree#source_path
-    in
+    in*)

-    let iginfos1 = List.map (Info.of_region ~fname:(mkpath tree1)) ignored1 in
-    let iginfos2 = List.map (Info.of_region ~fname:(mkpath tree2)) ignored2 in
+    let iginfos1 = List.map (Info.of_region (*~fname:(mkpath tree1)*)) ignored1 in
+    let iginfos2 = List.map (Info.of_region (*~fname:(mkpath tree2)*)) ignored2 in

     BEGIN_DEBUG
       DEBUG_MSG "ignored regions1: %s" (segments_to_string ignored1);
@@ -1410,7 +1410,7 @@ class ['node_t, 'tree_t] seq_base options = object (self : 'edits)
       let n2 = Info.get_node info2 in
       n1#data#is_named_orig && n2#data#is_named_orig ||
       (not n1#data#is_named && n2#data#is_named || n1#data#is_named && not n2#data#is_named) ||
-      (not (n1#data#is_compatible_with n2#data) &&
+      (not (n1#data#is_compatible_with ?weak:(Some true) n2#data) &&
       n1#data#more_anonymized_label <> n2#data#more_anonymized_label) ||
       n1#data#has_value && n2#data#has_value && n1#data#get_value <> n2#data#get_value
     in
diff --git a/src/ast/analyzing/common/sourcecode.ml b/src/ast/analyzing/common/sourcecode.ml
index 9be41af..42258c8 100644
--- a/src/ast/analyzing/common/sourcecode.ml
+++ b/src/ast/analyzing/common/sourcecode.ml
@@ -326,10 +326,10 @@ module Tree (L : Spec.LABEL_T) = struct
     method relabel_allowed (ndat : 'self) =
       L.relabel_allowed (lab, (Obj.obj ndat#_label : L.t))

-    method is_compatible_with (ndat : 'self) =
-      L.is_compatible lab (Obj.obj ndat#_label : L.t) ||
+    method is_compatible_with ?(weak=false) (ndat : 'self) =
+      L.is_compatible ~weak lab (Obj.obj ndat#_label : L.t) ||
       match orig_lab_opt, ndat#orig_lab_opt with
-      | Some l1, Some o2 -> L.is_compatible l1 (Obj.obj o2)
+      | Some l1, Some o2 -> L.is_compatible ~weak l1 (Obj.obj o2)
       | _ -> false

     method is_order_insensitive = L.is_order_insensitive lab
@@ -481,10 +481,10 @@ module Tree (L : Spec.LABEL_T) = struct
           self#elem_name_for_delta = x#elem_name_for_delta ||
           (match self#orig_lab_opt, x#orig_lab_opt with
           | Some o1, Some o2 -> o1 = o2
-          | Some o1, None -> L.is_compatible (Obj.obj o1) (Obj.obj x#_label)
-          | None, Some o2 -> L.is_compatible (Obj.obj _label) (Obj.obj o2)
+          | Some o1, None -> L.is_compatible ~weak:true (Obj.obj o1) (Obj.obj x#_label)
+          | None, Some o2 -> L.is_compatible ~weak:true (Obj.obj _label) (Obj.obj o2)
           | _ -> false) ||
-          self#is_compatible_with x)
+          self#is_compatible_with ~weak:true x)
         else
           (fun x -> _label = x#_label && self#orig_lab_opt = x#orig_lab_opt);
         self#update
diff --git a/src/ast/analyzing/common/spec.ml b/src/ast/analyzing/common/spec.ml
index 69becb9..ec99847 100644
--- a/src/ast/analyzing/common/spec.ml
+++ b/src/ast/analyzing/common/spec.ml
@@ -357,7 +357,7 @@ module type LABEL_T = sig

   val relabel_allowed : t * t -> bool
   val quasi_eq : t -> t -> bool
-  val is_compatible : t -> t -> bool
+  val is_compatible : ?weak:bool -> t -> t -> bool
   val is_order_insensitive : t -> bool
   val move_disallowed : t -> bool
   val is_common : t -> bool
diff --git a/src/ast/analyzing/common/spec_base.ml b/src/ast/analyzing/common/spec_base.ml
index fb117df..c0f78f0 100644
--- a/src/ast/analyzing/common/spec_base.ml
+++ b/src/ast/analyzing/common/spec_base.ml
@@ -64,7 +64,7 @@ class type node_data_t_shared = object ('self)
   method label : string
   method _label : Obj.t

-  method is_compatible_with : 'self -> bool
+  method is_compatible_with : ?weak:bool -> 'self -> bool
   method relabel_allowed : 'self -> bool
   method quasi_eq : 'self -> bool
   method to_be_notified : bool
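[NOTE] The signature changes above give is_compatible and
is_compatible_with an optional ?weak flag defaulting to false, so every
existing call site keeps its strict behaviour while matching code can
opt in to weaker equivalences. A self-contained sketch of the
optional-argument plumbing, using hypothetical labels rather than the
real ones:

    type lab = ClassBody | InterfaceBody | Method of string

    let is_compatible ?(weak=false) lab1 lab2 =
      match lab1, lab2 with
      | Method n1, Method n2 -> n1 = n2
      (* weak-only equivalence, guarded as in the java_label.ml hunk below *)
      | ClassBody, InterfaceBody | InterfaceBody, ClassBody when weak -> true
      | _ -> false

    let () =
      (* the default stays strict *)
      assert (not (is_compatible ClassBody InterfaceBody));
      (* ~weak:true passes the flag directly, as sourcecode.ml does *)
      assert (is_compatible ~weak:true ClassBody InterfaceBody);
      (* ?weak:(Some true) forwards an option value, as edit_base.ml does *)
      assert (is_compatible ?weak:(Some true) ClassBody InterfaceBody)
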
diff --git a/src/ast/analyzing/langs/astml/label.ml b/src/ast/analyzing/langs/astml/label.ml
index 0493a99..8ba729c 100644
--- a/src/ast/analyzing/langs/astml/label.ml
+++ b/src/ast/analyzing/langs/astml/label.ml
@@ -682,7 +682,7 @@ let check_attrs attrs elem_attrs =

 let conv_pat pat = "^"^pat^"$"

-let is_compatible _ _ = false
+let is_compatible ?(weak=false) _ _ = false

 let is_order_insensitive = function
   | _ -> false
diff --git a/src/ast/analyzing/langs/cpp/cpp_label.ml b/src/ast/analyzing/langs/cpp/cpp_label.ml
index 8ce9db3..17c7a8f 100644
--- a/src/ast/analyzing/langs/cpp/cpp_label.ml
+++ b/src/ast/analyzing/langs/cpp/cpp_label.ml
@@ -2216,7 +2216,7 @@ let is_desig_old = function
   | _ -> false

-let is_compatible lab1 lab2 =
+let is_compatible ?(weak=false) lab1 lab2 =
   match lab1, lab2 with
   | _ -> false
diff --git a/src/ast/analyzing/langs/fortran/f_label.ml b/src/ast/analyzing/langs/fortran/f_label.ml
index c8b782a..a76de3b 100644
--- a/src/ast/analyzing/langs/fortran/f_label.ml
+++ b/src/ast/analyzing/langs/fortran/f_label.ml
@@ -2145,7 +2145,7 @@ let is_expr = function
   | lab -> is_primary lab

-let is_compatible _ _ = false
+let is_compatible ?(weak=false) _ _ = false

 let is_order_insensitive = function
   | _ -> false
diff --git a/src/ast/analyzing/langs/java/java_label.ml b/src/ast/analyzing/langs/java/java_label.ml
index 0522744..dc64e31 100644
--- a/src/ast/analyzing/langs/java/java_label.ml
+++ b/src/ast/analyzing/langs/java/java_label.ml
@@ -411,8 +411,10 @@ module Literal = struct
     Str.global_replace escaped_double_quote_pat "\"" s

   let escaped_single_quote_pat = Str.regexp_string "\\'"
+  let tab_pat = Str.regexp_string "\t"
   let reduce_string s = (* remove slash followed by single quote *)
-    Str.global_replace escaped_single_quote_pat "'" s
+    let s_ = Str.global_replace escaped_single_quote_pat "'" s in
+    Str.global_replace tab_pat "\\t" s_

   let of_literal ?(reduce=false) = function
     | Ast.Lcharacter str when reduce -> (Character (reduce_char str))
@@ -2635,14 +2637,14 @@ let is_statement_expression = function
       end
   | _ -> false

-let is_compatible lab1 lab2 =
+let is_compatible ?(weak=false) lab1 lab2 =
   match lab1, lab2 with
   | Primary p1, Primary p2 -> Primary.is_compatible p1 p2
   | Method(n1, _), Method(n2, _) -> n1 = n2
   | Constructor(n1, _), Constructor(n2, _) -> n1 = n2
-  | ClassBody _, InterfaceBody _ | InterfaceBody _, ClassBody _ -> true
-  | ClassBody _, EnumBody _ | EnumBody _, ClassBody _ -> true
-  | EnumBody _, InterfaceBody _ | InterfaceBody _, EnumBody _ -> true
+  | ClassBody _, InterfaceBody _ | InterfaceBody _, ClassBody _ when weak -> true (* invalid when dumping delta *)
+  | ClassBody _, EnumBody _ | EnumBody _, ClassBody _ when weak -> true
+  | EnumBody _, InterfaceBody _ | InterfaceBody _, EnumBody _ when weak -> true
   | _ -> false

 let quasi_eq lab1 lab2 =
diff --git a/src/ast/analyzing/langs/java/java_tree.ml b/src/ast/analyzing/langs/java/java_tree.ml
index 1da695b..39660f6 100644
--- a/src/ast/analyzing/langs/java/java_tree.ml
+++ b/src/ast/analyzing/langs/java/java_tree.ml
@@ -644,7 +644,7 @@ class translator options = let bid_gen = new BID.generator in object (self)
       match tbound with
       | None -> []
       | Some tb when options#ast_reduction_flag -> begin
-          let t = P.type_to_string tb.Ast.tb_reference_type in
+          let t = P.type_to_string ~show_attr:false tb.Ast.tb_reference_type in
           if t = "java.lang.Object" then
             []
           else
diff --git a/src/ast/analyzing/langs/python/py_label.ml b/src/ast/analyzing/langs/python/py_label.ml
index 67abd8f..eaddb84 100644
--- a/src/ast/analyzing/langs/python/py_label.ml
+++ b/src/ast/analyzing/langs/python/py_label.ml
@@ -1269,7 +1269,7 @@ let is_named_orig = function
   | Statement stmt -> Statement.is_named_orig stmt
   | _ -> false

-let is_compatible _ _ = false
+let is_compatible ?(weak=false) _ _ = false

 let is_order_insensitive = function
   | _ -> false
diff --git a/src/ast/analyzing/langs/verilog/v_label.ml b/src/ast/analyzing/langs/verilog/v_label.ml
index 23c6bd5..7bccac1 100644
--- a/src/ast/analyzing/langs/verilog/v_label.ml
+++ b/src/ast/analyzing/langs/verilog/v_label.ml
@@ -1885,7 +1885,7 @@ let has_non_trivial_value lab =
   with
     Not_found -> false

-let is_compatible _ _ = false
+let is_compatible ?(weak=false) _ _ = false

 let is_order_insensitive = function
   | _ -> false
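[NOTE] On the Literal.reduce_string change above: the normalized literal
now has escaped single quotes unescaped and raw TAB characters
re-escaped, so stored literal values compare consistently between
parses. A quick standalone check mirroring the patched code; it needs
OCaml's str library, which java_label.ml already uses:

    let escaped_single_quote_pat = Str.regexp_string "\\'"
    let tab_pat = Str.regexp_string "\t"

    let reduce_string s =
      let s_ = Str.global_replace escaped_single_quote_pat "'" s in
      Str.global_replace tab_pat "\\t" s_

    let () =
      (* "a\\'b\tc" is the six characters  a \ ' b TAB c  *)
      assert (reduce_string "a\\'b\tc" = "a'b\\tc")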