From 44989f1dd5d3653c21c8e0d23366c6e133f7c501 Mon Sep 17 00:00:00 2001 From: Luke Slater Date: Mon, 23 Aug 2021 16:28:18 +0100 Subject: [PATCH] 0.0.11 release. multiple group membership delimited by ;. --thread option. rewriting datastructures for time efficiency, etc --- klarigi/build.gradle | 2 +- klarigi/src/main/groovy/klarigi/App.groovy | 16 ++++++- .../src/main/groovy/klarigi/Klarigi.groovy | 32 ++++++++++---- klarigi/src/main/groovy/klarigi/Scorer.groovy | 44 +++++++++++++------ .../test/groovy/klarigi/KlarigiTest.groovy | 2 +- 5 files changed, 71 insertions(+), 25 deletions(-) diff --git a/klarigi/build.gradle b/klarigi/build.gradle index 7263788..c5aa00d 100644 --- a/klarigi/build.gradle +++ b/klarigi/build.gradle @@ -88,4 +88,4 @@ jacocoTestReport { } } -version = '0.0.10' +version = '0.0.11' diff --git a/klarigi/src/main/groovy/klarigi/App.groovy b/klarigi/src/main/groovy/klarigi/App.groovy index 8b80348..2d67bed 100644 --- a/klarigi/src/main/groovy/klarigi/App.groovy +++ b/klarigi/src/main/groovy/klarigi/App.groovy @@ -48,6 +48,8 @@ class App { _ longOpt: 'output-classification-scores', 'Output classification scores and true/false labels for each group into files. Useful for generating AUCs.', type: Boolean _ longOpt: 'output-exp-dataframe', "Output a TSV describing a 'data-frame' of categorical values for each term appearing in derived explanations. Easy to load into R and do stuff with.", type: Boolean + _ longOpt: 'threads', 'Number of threads to use, particularly for calculating scoring. This should speed things up a lot with larger datasets.', args: 1 + _ longOpt: 'output', 'File to output results to. If not given, will print to stdout', args: 1 _ longOpt: 'print-members', 'Print members of groups by label (first column of data file). Only works with standard output (not LaTeX)', type: Boolean @@ -66,10 +68,20 @@ class App { cliBuilder.usage() } + def threads = 1 + if(o['threads']) { + try { + threads = Integer.parseInt(o['threads']) + } catch(e) { + println 'Warning: Could not parse --threads argument. Defaulting to 1.' + threads = 1 + } + } + def k = new Klarigi(o) if(!o['similarity-mode']) { if(!o['group'] || (o['group'] && o['group'] == '*')) { - def allExplanations = k.explainAllClusters(o['output-scores'], o['power']) + def allExplanations = k.explainAllClusters(o['output-scores'], o['power'], threads) allExplanations.each { k.output(it.cluster, it.results, o['output-type'], o['print-members'], o['output']) } @@ -89,7 +101,7 @@ class App { } } } else { - def r = k.explainCluster(o['group'], o['power'], o['output-scores']) + def r = k.explainCluster(o['group'], o['power'], o['output-scores'], threads) k.output(o['group'], r, o['output-type'], o['print-members'], o['output']) if(o['reclassify'] || o['output-exp-dataframe']) { diff --git a/klarigi/src/main/groovy/klarigi/Klarigi.groovy b/klarigi/src/main/groovy/klarigi/Klarigi.groovy index 21e5c2f..6f0a01d 100644 --- a/klarigi/src/main/groovy/klarigi/Klarigi.groovy +++ b/klarigi/src/main/groovy/klarigi/Klarigi.groovy @@ -30,11 +30,12 @@ public class Klarigi { def icFactory Klarigi(o) { + verbose = o['verbose'] + loadData(o['data']) loadOntology(o['ontology']) loadIc(o['ic'], o['ontology'], o['data'], o['resnik-ic'], o['save-ic'], o['turtle']) coefficients = Coefficients.Generate(o) - verbose = o['verbose'] if(o['output']) { // blank the output file, since we will subsequently append to it. all the output stuff could probs be better abstracted. new File(o['output']).text = '' @@ -64,14 +65,20 @@ public class Klarigi { data.associations[entity][it] = true } - if(!data.groupings.containsKey(group)) { - data.groupings[group] = [] + group.tokenize(';').each { g -> + if(!data.groupings.containsKey(g)) { + data.groupings[g] = [] + } + data.groupings[g] << entity } - data.groupings[group] << entity } } catch(e) { HandleError(e, verbose, "Error loading data file ($dataFile)") } + + if(verbose) { + println "Done loading dataset" + } } def loadIc(icFile, ontologyFile, annotFile, resnikIc, saveIc, turtle) { @@ -94,6 +101,10 @@ public class Klarigi { } } + if(verbose) { + println "Done loading IC values" + } + if(saveIc) { try { InformationContent.Write(data.ic, saveIc) @@ -140,11 +151,15 @@ public class Klarigi { ontoHelper.dataFactory = manager.getOWLDataFactory() ontoHelper.reasoner = elkFactory.createReasoner(ontology, config) ontoHelper.labels = labels + + if(verbose) { + println "Done loading the ontology" + } } - def explainCluster(cid, powerMode, outputScores) { + def explainCluster(cid, powerMode, outputScores, threads) { def scorer = new Scorer(ontoHelper, data) - def candidates = scorer.scoreClasses(cid) + def candidates = scorer.scoreClasses(cid, threads) println "$cid: Scoring completed. Candidates: ${candidates.size()}" @@ -156,6 +171,7 @@ public class Klarigi { } } + // TODO: now we have to, ah, add the multiprocessing if(powerMode) { StepDown.RunNewAlgorithm(coefficients, cid, candidates, data) } else { @@ -163,9 +179,9 @@ public class Klarigi { } } - def explainAllClusters(outputScores, powerMode) { + def explainAllClusters(outputScores, powerMode, threads) { data.groupings.collect { g, v -> - [ cluster: g, results: explainCluster(g, powerMode, outputScores) ] + [ cluster: g, results: explainCluster(g, powerMode, outputScores, threads) ] } } diff --git a/klarigi/src/main/groovy/klarigi/Scorer.groovy b/klarigi/src/main/groovy/klarigi/Scorer.groovy index 609cfee..f3e688e 100644 --- a/klarigi/src/main/groovy/klarigi/Scorer.groovy +++ b/klarigi/src/main/groovy/klarigi/Scorer.groovy @@ -14,6 +14,7 @@ public class Scorer { this.data = data } + // so what if we could build a list of the classes of interest and their subclasses, and then go through it once private def processClass(explainers, cid, c) { if(explainers.containsKey(c)) { return; } def ce = ontoHelper.dataFactory.getOWLClass(IRI.create(c)) @@ -31,13 +32,6 @@ public class Scorer { ] explainers[c].inclusion = explainers[c].internalIncluded.size() explainers[c].exclusion = explainers[c].externalIncluded.size() - - ontoHelper.reasoner.getSuperClasses(ce, true).each { n -> - n.getEntities().each { sc -> - def strc = sc.getIRI().toString() - processClass(explainers, cid, strc) - } - } } private def normalise(explainers, cid) { @@ -58,14 +52,38 @@ public class Scorer { } } - def scoreClasses(cid) { - def classList = data.associations.collect { k, v -> v.collect { kk, vv -> kk } }.flatten().unique(false) // all classes used in entity descriptions - def explainers = [:] - //GParsPool.withPool(4) { p -> - classList.each { + private def findRelevantClasses(relevant, c) { + if(relevant.contains(c)) { return; } + relevant << c + + def ce = ontoHelper.dataFactory.getOWLClass(IRI.create(c)) + ontoHelper.reasoner.getSuperClasses(ce, false).each { n -> + n.getEntities().each { sc -> + def strc = sc.getIRI().toString() + findRelevantClasses(relevant, strc) + } + } + + return relevant + } + + def scoreClasses(cid, threads) { + // quick, though it should probably be built elsewhere + def classMap = [:] + data.associations.each { k, v -> v.collect { kk, vv -> classMap[kk] = true }} + def classList = classMap.collect { k, v -> k } + + def relevant = [] + classList.each { + findRelevantClasses(relevant, it) + } + + def explainers = new ConcurrentHashMap() + GParsPool.withPool(threads) { p -> + relevant.eachParallel { processClass(explainers, cid, it) } - //} + } explainers = normalise(explainers, cid) // Note, this turns it into a list rather than a hashmap explainers } diff --git a/klarigi/src/test/groovy/klarigi/KlarigiTest.groovy b/klarigi/src/test/groovy/klarigi/KlarigiTest.groovy index e7df871..9d07f49 100644 --- a/klarigi/src/test/groovy/klarigi/KlarigiTest.groovy +++ b/klarigi/src/test/groovy/klarigi/KlarigiTest.groovy @@ -66,7 +66,7 @@ class KlarigiTest extends Specification { def "test_scoring"() { when: def clusterId = "OMIM:604271" - explanations = s.scoreClasses(clusterId) + explanations = s.scoreClasses(clusterId, 1) def items = [ [ term: "http://purl.obolibrary.org/obo/HP_0004322",