From 44989f1dd5d3653c21c8e0d23366c6e133f7c501 Mon Sep 17 00:00:00 2001
From: Luke Slater <tinmachin3@gmail.com>
Date: Mon, 23 Aug 2021 16:28:18 +0100
Subject: [PATCH] 0.0.11 release. multiple group membership delimited by ;.
 --threads option. rewriting data structures for time efficiency, etc

---
 klarigi/build.gradle                          |  2 +-
 klarigi/src/main/groovy/klarigi/App.groovy    | 16 ++++++-
 .../src/main/groovy/klarigi/Klarigi.groovy    | 32 ++++++++++----
 klarigi/src/main/groovy/klarigi/Scorer.groovy | 44 +++++++++++++------
 .../test/groovy/klarigi/KlarigiTest.groovy    |  2 +-
 5 files changed, 71 insertions(+), 25 deletions(-)

diff --git a/klarigi/build.gradle b/klarigi/build.gradle
index 7263788..c5aa00d 100644
--- a/klarigi/build.gradle
+++ b/klarigi/build.gradle
@@ -88,4 +88,4 @@ jacocoTestReport {
   }
 }
 
-version = '0.0.10'
+version = '0.0.11'
diff --git a/klarigi/src/main/groovy/klarigi/App.groovy b/klarigi/src/main/groovy/klarigi/App.groovy
index 8b80348..2d67bed 100644
--- a/klarigi/src/main/groovy/klarigi/App.groovy
+++ b/klarigi/src/main/groovy/klarigi/App.groovy
@@ -48,6 +48,8 @@ class App {
       _ longOpt: 'output-classification-scores', 'Output classification scores and true/false labels for each group into files. Useful for generating AUCs.', type: Boolean
       _ longOpt: 'output-exp-dataframe', "Output a TSV describing a 'data-frame' of categorical values for each term appearing in derived explanations. Easy to load into R and do stuff with.", type: Boolean
 
+      _ longOpt: 'threads', 'Number of threads to use, particularly for calculating scoring. This should speed things up a lot with larger datasets.', args: 1
+
       _ longOpt: 'output', 'File to output results to. If not given, will print to stdout', args: 1
       _ longOpt: 'print-members', 'Print members of groups by label (first column of data file). Only works with standard output (not LaTeX)', type: Boolean
 
@@ -66,10 +68,20 @@ class App {
       cliBuilder.usage()
     }
 
+    def threads = 1
+    if(o['threads']) {
+      try {
+        threads = Integer.parseInt(o['threads'])
+      } catch(e) {
+        println 'Warning: Could not parse --threads argument. Defaulting to 1.'
+        threads = 1
+      }
+    }
+
     def k = new Klarigi(o)
     if(!o['similarity-mode']) {
       if(!o['group'] || (o['group'] && o['group'] == '*')) {
-        def allExplanations = k.explainAllClusters(o['output-scores'], o['power'])
+        def allExplanations = k.explainAllClusters(o['output-scores'], o['power'], threads)
         allExplanations.each {
           k.output(it.cluster, it.results, o['output-type'], o['print-members'], o['output'])
         }
@@ -89,7 +101,7 @@ class App {
           }
         }
       } else {
-        def r = k.explainCluster(o['group'], o['power'], o['output-scores'])
+        def r = k.explainCluster(o['group'], o['power'], o['output-scores'], threads)
         k.output(o['group'], r, o['output-type'], o['print-members'], o['output'])
 
         if(o['reclassify'] || o['output-exp-dataframe']) {
diff --git a/klarigi/src/main/groovy/klarigi/Klarigi.groovy b/klarigi/src/main/groovy/klarigi/Klarigi.groovy
index 21e5c2f..6f0a01d 100644
--- a/klarigi/src/main/groovy/klarigi/Klarigi.groovy
+++ b/klarigi/src/main/groovy/klarigi/Klarigi.groovy
@@ -30,11 +30,12 @@ public class Klarigi {
   def icFactory
 
   Klarigi(o) {
+    verbose = o['verbose']
+
     loadData(o['data'])
     loadOntology(o['ontology'])
     loadIc(o['ic'], o['ontology'], o['data'], o['resnik-ic'], o['save-ic'], o['turtle'])
     coefficients = Coefficients.Generate(o)
-    verbose = o['verbose']
 
     if(o['output']) { // blank the output file, since we will subsequently append to it. all the output stuff could probs be better abstracted.
       new File(o['output']).text = ''
@@ -64,14 +65,20 @@ public class Klarigi {
           data.associations[entity][it] = true
         }
 
-        if(!data.groupings.containsKey(group)) {
-          data.groupings[group] = []
+        group.tokenize(';').each { g ->
+          if(!data.groupings.containsKey(g)) {
+            data.groupings[g] = []
+          }
+          data.groupings[g] << entity
         }
-        data.groupings[group] << entity
       }
     } catch(e) {
       HandleError(e, verbose, "Error loading data file ($dataFile)")
     }
+
+    if(verbose) {
+      println "Done loading dataset"
+    }
   }
 
   def loadIc(icFile, ontologyFile, annotFile, resnikIc, saveIc, turtle) {
@@ -94,6 +101,10 @@ public class Klarigi {
       }
     }
 
+    if(verbose) {
+      println "Done loading IC values"
+    }
+
     if(saveIc) {
       try {
         InformationContent.Write(data.ic, saveIc)
@@ -140,11 +151,15 @@ public class Klarigi {
     ontoHelper.dataFactory = manager.getOWLDataFactory()
     ontoHelper.reasoner = elkFactory.createReasoner(ontology, config)
     ontoHelper.labels = labels
+
+    if(verbose) {
+      println "Done loading the ontology"
+    }
   }
 
-  def explainCluster(cid, powerMode, outputScores) {
+  def explainCluster(cid, powerMode, outputScores, threads) {
     def scorer = new Scorer(ontoHelper, data)
-    def candidates = scorer.scoreClasses(cid)
+    def candidates = scorer.scoreClasses(cid, threads)
 
     println "$cid: Scoring completed. Candidates: ${candidates.size()}"
 
@@ -156,6 +171,7 @@ public class Klarigi {
       }
     }
 
+    // TODO: parallelise the explanation step below as well; so far only class scoring is given the threads setting
     if(powerMode) {
       StepDown.RunNewAlgorithm(coefficients, cid, candidates, data)
     } else {
@@ -163,9 +179,9 @@ public class Klarigi {
     }
   }
 
-  def explainAllClusters(outputScores, powerMode) {
+  def explainAllClusters(outputScores, powerMode, threads = 1) { // threads defaults to 1 so existing two-argument callers keep working
     data.groupings.collect { g, v ->
-      [ cluster: g, results: explainCluster(g, powerMode, outputScores) ]
+      [ cluster: g, results: explainCluster(g, powerMode, outputScores, threads) ]
     }
   }
 
diff --git a/klarigi/src/main/groovy/klarigi/Scorer.groovy b/klarigi/src/main/groovy/klarigi/Scorer.groovy
index 609cfee..f3e688e 100644
--- a/klarigi/src/main/groovy/klarigi/Scorer.groovy
+++ b/klarigi/src/main/groovy/klarigi/Scorer.groovy
@@ -14,6 +14,7 @@ public class Scorer {
     this.data = data
   }
 
+  // Fills in explainers[c] for class c. The caller builds the full list of classes (annotated classes plus their superclasses) up front, so each class is handled once.
   private def processClass(explainers, cid, c) {
 		if(explainers.containsKey(c)) { return; }
 		def ce = ontoHelper.dataFactory.getOWLClass(IRI.create(c))
@@ -31,13 +32,6 @@ public class Scorer {
 		]
     explainers[c].inclusion = explainers[c].internalIncluded.size()
     explainers[c].exclusion = explainers[c].externalIncluded.size()
-
-		ontoHelper.reasoner.getSuperClasses(ce, true).each { n ->
-			n.getEntities().each { sc ->
-				def strc = sc.getIRI().toString()
-				processClass(explainers, cid, strc)
-			}
-		}
 	}
 
   private def normalise(explainers, cid) {
@@ -58,14 +52,38 @@ public class Scorer {
       }
   }
 
-  def scoreClasses(cid) {
-    def classList = data.associations.collect { k, v -> v.collect { kk, vv -> kk } }.flatten().unique(false) // all classes used in entity descriptions
-    def explainers = [:]
-    //GParsPool.withPool(4) { p ->
-      classList.each {
+  // Adds class IRI c and all of its superclasses to relevant (skipping anything already present); always returns relevant.
+  private def findRelevantClasses(relevant, c) {
+    if(relevant.contains(c)) { return relevant }
+    relevant << c
+    // getSuperClasses(ce, false) asks for indirect superclasses too, so one reasoner query covers every ancestor of c.
+    def ce = ontoHelper.dataFactory.getOWLClass(IRI.create(c))
+    ontoHelper.reasoner.getSuperClasses(ce, false).each { n ->
+      n.getEntities().each { sc ->
+        def strc = sc.getIRI().toString()
+        if(!relevant.contains(strc)) { relevant << strc }
+      }
+    }
+    return relevant
+  }
+
+  // Scores every annotated class and all of its superclasses for group cid, using a pool of 'threads' workers.
+  def scoreClasses(cid, threads = 1) {
+    // Insertion-ordered sets give O(1) de-duplication and membership tests (list contains() is a linear scan).
+    def classList = new LinkedHashSet()
+    data.associations.each { k, v -> classList.addAll(v?.keySet() ?: []) }
+
+    def relevant = new LinkedHashSet()
+    classList.each { findRelevantClasses(relevant, it) }
+
+    // A pool needs at least one worker, so null, zero or negative thread counts fall back to 1.
+    def poolSize = Math.max(1, (threads ?: 1) as int)
+    def explainers = new ConcurrentHashMap()
+    GParsPool.withPool(poolSize) { p ->
+      relevant.toList().eachParallel {
         processClass(explainers, cid, it)
       }
-    //}
+    }
     explainers = normalise(explainers, cid) // Note, this turns it into a list rather than a hashmap
     explainers
   }
diff --git a/klarigi/src/test/groovy/klarigi/KlarigiTest.groovy b/klarigi/src/test/groovy/klarigi/KlarigiTest.groovy
index e7df871..9d07f49 100644
--- a/klarigi/src/test/groovy/klarigi/KlarigiTest.groovy
+++ b/klarigi/src/test/groovy/klarigi/KlarigiTest.groovy
@@ -66,7 +66,7 @@ class KlarigiTest extends Specification {
   def "test_scoring"() {
     when:
       def clusterId = "OMIM:604271"
-      explanations = s.scoreClasses(clusterId)
+      explanations = s.scoreClasses(clusterId, 1)
       def items = [
         [ 
           term: "http://purl.obolibrary.org/obo/HP_0004322",