Merge pull request #185 from charvolant/master

Release 4.3
AtlasOfLivingAustralia · Feb 23, 2023 · 2d111f9 · 2d111f9
2 parents ba5be3f + 039062a
commit 2d111f9
Show file tree

Hide file tree

Showing 74 changed files with 27,879 additions and 232 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -10,10 +10,10 @@ branches:
 
 before_install:
 - mkdir -p ~/.m2; wget -q -O ~/.m2/settings.xml https://raw.githubusercontent.com/AtlasOfLivingAustralia/travis-build-configuration/master/travis_maven_settings_simple.xml
-- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20210811-3.tgz https://archives.ala.org.au/archives/nameindexes/20210811-3/namematching-20210811-3.tgz
+- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20210811-5.tgz https://archives.ala.org.au/archives/nameindexes/20210811-5/namematching-20210811-5.tgz
 - cd /data/lucene
-- sudo tar zxvf namematching-20210811-3.tgz
-- sudo ln -s namematching-20210811-3 namematching
+- sudo tar zxvf namematching-20210811-5.tgz
+- sudo ln -s namematching-20210811-5 namematching
 - ls -laF
 - cd $TRAVIS_BUILD_DIR
 

diff --git a/README.md b/README.md
@@ -55,13 +55,13 @@ You can download the IRMNG DwCA for homonyms from the following URL:
 
 An assembly zip file for this can be downloaded from our maven repository : 
 
-[ala-name-matching-4.2-distribution.zip](https://nexus.ala.org.au/service/local/repositories/releases/content/au/org/ala/ala-name-matching-distribution/4.2/ala-name-matching-distribution-4.2-distribution.zip)
+[ala-name-matching-4.3-distribution.zip](https://nexus.ala.org.au/service/local/repositories/releases/content/au/org/ala/ala-name-matching-distribution/4.3/ala-name-matching-distribution-4.3-distribution.zip)
 
 To generate the name index using the data described above, follow these steps. Alternatively use the [ALA Ansible scripts](https://github.com/AtlasOfLivingAustralia/ala-install) 
 here using the playbook [nameindexer.yml](https://github.com/AtlasOfLivingAustralia/ala-install/blob/master/ansible/nameindexer-standalone.yml) which does it all for you.
 
 * Download the zip files linked above to a directory e.g. /data/names/ and extract them
-* Download the distribution zip [ala-name-matching-disribution-4.2-distribution.zip](https://nexus.ala.org.au/service/local/repositories/releases/content/au/org/ala/ala-name-matching-distribution/4.2/ala-name-matching-distribution-4.2-distribution.zip)
+* Download the distribution zip [ala-name-matching-distribution-4.3-distribution.zip](https://nexus.ala.org.au/service/local/repositories/releases/content/au/org/ala/ala-name-matching-distribution/4.3/ala-name-matching-distribution-4.3-distribution.zip)
   and unzip it.
  You will find a number of shell scripts in the base directory.
 * Generate the names index with command:
@@ -82,7 +82,7 @@ into a single, combined taxonomy.
 An example command for the taxonomy builder is:
 
 ```
-./merge.sh -c /data/names/ala-taxon-config.json -w tmp -o /data/names/combined /data/names/APNI/DwC /data/names/AFD/DwC /data/names/CAAB/DwC
+./merge.sh -c /data/names/ala-taxon-config.json -w /tmp -o /data/names/combined /data/names/APNI/DwC /data/names/AFD/DwC /data/names/CAAB/DwC
 ```
 
 More information about the merge configuration can be found [here](doc/merge-config.md).
@@ -94,16 +94,16 @@ To skip this step, run a build with ```mvn install -DskipTests=true```.
 
 The build creates one artefact in the `ala-name-matching-distribution/target` directory: 
 
-* ala-name-matching-distribution-4.2-distribution.zip - zip containing the project jar and dependencies
+* ala-name-matching-distribution-4.3-distribution.zip - zip containing the project jar and dependencies
 
 Each module contains two artefacts in the 
 `ala-name-matching/ala-name-matching-<module>/target` directory:
 
-* ala-name-matching-<module>-4.2.jar - built jar for the project code only
-* ala-name-matching-<module>-4.2-sources.jar - source jar for the project code only
+* ala-name-matching-<module>-4.3.jar - built jar for the project code only
+* ala-name-matching-<module>-4.3-sources.jar - source jar for the project code only
 
-The name index for Australian names lists used in unit tests can be downloaded [from here](https://biocache.ala.org.au/archives/nameindexes/20210811-3) and needs to be extracted to the
-directory `/data/lucene/namematching-20210811-3`
+The name index for Australian names lists used in unit tests can be downloaded [from here](https://biocache.ala.org.au/archives/nameindexes/20210811-5) and needs to be extracted to the
+directory `/data/lucene/namematching-20210811-5`
 
 ## ALA Names List
 
@@ -139,7 +139,7 @@ To use ala-name-matching, include it as a dependency in your pom file:
 <dependency>
   <groupId>au.org.ala</groupId>
   <artifactId>ala-name-matching-search</artifactId>
-  <version>4.2</version>
+  <version>4.3</version>
 </dependency>
 ```
 
@@ -148,7 +148,7 @@ If you just want the handy enums and such-like, use
 <dependency>
   <groupId>au.org.ala</groupId>
   <artifactId>ala-name-matching-model</artifactId>
-  <version>4.2</version>
+  <version>4.3</version>
 </dependency>
 ```
 
@@ -158,7 +158,7 @@ libraries having validation code that conflicts with spring validation.
 You can correct this by using
 
 ```
-compile("au.org.ala:ala-name-matching-search:4.2") {
+compile("au.org.ala:ala-name-matching-search:4.3") {
     exclude group: 'org.slf4j', module: 'slf4j-log4j12'
     exclude group: 'org.apache.bval', module: 'org.apache.bval.bundle'
 }

diff --git a/ala-name-matching-builder/pom.xml b/ala-name-matching-builder/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <groupId>au.org.ala</groupId>
         <artifactId>ala-name-matching</artifactId>
-        <version>4.2</version>
+        <version>4.3</version>
     </parent>
 
     <artifactId>ala-name-matching-builder</artifactId>
@@ -76,4 +76,17 @@
             <version>${commons-cli.version}</version>
         </dependency>
     </dependencies>
+
+    <build>
+        <resources>
+            <resource>
+                <directory>src/main/resources</directory>
+                <filtering>false</filtering>
+            </resource>
+            <resource>
+                <directory>src/main/resources-filtered</directory>
+                <filtering>true</filtering>
+            </resource>
+        </resources>
+    </build>
 </project>
diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java
@@ -26,12 +26,12 @@
 import org.gbif.api.exception.UnparsableException;
 import org.gbif.api.model.checklistbank.ParsedName;
 import org.gbif.api.service.checklistbank.NameParser;
-import org.gbif.api.vocabulary.NameType;
-import org.gbif.api.vocabulary.NomenclaturalStatus;
-import org.gbif.api.vocabulary.Rank;
+import org.gbif.api.vocabulary.*;
 import org.gbif.checklistbank.authorship.AuthorComparator;
 import org.gbif.checklistbank.utils.SciNameNormalizer;
+import org.gbif.common.parsers.LifeStageParser;
 import org.gbif.common.parsers.NomStatusParser;
+import org.gbif.common.parsers.OccurrenceStatusParser;
 import org.gbif.common.parsers.core.ParseResult;
 import org.gbif.nameparser.PhraseNameParser;
 import org.slf4j.Logger;
@@ -180,11 +180,15 @@ public class ALANameAnalyser extends NameAnalyser {
     private List<Pattern> informalPatterns;
     private NameParser nameParser;
     private NomStatusParser nomStatusParser;
+    private LifeStageParser lifeStageParser;
+    private OccurrenceStatusParser occurrenceStatusParser;
 
     public ALANameAnalyser(AuthorComparator authorComparator, Reporter reporter) {
         super(authorComparator, reporter);
         this.nameParser = new PhraseNameParser();
         this.nomStatusParser = NomStatusParser.getInstance();
+        this.lifeStageParser = LifeStageParser.getInstance();
+        this.occurrenceStatusParser = OccurrenceStatusParser.getInstance();
         this.buildTaxonomicTypeMap();
         this.buildRankMap();
         this.buildNomenclaturalStatusMap();
@@ -237,7 +241,7 @@ public AnalysisResult analyse(@Nullable NomenclaturalClassifier code, String sci
                     rankType = RankType.getForCBRank(name.getRank());
             }
         } catch (UnparsableException ex) {
-            LOGGER.info("Unable to parse " + name + ": " + ex.getMessage());
+            LOGGER.info("Unable to parse " + ex.getMessage());
         }
         if (UNSURE_MARKER.matcher(scientificName).find()) {
             // Leave this well alone but indicate that it is doubtful
@@ -584,6 +588,44 @@ public NomenclaturalStatus canonicaliseNomenclaturalStatus(String nomenclaturalS
         return status;
     }
 
+    /**
+     * Canonicalise a life stage string into a {@link LifeStage} value.
+     * <p>
+     * A null or empty string gives null without reporting anything.
+     * A string that the life stage parser cannot parse is reported as a
+     * {@code VALIDATION} issue under the key {@code taxonomy.load.lifeStage.invalid}
+     * and also gives null.
+     * </p>
+     *
+     * @param lifeStage the life stage string
+     *
+     * @return The payload of the successful parse, or null for a null, empty or non-matched string
+     */
+    @Override
+    public LifeStage canonicaliseLifeStage(String lifeStage) {
+        if (lifeStage == null || lifeStage.isEmpty())
+            return null;
+        ParseResult<LifeStage> result = this.lifeStageParser.parse(lifeStage);
+        if (!result.isSuccessful()) {
+            this.report(IssueType.VALIDATION, "taxonomy.load.lifeStage.invalid", lifeStage, "");
+            return null;
+        }
+        return result.getPayload();
+    }
+
+    /**
+     * Canonicalise an occurrence status string into an {@link OccurrenceStatus} value.
+     * <p>
+     * A null or empty string gives null without reporting anything.
+     * A string that the occurrence status parser cannot parse is reported as a
+     * {@code VALIDATION} issue under the key {@code taxonomy.load.occurrenceStatus.invalid}
+     * and also gives null.
+     * </p>
+     *
+     * @param occurrenceStatus the occurrence status string
+     *
+     * @return The payload of the successful parse, or null for a null, empty or non-matched string
+     */
+    @Override
+    public OccurrenceStatus canonicaliseOccurrenceStatus(String occurrenceStatus) {
+        if (occurrenceStatus == null || occurrenceStatus.isEmpty())
+            return null;
+        ParseResult<OccurrenceStatus> result = this.occurrenceStatusParser.parse(occurrenceStatus);
+        if (!result.isSuccessful()) {
+            this.report(IssueType.VALIDATION, "taxonomy.load.occurrenceStatus.invalid", occurrenceStatus, "");
+            return null;
+        }
+        return result.getPayload();
+    }
+
     /**
      * Test for a known informal name.
      * <p>

diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALATaxonResolver.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALATaxonResolver.java
@@ -54,14 +54,19 @@ public ALATaxonResolver(Taxonomy taxonomy) {
      * Principals are effectively chosen on the basis of the provider that provides the highest priority
      * for one of the instances.
      * </p>
+     * <p>
+     * Geographic taxa (i.e. those with a {@link TaxonomicType#isGeographic()} of true)
+     * that score above the accepted cutoff are always included as principals, after anything else that might be available.
+     * </p>
      */
     @Override
     public List<TaxonConceptInstance> principals(TaxonConcept concept, Collection<TaxonConceptInstance> instances) throws IndexBuilderException {
         final int cutoff = taxonomy.getAcceptedCutoff();
-        List<TaxonConceptInstance> principals = instances.stream().filter(tci -> tci.isPrimary() && tci.getScore() > cutoff).collect(Collectors.toList());
+        List<TaxonConceptInstance> geographic = instances.stream().filter(tci -> tci.isGeographic() && tci.getScore() > cutoff).collect(Collectors.toList());
+        List<TaxonConceptInstance> principals = instances.stream().filter(tci -> tci.isPrimary() && !tci.isGeographic() && tci.getScore() > cutoff).collect(Collectors.toList());
         if (principals.isEmpty()) {
            this.taxonomy.report(IssueType.NOTE, "taxonResolver.noPrincipals", concept, null);
-            principals = new ArrayList<>(instances);
+            principals = instances.stream().filter(tci -> !tci.isGeographic()).collect(Collectors.toList());
         }
         Optional<TaxonConceptInstance> max = principals.stream().max(TaxonConceptInstance.SCORE_COMPARATOR);
         Optional<NameProvider> authority = max.map(TaxonConceptInstance::getAuthority);
@@ -74,6 +79,8 @@ public List<TaxonConceptInstance> principals(TaxonConcept concept, Collection<Ta
         final NameProvider source = authority.orElse(taxonomy.getInferenceProvider());
         principals = principals.stream().filter(instance -> instance.getAuthority() == source).collect(Collectors.toList());
         principals.sort(TaxonConceptInstance.INVERSE_SCORE_COMPARATOR);
+        geographic.sort(TaxonConceptInstance.INVERSE_SCORE_COMPARATOR);
+        principals.addAll(geographic);
         return principals;
     }
 
@@ -322,6 +329,85 @@ public void reallocateSecondaryConcepts(ScientificName scientificName, Taxonomy
 
     }
 
+    /**
+     * Resolve distribution information.
+     * <p>
+     * For each used instance in the resolution, build a list of distributions
+     * from the instance's own distribution and the distributions of the instances
+     * returned by {@code resolution.getChildren(used)}, then record the result
+     * against the used instance via {@code resolution.addDistribution}.
+     * </p>
+     * <p>
+     * A used instance without a distribution is recorded with null (global).
+     * A child only contributes if it is a different instance, is output,
+     * is contained by the concept and has a distribution.
+     * Child distributions that are already covered by the accumulated list
+     * are discarded before the remainder is merged in.
+     * An empty merged list is recorded as null.
+     * </p>
+     *
+     * @param concept The concept
+     * @param resolution The resolution that supplies the used instances and their children and that receives the resolved distributions
+     * @param taxonomy The taxonomy (not referenced by this implementation)
+     *
+     * @throws IndexBuilderException If unable to resolve distribution data
+     */
+    @Override
+    public void resolveDistribution(TaxonConcept concept, TaxonResolution resolution, Taxonomy taxonomy) throws IndexBuilderException {
+        for (TaxonConceptInstance used: resolution.getUsed()) {
+            if (used.getDistribution() == null) {
+                resolution.addDistribution(used, null);
+                continue;
+            }
+            List<Distribution> distribution = used.getDistribution();
+            for (TaxonConceptInstance child: resolution.getChildren(used)) {
+                List<Distribution> more = child.getDistribution();
+                if (child == used || !child.isOutput() || child.getContainer() != concept || more == null) {
+                    continue;
+                }
+                final List<Distribution> coverage = distribution; // final copy: distribution is reassigned below, so the lambda cannot capture it directly
+                more = more.stream().filter(d -> !coverage.stream().anyMatch(d1 -> d1.covers(d))).collect(Collectors.toList());
+                distribution = this.mergeDistribution(distribution, more);
+            }
+            resolution.addDistribution(used, distribution.isEmpty() ? null : distribution);
+        }
+    }
+
+    /**
+     * Merge a distribution list into a list of distributions.
+     * <p>
+     * Each element of {@code more} is merged in turn using
+     * {@link #mergeDistribution(List, Distribution)}.
+     * If {@code more} is empty, the current list is returned as-is.
+     * </p>
+     *
+     * @param distribution The current distribution list
+     * @param more The distribution list to merge
+     *
+     * @return The merged list
+     */
+    protected List<Distribution> mergeDistribution(List<Distribution> distribution, List<Distribution> more) {
+        for (Distribution d: more)
+            distribution = this.mergeDistribution(distribution, d);
+        return distribution;
+    }
+
+
+    /**
+     * Merge a distribution into a list of distributions.
+     * <p>
+     * Existing entries that are covered by the new distribution are dropped.
+     * If an existing entry covers the new distribution, the entry is kept
+     * and the new distribution is not added.
+     * Otherwise the new distribution is appended after the existing entries.
+     * The supplied list is not modified; a new list is built and returned.
+     * </p>
+     *
+     * @param distribution The current distribution list
+     * @param dist The distribution to merge (if null, the result is a copy of the current list)
+     *
+     * @return The merged list
+     */
+    protected List<Distribution> mergeDistribution(List<Distribution> distribution, Distribution dist) {
+        List<Distribution> merged = new ArrayList<>(distribution.size());
+        for (Distribution d: distribution) {
+            if (dist != null && dist.covers(d)) { // dist covers d, so d is deliberately left out of the result
+            } else if (dist != null && d.covers(dist)) {
+                merged.add(d);
+                dist = null; // already covered by d: nothing left to add, and later entries are simply kept
+            } else {
+               merged.add(d);
+            }
+        }
+        if (dist != null)
+            merged.add(dist);
+        return merged;
+    }
+
     /**
      * Get the resolved least upper bound (lub) of two instances.
      * <p>