Merge pull request #171 from MITLibraries/tco-114-validate-isbns

Validate ISBNs detected via regex
MITLibraries · Jan 15, 2025 · 1c8ce6d · 1c8ce6d
2 parents 7f8bd83 + 624e6fc
commit 1c8ce6d
Show file tree

Hide file tree

Showing 3 changed files with 101 additions and 8 deletions.
diff --git a/.yardopts b/.yardopts
@@ -1 +1 @@
---no-private --protected app/**/*.rb - docs/**/*.md
+--private --protected app/**/*.rb - docs/**/*.md
diff --git a/app/models/detector/standard_identifiers.rb b/app/models/detector/standard_identifiers.rb
@@ -17,12 +17,15 @@ def self.table_name_prefix
     extend Detector::BulkChecker
 
     # Initialization process will run pattern checkers and strip invalid ISSN detections.
-    #   @param phrase String. Often a `Term.phrase`.
-    #   @return Nothing intentional. Data is written to Hash `@detections` during processing.
+    #
+    # @param phrase String. Often a `Term.phrase`.
+    # @return nil. Data is written to Hash `@detections` during processing. Things technically get
+    #   returned here but it is a side effect and should not be relied on.
     def initialize(phrase)
       @detections = {}
       pattern_checker(phrase)
       strip_invalid_issns
+      strip_invalid_isbns
     end
 
     # The record method will consult the set of regex-based detectors that are defined in
@@ -53,13 +56,76 @@ def self.record(term)
     def patterns
       {
         barcode: /^39080[0-9]{9}$/,
-        isbn: /\b(ISBN-*(1[03])* *(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/,
+        isbn: /\b(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/,
         issn: /\b[0-9]{4}-[0-9]{3}[0-9xX]\b/,
         pmid: /\b((pmid|PMID):\s?(\d{7,8}))\b/,
         doi: %r{\b10\.(\d+\.*)+/(([^\s.])+\.*)+\b}
       }
     end
 
+    # strip_invalid_isbns coordinates the logic to remove ISBNs that are not valid from our list of detected ISBNs
+    #
+    # ISBNs cannot be validated via regex. Regex gives us a list of candidates that look like ISBNs. We remove invalid
+    # ISBNs by following validation specifications defined in the standard.
+    def strip_invalid_isbns
+      return unless @detections[:isbn]
+
+      @detections.delete(:isbn) unless valid_isbn?(@detections[:isbn])
+    end
+
+    # valid_isbn? checks for 10 or 13 digit ISBNs and defers to appropriate methods for each
+    #
+    # @param candidate String. A string representation of a regex detected ISBN.
+    # @return boolean
+    def valid_isbn?(candidate)
+      digits = candidate.delete('-').chars
+
+      # check 10 digit
+      if digits.length == 10
+        valid_isbn_10?(digits)
+      # check 13 digit
+      elsif digits.length == 13
+        valid_isbn_13?(digits)
+      # This shouldn't happen, log an error.
+      else
+        Rails.logger.error("Non-10 or 13 digit sequence detected as ISBN: #{candidate}")
+        Sentry.capture_message('Non-10 or 13 digit sequence detected as ISBN')
+        false
+      end
+    end
+
+    # valid_isbn_10? follows the ISBN 10 specification for validation
+    # https://en.wikipedia.org/wiki/ISBN#ISBN-10_check_digits
+    #
+    # @param digits Array. An array of strings representing each character from a detected ISBN candidate.
+    # @return boolean
+    def valid_isbn_10?(digits)
+      sum = 0
+      digits.each_with_index do |digit, index|
+        digit = '10' if digit.casecmp('x').zero?
+        sum += digit.to_i * (10 - index)
+      end
+      (sum % 11).zero?
+    end
+
+    # valid_isbn_13? follows the ISBN 13 specification for validation
+    # https://en.wikipedia.org/wiki/ISBN#ISBN-13_check_digit_calculation
+    #
+    # @param digits Array. An array of strings representing each character from a detected ISBN candidate.
+    # @return boolean
+    def valid_isbn_13?(digits)
+      sum = 0
+      digits.map(&:to_i).each_with_index do |digit, index|
+        sum += digit * (index.even? ? 1 : 3)
+      end
+
+      (sum % 10).zero?
+    end
+
+    # strip_invalid_issns coordinates the logic to remove ISSNs that are not valid from our list of detected ISSNs
+    #
+    # ISSNs cannot be validated via regex. Regex gives us a list of candidates that look like ISSNs. We remove invalid
+    # ISSNs by following validation specifications defined in the standard.
     def strip_invalid_issns
       return unless @detections[:issn]
 

diff --git a/test/models/detector/standard_identifiers_test.rb b/test/models/detector/standard_identifiers_test.rb
@@ -23,10 +23,24 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
       end
     end
 
+    test 'ISBN-10 examples with incorrect check digits are not detected' do
+      # from wikipedia
+      samples = ['99921-58-10-1', '9971-5-0210-1', '960-425-059-1', '80-902734-1-1', '85-359-0277-1',
+                 '1-84356-028-1', '0-684-84328-1', '0-8044-2957-1', '0-85131-041-1', '93-86954-21-1', '0-943396-04-1',
+                 '0-9752298-0-1']
+
+      samples.each do |isbn|
+        actual = Detector::StandardIdentifiers.new(isbn).detections
+
+        assert_nil(actual[:isbn])
+      end
+    end
+
     test 'ISBN-13 examples' do
-      samples = ['978-99921-58-10-7', '979-9971-5-0210-0', '978-960-425-059-0', '979-80-902734-1-6', '978-85-359-0277-5',
-                 '979-1-84356-028-3', '978-0-684-84328-5', '979-0-8044-2957-X', '978-0-85131-041-9', '979-93-86954-21-4',
-                 '978-0-943396-04-2', '979-0-9752298-0-X']
+      samples = ['978-99921-58-10-4', '978-9971-5-0210-2', '978-960-425-059-2', '978-80-902734-1-2',
+                 '978-85-359-0277-8', '978-1-84356-028-9', '978-0-684-84328-5', '978-0-8044-2957-3',
+                 '978-0-85131-041-1', '978-93-86954-21-3', '978-0-943396-04-0', '978-0-9752298-0-4', '9798531132178',
+                 '9798577456832', '979-8-886-45174-0', '9781319145446']
 
       samples.each do |isbn|
         actual = Detector::StandardIdentifiers.new(isbn).detections
@@ -35,8 +49,21 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
       end
     end
 
+    test 'ISBN-13 examples with incorrect check digits are not detected' do
+      samples = ['978-99921-58-10-1', '978-9971-5-0210-1', '978-960-425-059-1', '978-80-902734-1-1',
+                 '978-85-359-0277-1', '978-1-84356-028-1', '978-0-684-84328-1', '978-0-8044-2957-1',
+                 '978-0-85131-041-2', '978-93-86954-21-1', '978-0-943396-04-1', '978-0-9752298-0-1', '9798531132171',
+                 '9798577456831', '979-8-886-45174-1', '9781319145441']
+
+      samples.each do |isbn|
+        actual = Detector::StandardIdentifiers.new(isbn).detections
+
+        assert_nil(actual[:isbn])
+      end
+    end
+
     test 'not ISBNs' do
-      samples = ['orange cats like popcorn', '1234-6798', 'another ISBN not found here']
+      samples = ['orange cats like popcorn', '1234-6798', 'another ISBN not found here', '99921-58-10-1', '979-8-886-45174-1']
 
       samples.each do |notisbn|
         actual = Detector::StandardIdentifiers.new(notisbn).detections
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		--no-private --protected app/*/.rb - docs/*/.md
		--private --protected app/*/.rb - docs/*/.md