Skip to content

Commit

Permalink
Merge pull request #171 from MITLibraries/tco-114-validate-isbns
Browse files Browse the repository at this point in the history
Validate ISBNs detected via regex
  • Loading branch information
JPrevost authored Jan 15, 2025
2 parents 7f8bd83 + 624e6fc commit 1c8ce6d
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .yardopts
Original file line number Diff line number Diff line change
@@ -1 +1 @@
--no-private --protected app/**/*.rb - docs/**/*.md
--private --protected app/**/*.rb - docs/**/*.md
72 changes: 69 additions & 3 deletions app/models/detector/standard_identifiers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,15 @@ def self.table_name_prefix
extend Detector::BulkChecker

# Initialization process will run pattern checkers and strip invalid ISSN detections.
# @param phrase String. Often a `Term.phrase`.
# @return Nothing intentional. Data is written to Hash `@detections` during processing.
#
# @param phrase String. Often a `Term.phrase`.
# @return nil. Data is written to Hash `@detections` during processing. Things technically get
# returned here but it is a side effect and should not be relied on.
def initialize(phrase)
@detections = {}
pattern_checker(phrase)
strip_invalid_issns
strip_invalid_isbns
end

# The record method will consult the set of regex-based detectors that are defined in
Expand Down Expand Up @@ -53,13 +56,76 @@ def self.record(term)
def patterns
{
barcode: /^39080[0-9]{9}$/,
isbn: /\b(ISBN-*(1[03])* *(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/,
isbn: /\b(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/,
issn: /\b[0-9]{4}-[0-9]{3}[0-9xX]\b/,
pmid: /\b((pmid|PMID):\s?(\d{7,8}))\b/,
doi: %r{\b10\.(\d+\.*)+/(([^\s.])+\.*)+\b}
}
end

# strip_invalid_isbns coordinates the logic to remove ISBNs that are not valid from our list of detected ISBNs
#
# ISBNs cannot be validated via regex. Regex gives us a list of candidates that look like ISBNs. We remove invalid
# ISBNs by following validation specifications defined in the standard.
def strip_invalid_isbns
return unless @detections[:isbn]

@detections.delete(:isbn) unless valid_isbn?(@detections[:isbn])
end

# valid_isbn? checks for 10 or 13 digit ISBNs and defers to appropriate methods for each
#
# @param candidate String. A string representation of a regex detected ISBN.
# @return boolean
def valid_isbn?(candidate)
digits = candidate.delete('-').chars

# check 10 digit
if digits.length == 10
valid_isbn_10?(digits)
# check 13 digit
elsif digits.length == 13
valid_isbn_13?(digits)
# This shouldn't happen, log an error.
else
Rails.logger.error("Non-10 or 13 digit sequence detected as ISBN: #{candidate}")
Sentry.capture_message('Non-10 or 13 digit sequence detected as ISBN')
false
end
end

# valid_isbn_10? follows the ISBN 10 specification for validation
# https://en.wikipedia.org/wiki/ISBN#ISBN-10_check_digits
#
# @param digits Array. An array of strings representing each character from a detected ISBN candidate.
# @return boolean
def valid_isbn_10?(digits)
sum = 0
digits.each_with_index do |digit, index|
digit = '10' if digit.casecmp('x').zero?
sum += digit.to_i * (10 - index)
end
(sum % 11).zero?
end

# valid_isbn_13? follows the ISBN 13 specification for validation
# https://en.wikipedia.org/wiki/ISBN#ISBN-13_check_digit_calculation
#
# @param digits Array. An array of strings representing each character from a detected ISBN candidate.
# @return boolean
def valid_isbn_13?(digits)
sum = 0
digits.map(&:to_i).each_with_index do |digit, index|
sum += digit * (index.even? ? 1 : 3)
end

(sum % 10).zero?
end

# strip_invalid_issns coordinates the logic to remove ISSNs that are not valid from our list of detected ISSNs
#
# ISSNs cannot be validated via regex. Regex gives us a list of candidates that look like ISSNs. We remove invalid
# ISSNs by following validation specifications defined in the standard.
def strip_invalid_issns
return unless @detections[:issn]

Expand Down
35 changes: 31 additions & 4 deletions test/models/detector/standard_identifiers_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,24 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
end
end

test 'ISBN-10 examples with incorrect check digits are not detected' do
# from wikipedia
samples = ['99921-58-10-1', '9971-5-0210-1', '960-425-059-1', '80-902734-1-1', '85-359-0277-1',
'1-84356-028-1', '0-684-84328-1', '0-8044-2957-1', '0-85131-041-1', '93-86954-21-1', '0-943396-04-1',
'0-9752298-0-1']

samples.each do |isbn|
actual = Detector::StandardIdentifiers.new(isbn).detections

assert_nil(actual[:isbn])
end
end

test 'ISBN-13 examples' do
samples = ['978-99921-58-10-7', '979-9971-5-0210-0', '978-960-425-059-0', '979-80-902734-1-6', '978-85-359-0277-5',
'979-1-84356-028-3', '978-0-684-84328-5', '979-0-8044-2957-X', '978-0-85131-041-9', '979-93-86954-21-4',
'978-0-943396-04-2', '979-0-9752298-0-X']
samples = ['978-99921-58-10-4', '978-9971-5-0210-2', '978-960-425-059-2', '978-80-902734-1-2',
'978-85-359-0277-8', '978-1-84356-028-9', '978-0-684-84328-5', '978-0-8044-2957-3',
'978-0-85131-041-1', '978-93-86954-21-3', '978-0-943396-04-0', '978-0-9752298-0-4', '9798531132178',
'9798577456832', '979-8-886-45174-0', '9781319145446']

samples.each do |isbn|
actual = Detector::StandardIdentifiers.new(isbn).detections
Expand All @@ -35,8 +49,21 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
end
end

test 'ISBN-13 examples with incorrect check digits are not detected' do
samples = ['978-99921-58-10-1', '978-9971-5-0210-1', '978-960-425-059-1', '978-80-902734-1-1',
'978-85-359-0277-1', '978-1-84356-028-1', '978-0-684-84328-1', '978-0-8044-2957-1',
'978-0-85131-041-2', '978-93-86954-21-1', '978-0-943396-04-1', '978-0-9752298-0-1', '9798531132171',
'9798577456831', '979-8-886-45174-1', '9781319145441']

samples.each do |isbn|
actual = Detector::StandardIdentifiers.new(isbn).detections

assert_nil(actual[:isbn])
end
end

test 'not ISBNs' do
samples = ['orange cats like popcorn', '1234-6798', 'another ISBN not found here']
samples = ['orange cats like popcorn', '1234-6798', 'another ISBN not found here', '99921-58-10-1', '979-8-886-45174-1']

samples.each do |notisbn|
actual = Detector::StandardIdentifiers.new(notisbn).detections
Expand Down

0 comments on commit 1c8ce6d

Please sign in to comment.