diff --git a/lib/natural/tfidf/tfidf.js b/lib/natural/tfidf/tfidf.js index 9feca8fac..7faf42c24 100644 --- a/lib/natural/tfidf/tfidf.js +++ b/lib/natural/tfidf/tfidf.js @@ -178,7 +178,7 @@ class TfIdf { term, tf: TfIdf.tf(term, _this.documents[d]), idf: _this.idf(term), - tfidf: _this.tfidf(term, d) + tfidf: _this.tfidf([term], d) }) } } diff --git a/spec/tfidf_spec.ts b/spec/tfidf_spec.ts index 6419fbdde..e56feae1b 100644 --- a/spec/tfidf_spec.ts +++ b/spec/tfidf_spec.ts @@ -95,6 +95,17 @@ describe('tfidf', function () { }) }) + // Issue #634 - prevent tfidf from applying a tokenizer to terms that are already tokenized + describe('tfidf with tokenized terms', function () { + tfidf = new TfIdf() + tfidf.addDocument(['domain', 'google.com']) + const terms: TfIdfTerm[] = tfidf.listTerms(0) + it ('should list important terms correctly without tokenizing again', function () { + expect(terms[0].tfidf).toBeGreaterThan(0) + expect(terms[1].tfidf).toBeGreaterThan(0) + }) + }) + describe('special cases', function () { // In response to it('should handle reserved function names correctly in documents', function () {