added use_embedding_model_tokenizer option and fixed wordcloud issues
ddangelov committed Jan 9, 2021
1 parent e67b210 commit 9686e44
Showing 6 changed files with 57 additions and 29 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -29,7 +29,7 @@
author = 'Dimo Angelov'

# The full version, including alpha/beta/rc tags
-release = '1.0.19'
+release = '1.0.20'


# -- General configuration ---------------------------------------------------
3 changes: 2 additions & 1 deletion requirements.txt
@@ -10,4 +10,5 @@ tensorflow_hub
tensorflow_text
torch
sentence_transformers
-hnswlib
+hnswlib
+joblib<1.0.0
3 changes: 2 additions & 1 deletion setup.py
@@ -6,7 +6,7 @@
setuptools.setup(
name="top2vec",
packages=["top2vec"],
version="1.0.19",
version="1.0.20",
author="Dimo Angelov",
author_email="dimo.angelov@gmail.com",
description="Top2Vec learns jointly embedded topic, document and word vectors.",
@@ -33,6 +33,7 @@
'umap-learn',
'hdbscan',
'wordcloud',
+'joblib < 1.0.0',
],
extras_require={
'sentence_encoders': [
47 changes: 31 additions & 16 deletions top2vec/Top2Vec.py
@@ -16,6 +16,7 @@
import tempfile
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
+from scipy.special import softmax

try:
import hnswlib
@@ -90,8 +91,9 @@ class Top2Vec:
The distiluse-base-multilingual-cased pre-trained sentence transformer
is suggested for multilingual datasets and languages that are not
-covered by the multilingual universal sentence encoder. The transformer
-is significantly slower than the universal sentence encoder options.
+covered by the multilingual universal sentence encoder. The
+transformer is significantly slower than the universal sentence
+encoder options.
For more information on distiluse-base-multilingual-cased visit:
https://www.sbert.net/docs/pretrained_models.html
@@ -119,9 +121,9 @@ class Top2Vec:
It will determine how fast the model takes to train. The
fast-learn option is the fastest and will generate the lowest quality
vectors. The learn option will learn better quality vectors but take
-a longer time to train. The deep-learn option will learn the best quality
-vectors but will take significant time to train. The valid string speed
-options are:
+a longer time to train. The deep-learn option will learn the best
+quality vectors but will take significant time to train. The valid
+string speed options are:
* fast-learn
* learn
@@ -131,10 +133,10 @@
This parameter is only used when using doc2vec as embedding_model.
-Setting use_corpus_file to True can sometimes provide speedup for large
-datasets when multiple worker threads are available. Documents are
-still passed to the model as a list of str, the model will create a
-temporary corpus file for training.
+Setting use_corpus_file to True can sometimes provide speedup for
+large datasets when multiple worker threads are available. Documents
+are still passed to the model as a list of str, the model will create
+a temporary corpus file for training.
document_ids: List of str, int (Optional)
A unique value per document that will be used for referring to
@@ -144,7 +146,8 @@
keep_documents: bool (Optional, default True)
If set to False documents will only be used for training and not saved
as part of the model. This will reduce model size. When using search
-functions only document ids will be returned, not the actual documents.
+functions only document ids will be returned, not the actual
+documents.
workers: int (Optional)
The amount of worker threads to be used in training the model. Larger
@@ -153,6 +156,12 @@
tokenizer: callable (Optional, default None)
Override the default tokenization method. If None then
gensim.utils.simple_preprocess will be used.
+use_embedding_model_tokenizer: bool (Optional, default False)
+If using an embedding model other than doc2vec, use the model's
+tokenizer for document embedding. If set to True the tokenizer, either
+default or passed callable, will be used to tokenize the text to
+extract the vocabulary for word embedding.
verbose: bool (Optional, default True)
Whether to print status data during training.
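
For orientation, a minimal usage sketch of the new use_embedding_model_tokenizer option (not part of the diff; the 20 newsgroups data and keyword arguments echo the new tests further down, other details are assumptions):

```python
from top2vec import Top2Vec
from sklearn.datasets import fetch_20newsgroups

# Illustrative data, echoing the test file touched by this commit.
newsgroups = fetch_20newsgroups(subset='all',
                                remove=('headers', 'footers', 'quotes'))

# With use_embedding_model_tokenizer=True the embedding model receives the
# raw document strings; the default (or passed) tokenizer is still used to
# extract the vocabulary for word embedding.
model = Top2Vec(documents=newsgroups.data,
                embedding_model='universal-sentence-encoder',
                use_embedding_model_tokenizer=True)
```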
@@ -169,6 +178,7 @@ def __init__(self,
keep_documents=True,
workers=None,
tokenizer=None,
+use_embedding_model_tokenizer=False,
verbose=True):

if verbose:
@@ -320,7 +330,10 @@ def __init__(self,
self.word_vectors = self._l2_normalize(np.array(self.embed(self.vocab)))

# embed documents
-self.document_vectors = self._embed_documents(train_corpus)
+if use_embedding_model_tokenizer:
+    self.document_vectors = self._embed_documents(documents)
+else:
+    self.document_vectors = self._embed_documents(train_corpus)

else:
raise ValueError(f"{embedding_model} is an invalid embedding model.")
@@ -430,7 +443,7 @@ def load(cls, file):
if not _HAVE_HNSWLIB:
raise ImportError(f"Cannot load document index.\n\n"
"Try: pip install top2vec[indexing]\n\n"
"Alternatively try: pip hnswlib")
"Alternatively try: pip install hnswlib")

temp = tempfile.NamedTemporaryFile(mode='w+b')
temp.write(top2vec_model.serialized_document_index)
@@ -452,7 +465,7 @@ def load(cls, file):
if not _HAVE_HNSWLIB:
raise ImportError(f"Cannot load word index.\n\n"
"Try: pip install top2vec[indexing]\n\n"
"Alternatively try: pip hnswlib")
"Alternatively try: pip install hnswlib")

temp = tempfile.NamedTemporaryFile(mode='w+b')
temp.write(top2vec_model.serialized_word_index)
@@ -735,7 +748,7 @@ def _check_hnswlib_status():
if not _HAVE_HNSWLIB:
raise ImportError(f"Indexing is not available.\n\n"
"Try: pip install top2vec[indexing]\n\n"
"Alternatively try: pip hnswlib")
"Alternatively try: pip install hnswlib")

def _check_document_index_status(self):
if self.document_index is None:
@@ -2084,10 +2097,12 @@ def generate_topic_wordcloud(self, topic_num, background_color="black", reduced=
if reduced:
self._validate_hierarchical_reduction()
self._validate_topic_num(topic_num, reduced)
-word_score_dict = dict(zip(self.topic_words_reduced[topic_num], self.topic_word_scores_reduced[topic_num]))
+word_score_dict = dict(zip(self.topic_words_reduced[topic_num],
+                           softmax(self.topic_word_scores_reduced[topic_num])))
else:
self._validate_topic_num(topic_num, reduced)
-word_score_dict = dict(zip(self.topic_words[topic_num], self.topic_word_scores[topic_num]))
+word_score_dict = dict(zip(self.topic_words[topic_num],
+                           softmax(self.topic_word_scores[topic_num])))

plt.figure(figsize=(16, 4),
dpi=200)
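
The softmax calls above are the wordcloud fix named in the commit title: topic-word scores are similarity values that can be negative, which WordCloud's frequency-based API does not handle well, while softmax maps them to strictly positive weights. A standalone sketch of the same transformation, with made-up scores:

```python
import numpy as np
from scipy.special import softmax
from wordcloud import WordCloud

words = ["vector", "topic", "document", "cluster"]
scores = np.array([0.42, 0.31, 0.10, -0.07])  # made-up similarity scores

# softmax yields strictly positive weights summing to 1, which
# generate_from_frequencies can always render.
word_score_dict = dict(zip(words, softmax(scores)))
cloud = WordCloud(width=1600, height=400,
                  background_color="black").generate_from_frequencies(word_score_dict)
```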
2 changes: 1 addition & 1 deletion top2vec/__init__.py
@@ -1,3 +1,3 @@
from top2vec.Top2Vec import Top2Vec

-__version__ = '1.0.19'
+__version__ = '1.0.20'
29 changes: 20 additions & 9 deletions top2vec/tests/test_top2vec.py
@@ -1,5 +1,5 @@
import pytest
-from top2vec import Top2Vec
+from top2vec.Top2Vec import Top2Vec
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import tempfile
@@ -24,16 +24,27 @@
# test USE
top2vec_use = Top2Vec(documents=newsgroups_documents, embedding_model='universal-sentence-encoder')

+# test USE with model embedding
+top2vec_use_model_embedding = Top2Vec(documents=newsgroups_documents,
+                                      embedding_model='universal-sentence-encoder',
+                                      use_embedding_model_tokenizer=True)

# test USE-multilang
top2vec_use_multilang = Top2Vec(documents=newsgroups_documents,
embedding_model='universal-sentence-encoder-multilingual')

-# test USE-multilang
+# test Sentence Transformer-multilang
top2vec_transformer_multilang = Top2Vec(documents=newsgroups_documents,
embedding_model='distiluse-base-multilingual-cased')

+# test Sentence Transformer with model embedding
+top2vec_transformer_model_embedding = Top2Vec(documents=newsgroups_documents,
+                                              embedding_model='distiluse-base-multilingual-cased',
+                                              use_embedding_model_tokenizer=True)

models = [top2vec, top2vec_docids, top2vec_no_docs, top2vec_corpus_file,
-          top2vec_use, top2vec_use_multilang, top2vec_transformer_multilang]
+          top2vec_use, top2vec_use_multilang, top2vec_transformer_multilang,
+          top2vec_use_model_embedding, top2vec_transformer_model_embedding]


def get_model_vocab(top2vec_model):
@@ -177,12 +188,12 @@ def test_get_topic_size(top2vec_model, reduced):
assert all(topic_sizes[i] >= topic_sizes[i + 1] for i in range(len(topic_sizes) - 1))


-# @pytest.mark.parametrize('top2vec_model', models)
-# @pytest.mark.parametrize('reduced', [False, True])
-# def test_generate_topic_wordcloud(top2vec_model, reduced):
-#     # generate word cloud
-#     num_topics = top2vec_model.get_num_topics(reduced=reduced)
-#     top2vec_model.generate_topic_wordcloud(num_topics - 1, reduced=reduced)
+@pytest.mark.parametrize('top2vec_model', models)
+@pytest.mark.parametrize('reduced', [False, True])
+def test_generate_topic_wordcloud(top2vec_model, reduced):
+    # generate word cloud
+    num_topics = top2vec_model.get_num_topics(reduced=reduced)
+    top2vec_model.generate_topic_wordcloud(num_topics - 1, reduced=reduced)


@pytest.mark.parametrize('top2vec_model', models)
