Commit 059e0a4

addressing #90, #125, #126
ddangelov committed Feb 5, 2021
1 parent 9686e44 commit 059e0a4
Showing 5 changed files with 54 additions and 25 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -29,7 +29,7 @@
author = 'Dimo Angelov'

# The full version, including alpha/beta/rc tags
release = '1.0.20'
release = '1.0.21'


# -- General configuration ---------------------------------------------------
3 changes: 2 additions & 1 deletion requirements.txt
@@ -11,4 +11,5 @@ tensorflow_text
torch
sentence_transformers
hnswlib
joblib<1.0.0
joblib<1.0.0
numpy===1.19.2
3 changes: 2 additions & 1 deletion setup.py
@@ -6,7 +6,7 @@
setuptools.setup(
name="top2vec",
packages=["top2vec"],
version="1.0.20",
version="1.0.21",
author="Dimo Angelov",
author_email="dimo.angelov@gmail.com",
description="Top2Vec learns jointly embedded topic, document and word vectors.",
@@ -34,6 +34,7 @@
'hdbscan',
'wordcloud',
'joblib < 1.0.0',
'numpy == 1.19.2',
],
extras_require={
'sentence_encoders': [
69 changes: 48 additions & 21 deletions top2vec/Top2Vec.py
@@ -103,7 +103,6 @@ class Top2Vec:
default. However they can also be uploaded from a file that is in the
location of embedding_model_path.
Warning: the model at embedding_model_path must match the
Warning: the model at embedding_model_path must match the
embedding_model parameter type.
@@ -162,6 +161,12 @@ class Top2Vec:
tokenizer for document embedding. If set to True the tokenizer, either
default or passed callable will be used to tokenize the text to
extract the vocabulary for word embedding.
umap_args: dict (Optional, default None)
Pass custom arguments to UMAP.
hdbscan_args: dict (Optional, default None)
Pass custom arguments to HDBSCAN.
verbose: bool (Optional, default True)
Whether to print status data during training.
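For illustration only, a minimal sketch of how the umap_args and hdbscan_args parameters introduced in this commit might be passed to the constructor; the corpus and the dictionary values below are placeholders, not part of the diff. Note that a user-supplied dict replaces the built-in defaults (shown further down in this file) rather than being merged with them, so every key you care about must be given.

from top2vec import Top2Vec

docs = ["sample document one", "sample document two"]  # placeholder corpus

# custom UMAP/HDBSCAN settings fully replace the defaults used when the
# arguments are left as None
model = Top2Vec(documents=docs,
                umap_args={'n_neighbors': 10,
                           'n_components': 5,
                           'metric': 'cosine'},
                hdbscan_args={'min_cluster_size': 20,
                              'metric': 'euclidean',
                              'cluster_selection_method': 'eom'},
                verbose=True)
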
@@ -179,7 +184,10 @@ def __init__(self,
workers=None,
tokenizer=None,
use_embedding_model_tokenizer=False,
verbose=True):
umap_args=None,
hdbscan_args=None,
verbose=True
):

if verbose:
logger.setLevel(logging.DEBUG)
@@ -188,10 +196,8 @@ def __init__(self,
logger.setLevel(logging.WARNING)
self.verbose = False

if tokenizer is not None:
self._tokenizer = tokenizer
else:
self._tokenizer = default_tokenizer
if tokenizer is None:
tokenizer = default_tokenizer

# validate documents
if not (isinstance(documents, list) or isinstance(documents, np.ndarray)):
@@ -280,14 +286,14 @@ def __init__(self,
logger.info('Pre-processing documents for training')

if use_corpus_file:
processed = [' '.join(self._tokenizer(doc)) for doc in documents]
processed = [' '.join(tokenizer(doc)) for doc in documents]
lines = "\n".join(processed)
temp = tempfile.NamedTemporaryFile(mode='w+t')
temp.write(lines)
doc2vec_args["corpus_file"] = temp.name

else:
train_corpus = [TaggedDocument(self._tokenizer(doc), [i]) for i, doc in enumerate(documents)]
train_corpus = [TaggedDocument(tokenizer(doc), [i]) for i, doc in enumerate(documents)]
doc2vec_args["documents"] = train_corpus

logger.info('Creating joint document/word embedding')
@@ -307,7 +313,7 @@ def __init__(self,
logger.info('Pre-processing documents for training')

# preprocess documents
train_corpus = [' '.join(self._tokenizer(doc)) for doc in documents]
train_corpus = [' '.join(tokenizer(doc)) for doc in documents]

# preprocess vocabulary
vectorizer = CountVectorizer()
@@ -340,15 +346,23 @@ def __init__(self,

# create 5D embeddings of documents
logger.info('Creating lower dimension embedding of documents')
umap_model = umap.UMAP(n_neighbors=15,
n_components=5,
metric='cosine').fit(self._get_document_vectors(norm=False))

if umap_args is None:
umap_args = {'n_neighbors': 15,
'n_components': 5,
'metric': 'cosine'}

umap_model = umap.UMAP(**umap_args).fit(self._get_document_vectors(norm=False))

# find dense areas of document vectors
logger.info('Finding dense areas of documents')
cluster = hdbscan.HDBSCAN(min_cluster_size=15,
metric='euclidean',
cluster_selection_method='eom').fit(umap_model.embedding_)

if hdbscan_args is None:
hdbscan_args = {'min_cluster_size': 15,
'metric': 'euclidean',
'cluster_selection_method': 'eom'}

cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)

# calculate topic vectors from dense areas of documents
logger.info('Finding topics')
@@ -402,6 +416,7 @@ def save(self, file):
file: str
File where model will be saved.
"""

# do not save sentence encoders and sentence transformers
if self.embedding_model != "doc2vec":
self.embed = None
@@ -1101,7 +1116,7 @@ def get_documents_topics(self, doc_ids, reduced=False):

return doc_topics, doc_dist, topic_words, topic_word_scores

def add_documents(self, documents, doc_ids=None):
def add_documents(self, documents, doc_ids=None, tokenizer=None, use_embedding_model_tokenizer=False):
"""
Update the model with new documents.
@@ -1117,12 +1132,22 @@ def add_documents(self, documents, doc_ids=None):
documents: List of str
doc_ids: List of str, int (Optional)
Only required when doc_ids were given to the original model.
A unique value per document that will be used for referring to
documents in search results.
tokenizer: callable (Optional, default None)
Override the default tokenization method. If None then
gensim.utils.simple_preprocess will be used.
use_embedding_model_tokenizer: bool (Optional, default False)
If using an embedding model other than doc2vec, use the model's
tokenizer for document embedding.
"""
# if tokenizer is not passed use default
if tokenizer is None:
tokenizer = default_tokenizer

# add documents
self._validate_documents(documents)
@@ -1146,10 +1171,8 @@ def add_documents(self, documents, doc_ids=None):
else:
raise ValueError("doc_ids cannot be used because they were not provided to model during training.")

# get document vectors
docs_processed = [self._tokenizer(doc) for doc in documents]

if self.embedding_model == "doc2vec":
docs_processed = [tokenizer(doc) for doc in documents]
document_vectors = np.vstack([self.model.infer_vector(doc_words=doc,
alpha=0.025,
min_alpha=0.01,
@@ -1162,7 +1185,11 @@ def add_documents(self, documents, doc_ids=None):
self.model.docvecs.init_sims()

else:
docs_training = [' '.join(doc) for doc in docs_processed]
if use_embedding_model_tokenizer:
docs_training = documents
else:
docs_processed = [tokenizer(doc) for doc in documents]
docs_training = [' '.join(doc) for doc in docs_processed]
document_vectors = self._embed_documents(docs_training)
self._set_document_vectors(np.vstack([self._get_document_vectors(), document_vectors]))
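To illustrate the updated add_documents signature documented above, a hedged sketch of calling it with the new tokenizer and use_embedding_model_tokenizer options; it assumes model is an already trained Top2Vec instance, and the tokenizer and document list below are placeholders, not part of the diff.

def whitespace_tokenizer(doc):
    # placeholder tokenizer; gensim.utils.simple_preprocess remains the default
    return doc.lower().split()

new_docs = ["another document to add", "one more document"]  # placeholder documents

# doc_ids are only required if they were provided when the model was trained
model.add_documents(new_docs, tokenizer=whitespace_tokenizer)

# with a non-doc2vec embedding model, the model's own tokenizer can be used instead
model.add_documents(new_docs, use_embedding_model_tokenizer=True)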

2 changes: 1 addition & 1 deletion top2vec/__init__.py
@@ -1,3 +1,3 @@
from top2vec.Top2Vec import Top2Vec

__version__ = '1.0.20'
__version__ = '1.0.21'
