Commit 059e0a4

addressing #90, #125, #126
ddangelov committed Feb 5, 2021
1 parent 9686e44 commit 059e0a4
Showing 5 changed files with 54 additions and 25 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -29,7 +29,7 @@
author = 'Dimo Angelov'

# The full version, including alpha/beta/rc tags
release = '1.0.20'
release = '1.0.21'


# -- General configuration ---------------------------------------------------
3 changes: 2 additions & 1 deletion requirements.txt
@@ -11,4 +11,5 @@ tensorflow_text
torch
sentence_transformers
hnswlib
joblib<1.0.0
joblib<1.0.0
numpy===1.19.2
3 changes: 2 additions & 1 deletion setup.py
@@ -6,7 +6,7 @@
setuptools.setup(
name="top2vec",
packages=["top2vec"],
version="1.0.20",
version="1.0.21",
author="Dimo Angelov",
author_email="dimo.angelov@gmail.com",
description="Top2Vec learns jointly embedded topic, document and word vectors.",
@@ -34,6 +34,7 @@
'hdbscan',
'wordcloud',
'joblib < 1.0.0',
'numpy == 1.19.2',
],
extras_require={
'sentence_encoders': [
69 changes: 48 additions & 21 deletions top2vec/Top2Vec.py
@@ -103,7 +103,6 @@ class Top2Vec:
default. However they can also be uploaded from a file that is in the
location of embedding_model_path.
Warning: the model at embedding_model_path must match the
Warning: the model at embedding_model_path must match the
embedding_model parameter type.
@@ -162,6 +161,12 @@ class Top2Vec:
tokenizer for document embedding. If set to True the tokenizer, either
default or passed callable will be used to tokenize the text to
extract the vocabulary for word embedding.
umap_args: dict (Optional, default None)
Pass custom arguments to UMAP.
hdbscan_args: dict (Optional, default None)
Pass custom arguments to HDBSCAN.
verbose: bool (Optional, default True)
Whether to print status data during training.
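For illustration only, a minimal sketch of how the umap_args and hdbscan_args parameters introduced in this commit might be passed to the constructor; the corpus and the dictionary values below are placeholders, not part of the diff. Note that a user-supplied dict replaces the built-in defaults (shown further down in this file) rather than being merged with them, so every key you care about must be given.

from top2vec import Top2Vec

docs = ["sample document one", "sample document two"]  # placeholder corpus

# custom UMAP/HDBSCAN settings fully replace the defaults used when the
# arguments are left as None
model = Top2Vec(documents=docs,
                umap_args={'n_neighbors': 10,
                           'n_components': 5,
                           'metric': 'cosine'},
                hdbscan_args={'min_cluster_size': 20,
                              'metric': 'euclidean',
                              'cluster_selection_method': 'eom'},
                verbose=True)
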
@@ -179,7 +184,10 @@ def __init__(self,
workers=None,
tokenizer=None,
use_embedding_model_tokenizer=False,
verbose=True):
umap_args=None,
hdbscan_args=None,
verbose=True
):

if verbose:
logger.setLevel(logging.DEBUG)
@@ -188,10 +196,8 @@ def __init__(self,
logger.setLevel(logging.WARNING)
self.verbose = False

if tokenizer is not None:
self._tokenizer = tokenizer
else:
self._tokenizer = default_tokenizer
if tokenizer is None:
tokenizer = default_tokenizer

# validate documents
if not (isinstance(documents, list) or isinstance(documents, np.ndarray)):
@@ -280,14 +286,14 @@ def __init__(self,
logger.info('Pre-processing documents for training')

if use_corpus_file:
processed = [' '.join(self._tokenizer(doc)) for doc in documents]
processed = [' '.join(tokenizer(doc)) for doc in documents]
lines = "\n".join(processed)
temp = tempfile.NamedTemporaryFile(mode='w+t')
temp.write(lines)
doc2vec_args["corpus_file"] = temp.name

else:
train_corpus = [TaggedDocument(self._tokenizer(doc), [i]) for i, doc in enumerate(documents)]
train_corpus = [TaggedDocument(tokenizer(doc), [i]) for i, doc in enumerate(documents)]
doc2vec_args["documents"] = train_corpus

logger.info('Creating joint document/word embedding')
@@ -307,7 +313,7 @@ def __init__(self,
logger.info('Pre-processing documents for training')

# preprocess documents
train_corpus = [' '.join(self._tokenizer(doc)) for doc in documents]
train_corpus = [' '.join(tokenizer(doc)) for doc in documents]

# preprocess vocabulary
vectorizer = CountVectorizer()
@@ -340,15 +346,23 @@ def __init__(self,

# create 5D embeddings of documents
logger.info('Creating lower dimension embedding of documents')
umap_model = umap.UMAP(n_neighbors=15,
n_components=5,
metric='cosine').fit(self._get_document_vectors(norm=False))

if umap_args is None:
umap_args = {'n_neighbors': 15,
'n_components': 5,
'metric': 'cosine'}

umap_model = umap.UMAP(**umap_args).fit(self._get_document_vectors(norm=False))

# find dense areas of document vectors
logger.info('Finding dense areas of documents')
cluster = hdbscan.HDBSCAN(min_cluster_size=15,
metric='euclidean',
cluster_selection_method='eom').fit(umap_model.embedding_)

if hdbscan_args is None:
hdbscan_args = {'min_cluster_size': 15,
'metric': 'euclidean',
'cluster_selection_method': 'eom'}

cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)

# calculate topic vectors from dense areas of documents
logger.info('Finding topics')
@@ -402,6 +416,7 @@ def save(self, file):
file: str
File where model will be saved.
"""

# do not save sentence encoders and sentence transformers
if self.embedding_model != "doc2vec":
self.embed = None
@@ -1101,7 +1116,7 @@ def get_documents_topics(self, doc_ids, reduced=False):

return doc_topics, doc_dist, topic_words, topic_word_scores

def add_documents(self, documents, doc_ids=None):
def add_documents(self, documents, doc_ids=None, tokenizer=None, use_embedding_model_tokenizer=False):
"""
Update the model with new documents.
@@ -1117,12 +1132,22 @@ def add_documents(self, documents, doc_ids=None):
documents: List of str
doc_ids: List of str, int (Optional)
Only required when doc_ids were given to the original model.
A unique value per document that will be used for referring to
documents in search results.
tokenizer: callable (Optional, default None)
Override the default tokenization method. If None then
gensim.utils.simple_preprocess will be used.
use_embedding_model_tokenizer: bool (Optional, default False)
If using an embedding model other than doc2vec, use the model's
tokenizer for document embedding.
"""
# if tokenizer is not passed use default
if tokenizer is None:
tokenizer = default_tokenizer

# add documents
self._validate_documents(documents)
@@ -1146,10 +1171,8 @@ def add_documents(self, documents, doc_ids=None):
else:
raise ValueError("doc_ids cannot be used because they were not provided to model during training.")

# get document vectors
docs_processed = [self._tokenizer(doc) for doc in documents]

if self.embedding_model == "doc2vec":
docs_processed = [tokenizer(doc) for doc in documents]
document_vectors = np.vstack([self.model.infer_vector(doc_words=doc,
alpha=0.025,
min_alpha=0.01,
@@ -1162,7 +1185,11 @@ def add_documents(self, documents, doc_ids=None):
self.model.docvecs.init_sims()

else:
docs_training = [' '.join(doc) for doc in docs_processed]
if use_embedding_model_tokenizer:
docs_training = documents
else:
docs_processed = [tokenizer(doc) for doc in documents]
docs_training = [' '.join(doc) for doc in docs_processed]
document_vectors = self._embed_documents(docs_training)
self._set_document_vectors(np.vstack([self._get_document_vectors(), document_vectors]))
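To illustrate the updated add_documents signature documented above, a hedged sketch of calling it with the new tokenizer and use_embedding_model_tokenizer options; it assumes model is an already trained Top2Vec instance, and the tokenizer and document list below are placeholders, not part of the diff.

def whitespace_tokenizer(doc):
    # placeholder tokenizer; gensim.utils.simple_preprocess remains the default
    return doc.lower().split()

new_docs = ["another document to add", "one more document"]  # placeholder documents

# doc_ids are only required if they were provided when the model was trained
model.add_documents(new_docs, tokenizer=whitespace_tokenizer)

# with a non-doc2vec embedding model, the model's own tokenizer can be used instead
model.add_documents(new_docs, use_embedding_model_tokenizer=True)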

2 changes: 1 addition & 1 deletion top2vec/__init__.py
@@ -1,3 +1,3 @@
from top2vec.Top2Vec import Top2Vec

__version__ = '1.0.20'
__version__ = '1.0.21'
