added use_embedding_model_tokenizer option and fixed wordcloud issues
ddangelov committed Jan 9, 2021
1 parent e67b210 commit 9686e44
Showing 6 changed files with 57 additions and 29 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -29,7 +29,7 @@
author = 'Dimo Angelov'

# The full version, including alpha/beta/rc tags
-release = '1.0.19'
+release = '1.0.20'


# -- General configuration ---------------------------------------------------
3 changes: 2 additions & 1 deletion requirements.txt
@@ -10,4 +10,5 @@ tensorflow_hub
tensorflow_text
torch
sentence_transformers
-hnswlib
+hnswlib
+joblib<1.0.0
3 changes: 2 additions & 1 deletion setup.py
@@ -6,7 +6,7 @@
setuptools.setup(
name="top2vec",
packages=["top2vec"],
version="1.0.19",
version="1.0.20",
author="Dimo Angelov",
author_email="dimo.angelov@gmail.com",
description="Top2Vec learns jointly embedded topic, document and word vectors.",
@@ -33,6 +33,7 @@
'umap-learn',
'hdbscan',
'wordcloud',
+'joblib < 1.0.0',
],
extras_require={
'sentence_encoders': [
47 changes: 31 additions & 16 deletions top2vec/Top2Vec.py
@@ -16,6 +16,7 @@
import tempfile
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
+from scipy.special import softmax

try:
import hnswlib
@@ -90,8 +91,9 @@ class Top2Vec:
The distiluse-base-multilingual-cased pre-trained sentence transformer
is suggested for multilingual datasets and languages that are not
-covered by the multilingual universal sentence encoder. The transformer
-is significantly slower than the universal sentence encoder options.
+covered by the multilingual universal sentence encoder. The
+transformer is significantly slower than the universal sentence
+encoder options.
For more information on distiluse-base-multilingual-cased visit:
https://www.sbert.net/docs/pretrained_models.html
@@ -119,9 +121,9 @@ class Top2Vec:
It will determine how fast the model takes to train. The
fast-learn option is the fastest and will generate the lowest quality
vectors. The learn option will learn better quality vectors but take
-a longer time to train. The deep-learn option will learn the best quality
-vectors but will take significant time to train. The valid string speed
-options are:
+a longer time to train. The deep-learn option will learn the best
+quality vectors but will take significant time to train. The valid
+string speed options are:
* fast-learn
* learn
@@ -131,10 +133,10 @@
This parameter is only used when using doc2vec as embedding_model.
-Setting use_corpus_file to True can sometimes provide speedup for large
-datasets when multiple worker threads are available. Documents are
-still passed to the model as a list of str, the model will create a
-temporary corpus file for training.
+Setting use_corpus_file to True can sometimes provide speedup for
+large datasets when multiple worker threads are available. Documents
+are still passed to the model as a list of str, the model will create
+a temporary corpus file for training.
document_ids: List of str, int (Optional)
A unique value per document that will be used for referring to
@@ -144,7 +146,8 @@
keep_documents: bool (Optional, default True)
If set to False documents will only be used for training and not saved
as part of the model. This will reduce model size. When using search
-functions only document ids will be returned, not the actual documents.
+functions only document ids will be returned, not the actual
+documents.
workers: int (Optional)
The amount of worker threads to be used in training the model. Larger
@@ -153,6 +156,12 @@
tokenizer: callable (Optional, default None)
Override the default tokenization method. If None then
gensim.utils.simple_preprocess will be used.
+use_embedding_model_tokenizer: bool (Optional, default False)
+If using an embedding model other than doc2vec, use the model's
+tokenizer for document embedding. If set to True the tokenizer, either
+default or passed callable, will be used to tokenize the text to
+extract the vocabulary for word embedding.
verbose: bool (Optional, default True)
Whether to print status data during training.
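
For orientation, a minimal usage sketch of the new use_embedding_model_tokenizer option (not part of the diff; the 20 newsgroups data and keyword arguments echo the new tests further down, other details are assumptions):

```python
from top2vec import Top2Vec
from sklearn.datasets import fetch_20newsgroups

# Illustrative data, echoing the test file touched by this commit.
newsgroups = fetch_20newsgroups(subset='all',
                                remove=('headers', 'footers', 'quotes'))

# With use_embedding_model_tokenizer=True the embedding model receives the
# raw document strings; the default (or passed) tokenizer is still used to
# extract the vocabulary for word embedding.
model = Top2Vec(documents=newsgroups.data,
                embedding_model='universal-sentence-encoder',
                use_embedding_model_tokenizer=True)
```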
@@ -169,6 +178,7 @@ def __init__(self,
keep_documents=True,
workers=None,
tokenizer=None,
+use_embedding_model_tokenizer=False,
verbose=True):

if verbose:
@@ -320,7 +330,10 @@ def __init__(self,
self.word_vectors = self._l2_normalize(np.array(self.embed(self.vocab)))

# embed documents
-self.document_vectors = self._embed_documents(train_corpus)
+if use_embedding_model_tokenizer:
+    self.document_vectors = self._embed_documents(documents)
+else:
+    self.document_vectors = self._embed_documents(train_corpus)

else:
raise ValueError(f"{embedding_model} is an invalid embedding model.")
@@ -430,7 +443,7 @@ def load(cls, file):
if not _HAVE_HNSWLIB:
raise ImportError(f"Cannot load document index.\n\n"
"Try: pip install top2vec[indexing]\n\n"
"Alternatively try: pip hnswlib")
"Alternatively try: pip install hnswlib")

temp = tempfile.NamedTemporaryFile(mode='w+b')
temp.write(top2vec_model.serialized_document_index)
@@ -452,7 +465,7 @@ def load(cls, file):
if not _HAVE_HNSWLIB:
raise ImportError(f"Cannot load word index.\n\n"
"Try: pip install top2vec[indexing]\n\n"
"Alternatively try: pip hnswlib")
"Alternatively try: pip install hnswlib")

temp = tempfile.NamedTemporaryFile(mode='w+b')
temp.write(top2vec_model.serialized_word_index)
@@ -735,7 +748,7 @@ def _check_hnswlib_status():
if not _HAVE_HNSWLIB:
raise ImportError(f"Indexing is not available.\n\n"
"Try: pip install top2vec[indexing]\n\n"
"Alternatively try: pip hnswlib")
"Alternatively try: pip install hnswlib")

def _check_document_index_status(self):
if self.document_index is None:
@@ -2084,10 +2097,12 @@ def generate_topic_wordcloud(self, topic_num, background_color="black", reduced=
if reduced:
self._validate_hierarchical_reduction()
self._validate_topic_num(topic_num, reduced)
-word_score_dict = dict(zip(self.topic_words_reduced[topic_num], self.topic_word_scores_reduced[topic_num]))
+word_score_dict = dict(zip(self.topic_words_reduced[topic_num],
+                           softmax(self.topic_word_scores_reduced[topic_num])))
else:
self._validate_topic_num(topic_num, reduced)
-word_score_dict = dict(zip(self.topic_words[topic_num], self.topic_word_scores[topic_num]))
+word_score_dict = dict(zip(self.topic_words[topic_num],
+                           softmax(self.topic_word_scores[topic_num])))

plt.figure(figsize=(16, 4),
dpi=200)
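
The softmax calls above are the wordcloud fix named in the commit title: topic-word scores are similarity values that can be negative, which WordCloud's frequency-based API does not handle well, while softmax maps them to strictly positive weights. A standalone sketch of the same transformation, with made-up scores:

```python
import numpy as np
from scipy.special import softmax
from wordcloud import WordCloud

words = ["vector", "topic", "document", "cluster"]
scores = np.array([0.42, 0.31, 0.10, -0.07])  # made-up similarity scores

# softmax yields strictly positive weights summing to 1, which
# generate_from_frequencies can always render.
word_score_dict = dict(zip(words, softmax(scores)))
cloud = WordCloud(width=1600, height=400,
                  background_color="black").generate_from_frequencies(word_score_dict)
```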
2 changes: 1 addition & 1 deletion top2vec/__init__.py
@@ -1,3 +1,3 @@
from top2vec.Top2Vec import Top2Vec

-__version__ = '1.0.19'
+__version__ = '1.0.20'
29 changes: 20 additions & 9 deletions top2vec/tests/test_top2vec.py
@@ -1,5 +1,5 @@
import pytest
-from top2vec import Top2Vec
+from top2vec.Top2Vec import Top2Vec
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import tempfile
@@ -24,16 +24,27 @@
# test USE
top2vec_use = Top2Vec(documents=newsgroups_documents, embedding_model='universal-sentence-encoder')

+# test USE with model embedding
+top2vec_use_model_embedding = Top2Vec(documents=newsgroups_documents,
+                                      embedding_model='universal-sentence-encoder',
+                                      use_embedding_model_tokenizer=True)

# test USE-multilang
top2vec_use_multilang = Top2Vec(documents=newsgroups_documents,
embedding_model='universal-sentence-encoder-multilingual')

-# test USE-multilang
+# test Sentence Transformer-multilang
top2vec_transformer_multilang = Top2Vec(documents=newsgroups_documents,
embedding_model='distiluse-base-multilingual-cased')

+# test Sentence Transformer with model embedding
+top2vec_transformer_model_embedding = Top2Vec(documents=newsgroups_documents,
+                                              embedding_model='distiluse-base-multilingual-cased',
+                                              use_embedding_model_tokenizer=True)

models = [top2vec, top2vec_docids, top2vec_no_docs, top2vec_corpus_file,
-          top2vec_use, top2vec_use_multilang, top2vec_transformer_multilang]
+          top2vec_use, top2vec_use_multilang, top2vec_transformer_multilang,
+          top2vec_use_model_embedding, top2vec_transformer_model_embedding]


def get_model_vocab(top2vec_model):
@@ -177,12 +188,12 @@ def test_get_topic_size(top2vec_model, reduced):
assert all(topic_sizes[i] >= topic_sizes[i + 1] for i in range(len(topic_sizes) - 1))


-# @pytest.mark.parametrize('top2vec_model', models)
-# @pytest.mark.parametrize('reduced', [False, True])
-# def test_generate_topic_wordcloud(top2vec_model, reduced):
-#     # generate word cloud
-#     num_topics = top2vec_model.get_num_topics(reduced=reduced)
-#     top2vec_model.generate_topic_wordcloud(num_topics - 1, reduced=reduced)
+@pytest.mark.parametrize('top2vec_model', models)
+@pytest.mark.parametrize('reduced', [False, True])
+def test_generate_topic_wordcloud(top2vec_model, reduced):
+    # generate word cloud
+    num_topics = top2vec_model.get_num_topics(reduced=reduced)
+    top2vec_model.generate_topic_wordcloud(num_topics - 1, reduced=reduced)


@pytest.mark.parametrize('top2vec_model', models)
