From e19034b032c945b40dbbea45c3e7ab38d27765ae Mon Sep 17 00:00:00 2001 From: yasarshaikh78657 <56046615+yasarshaikh78657@users.noreply.github.com> Date: Thu, 1 Oct 2020 17:19:59 +0530 Subject: [PATCH 1/4] Delete DataPrep.py --- DataPrep.py | 172 ---------------------------------------------------- 1 file changed, 172 deletions(-) delete mode 100644 DataPrep.py diff --git a/DataPrep.py b/DataPrep.py deleted file mode 100644 index b281db3..0000000 --- a/DataPrep.py +++ /dev/null @@ -1,172 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Sat Nov 4 12:00:49 2017 - -@author: NishitP -""" -#import os -import pandas as pd -import csv -import numpy as np -import nltk -from nltk.stem import SnowballStemmer -from nltk.stem.porter import PorterStemmer -from nltk.tokenize import word_tokenize -import seaborn as sb - -#before reading the files, setup the working directory to point to project repo -#reading data files - - -test_filename = 'test.csv' -train_filename = 'train.csv' -valid_filename = 'valid.csv' - -train_news = pd.read_csv(train_filename) -test_news = pd.read_csv(test_filename) -valid_news = pd.read_csv(valid_filename) - - - -#data observation -def data_obs(): - print("training dataset size:") - print(train_news.shape) - print(train_news.head(10)) - - #below dataset were used for testing and validation purposes - print(test_news.shape) - print(test_news.head(10)) - - print(valid_news.shape) - print(valid_news.head(10)) - -#check the data by calling below function -#data_obs() - -#distribution of classes for prediction -def create_distribution(dataFile): - - return sb.countplot(x='Label', data=dataFile, palette='hls') - - -#by calling below we can see that training, test and valid data seems to be failry evenly distributed between the classes -create_distribution(train_news) -create_distribution(test_news) -create_distribution(valid_news) - - -#data integrity check (missing label values) -#none of the datasets contains missing values therefore no cleaning required -def data_qualityCheck(): - - print("Checking data qualitites...") - train_news.isnull().sum() - train_news.info() - - print("check finished.") - - #below datasets were used to - test_news.isnull().sum() - test_news.info() - - valid_news.isnull().sum() - valid_news.info() - -#run the below function call to see the quality check results -#data_qualityCheck() - - - -#eng_stemmer = SnowballStemmer('english') -#stopwords = set(nltk.corpus.stopwords.words('english')) - -#Stemming -def stem_tokens(tokens, stemmer): - stemmed = [] - for token in tokens: - stemmed.append(stemmer.stem(token)) - return stemmed - -#process the data -def process_data(data,exclude_stopword=True,stem=True): - tokens = [w.lower() for w in data] - tokens_stemmed = tokens - tokens_stemmed = stem_tokens(tokens, eng_stemmer) - tokens_stemmed = [w for w in tokens_stemmed if w not in stopwords ] - return tokens_stemmed - - -#creating ngrams -#unigram -def create_unigram(words): - assert type(words) == list - return words - -#bigram -def create_bigrams(words): - assert type(words) == list - skip = 0 - join_str = " " - Len = len(words) - if Len > 1: - lst = [] - for i in range(Len-1): - for k in range(1,skip+2): - if i+k < Len: - lst.append(join_str.join([words[i],words[i+k]])) - else: - #set it as unigram - lst = create_unigram(words) - return lst - -""" -#trigrams -def create_trigrams(words): - assert type(words) == list - skip == 0 - join_str = " " - Len = len(words) - if L > 2: - lst = [] - for i in range(1,skip+2): - for k1 in range(1, skip+2): - for k2 in 
range(1,skip+2): - for i+k1 < Len and i+k1+k2 < Len: - lst.append(join_str.join([words[i], words[i+k1],words[i+k1+k2])]) - else: - #set is as bigram - lst = create_bigram(words) - return lst -""" - - -porter = PorterStemmer() - -def tokenizer(text): - return text.split() - - -def tokenizer_porter(text): - return [porter.stem(word) for word in text.split()] - -#doc = ['runners like running and thus they run','this is a test for tokens'] -#tokenizer([word for line in test_news.iloc[:,1] for word in line.lower().split()]) - -#show the distribution of labels in the train and test data -"""def create_datafile(filename) - #function to slice the dataframe to keep variables necessary to be used for classification - return "return df to be used" -""" - -"""#converting multiclass labels present in our datasets to binary class labels -for i , row in data_TrainNews.iterrows(): - if (data_TrainNews.iloc[:,0] == "mostly-true" | data_TrainNews.iloc[:,0] == "half-true" | data_TrainNews.iloc[:,0] == "true"): - data_TrainNews.iloc[:,0] = "true" - else : - data_TrainNews.iloc[:,0] = "false" - -for i,row in data_TrainNews.iterrows(): - print(row) -""" - From e7ce858375bd31fca3c4d922bd8047abd7db7ad0 Mon Sep 17 00:00:00 2001 From: yasarshaikh78657 <56046615+yasarshaikh78657@users.noreply.github.com> Date: Thu, 1 Oct 2020 17:20:12 +0530 Subject: [PATCH 2/4] Delete FeatureSelection.py --- FeatureSelection.py | 159 -------------------------------------------- 1 file changed, 159 deletions(-) delete mode 100644 FeatureSelection.py diff --git a/FeatureSelection.py b/FeatureSelection.py deleted file mode 100644 index 4c27b39..0000000 --- a/FeatureSelection.py +++ /dev/null @@ -1,159 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Sat Nov 4 14:13:38 2017 - -@author: NishitP - -Note: before we can train an algorithm to classify fake news labels, we need to extract features from it. It means reducing the mass -of unstructured data into some uniform set of attributes that an algorithm can understand. For fake news detection, it could be -word counts (bag of words). 
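A minimal sketch of the bag-of-words idea (toy sentences assumed here, not this project's data): CountVectorizer maps each document to a row of term counts over a shared vocabulary.

    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> cv = CountVectorizer()
    >>> X = cv.fit_transform(['fake news spreads fast', 'real news spreads slowly'])
    >>> X.toarray().shape
    (2, 6)

Each of the 2 rows is a document and each of the 6 columns a vocabulary term; the same transformation is applied to the 'Statement' column below, and TfidfTransformer then re-weights those counts by inverse document frequency.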
-""" -import DataPrep -import pandas as pd -import numpy as np -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.feature_extraction.text import TfidfTransformer -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.pipeline import Pipeline -import nltk -import nltk.corpus -from nltk.tokenize import word_tokenize -from gensim.models.word2vec import Word2Vec - - -#we will start with simple bag of words technique -#creating feature vector - document term matrix -countV = CountVectorizer() -train_count = countV.fit_transform(DataPrep.train_news['Statement'].values) - -print(countV) -print(train_count) - -#print training doc term matrix -#we have matrix of size of (10240, 12196) by calling below -def get_countVectorizer_stats(): - - #vocab size - train_count.shape - - #check vocabulary using below command - print(countV.vocabulary_) - - #get feature names - print(countV.get_feature_names()[:25]) - - -#create tf-df frequency features -#tf-idf -tfidfV = TfidfTransformer() -train_tfidf = tfidfV.fit_transform(train_count) - -def get_tfidf_stats(): - train_tfidf.shape - #get train data feature names - print(train_tfidf.A[:10]) - - -#bag of words - with n-grams -#countV_ngram = CountVectorizer(ngram_range=(1,3),stop_words='english') -#tfidf_ngram = TfidfTransformer(use_idf=True,smooth_idf=True) - -tfidf_ngram = TfidfVectorizer(stop_words='english',ngram_range=(1,4),use_idf=True,smooth_idf=True) - - -#POS Tagging -tagged_sentences = nltk.corpus.treebank.tagged_sents() - -cutoff = int(.75 * len(tagged_sentences)) -training_sentences = DataPrep.train_news['Statement'] - -print(training_sentences) - -#training POS tagger based on words -def features(sentence, index): - """ sentence: [w1, w2, ...], index: the index of the word """ - return { - 'word': sentence[index], - 'is_first': index == 0, - 'is_last': index == len(sentence) - 1, - 'is_capitalized': sentence[index][0].upper() == sentence[index][0], - 'is_all_caps': sentence[index].upper() == sentence[index], - 'is_all_lower': sentence[index].lower() == sentence[index], - 'prefix-1': sentence[index][0], - 'prefix-2': sentence[index][:2], - 'prefix-3': sentence[index][:3], - 'suffix-1': sentence[index][-1], - 'suffix-2': sentence[index][-2:], - 'suffix-3': sentence[index][-3:], - 'prev_word': '' if index == 0 else sentence[index - 1], - 'next_word': '' if index == len(sentence) - 1 else sentence[index + 1], - 'has_hyphen': '-' in sentence[index], - 'is_numeric': sentence[index].isdigit(), - 'capitals_inside': sentence[index][1:].lower() != sentence[index][1:] - } - - -#helper function to strip tags from tagged corpus -def untag(tagged_sentence): - return [w for w, t in tagged_sentence] - - - -#Using Word2Vec -with open("glove.6B.50d.txt", "rb") as lines: - w2v = {line.split()[0]: np.array(map(float, line.split()[1:])) - for line in lines} - - - -#model = gensim.models.Word2Vec(X, size=100) # x be tokenized text -#w2v = dict(zip(model.wv.index2word, model.wv.syn0)) - - -class MeanEmbeddingVectorizer(object): - def __init__(self, word2vec): - self.word2vec = word2vec - # if a text is empty we should return a vector of zeros - # with the same dimensionality as all the other vectors - self.dim = len(word2vec.itervalues().next()) - - def fit(self, X, y): - return self - - def transform(self, X): - return np.array([ - np.mean([self.word2vec[w] for w in words if w in self.word2vec] - or [np.zeros(self.dim)], axis=0) - for words in X - ]) - - -""" -class TfidfEmbeddingVectorizer(object): - def __init__(self, 
word2vec): - self.word2vec = word2vec - self.word2weight = None - self.dim = len(word2vec.itervalues().next()) - - def fit(self, X, y): - tfidf = TfidfVectorizer(analyzer=lambda x: x) - tfidf.fit(X) - # if a word was never seen - it must be at least as infrequent - # as any of the known words - so the default idf is the max of - # known idf's - max_idf = max(tfidf.idf_) - self.word2weight = defaultdict( - lambda: max_idf, - [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()]) - - return self - - def transform(self, X): - return np.array([ - np.mean([self.word2vec[w] * self.word2weight[w] - for w in words if w in self.word2vec] or - [np.zeros(self.dim)], axis=0) - for words in X - ]) - -""" From da83cf6e349c1508c0ee8c908fa1b1898724fe17 Mon Sep 17 00:00:00 2001 From: yasarshaikh78657 <56046615+yasarshaikh78657@users.noreply.github.com> Date: Thu, 1 Oct 2020 17:20:26 +0530 Subject: [PATCH 3/4] Delete classifier.py --- classifier.py | 481 -------------------------------------------------- 1 file changed, 481 deletions(-) delete mode 100644 classifier.py diff --git a/classifier.py b/classifier.py deleted file mode 100644 index 9e82f57..0000000 --- a/classifier.py +++ /dev/null @@ -1,481 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Sun Nov 5 12:58:52 2017 - -@author: NishitP -""" - -import DataPrep -import FeatureSelection -import numpy as np -import pandas as pd -import pickle -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.feature_extraction.text import TfidfTransformer -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.pipeline import Pipeline -from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import LogisticRegression -from sklearn.linear_model import SGDClassifier -from sklearn import svm -from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import KFold -from sklearn.metrics import confusion_matrix, f1_score, classification_report -from sklearn.model_selection import GridSearchCV -from sklearn.model_selection import learning_curve -import matplotlib.pyplot as plt -from sklearn.metrics import precision_recall_curve -from sklearn.metrics import average_precision_score - -#string to test -doc_new = ['obama is running for president in 2016'] - -#the feature selection has been done in FeatureSelection.py module. 
here we will create models using those features for prediction - -#first we will use bag of words techniques - -#building classifier using naive bayes -nb_pipeline = Pipeline([ - ('NBCV',FeatureSelection.countV), - ('nb_clf',MultinomialNB())]) - -nb_pipeline.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label']) -predicted_nb = nb_pipeline.predict(DataPrep.test_news['Statement']) -np.mean(predicted_nb == DataPrep.test_news['Label']) - - -#building classifier using logistic regression -logR_pipeline = Pipeline([ - ('LogRCV',FeatureSelection.countV), - ('LogR_clf',LogisticRegression()) - ]) - -logR_pipeline.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label']) -predicted_LogR = logR_pipeline.predict(DataPrep.test_news['Statement']) -np.mean(predicted_LogR == DataPrep.test_news['Label']) - - -#building Linear SVM classfier -svm_pipeline = Pipeline([ - ('svmCV',FeatureSelection.countV), - ('svm_clf',svm.LinearSVC()) - ]) - -svm_pipeline.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label']) -predicted_svm = svm_pipeline.predict(DataPrep.test_news['Statement']) -np.mean(predicted_svm == DataPrep.test_news['Label']) - - -#using SVM Stochastic Gradient Descent on hinge loss -sgd_pipeline = Pipeline([ - ('svm2CV',FeatureSelection.countV), - ('svm2_clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5)) - ]) - -sgd_pipeline.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label']) -predicted_sgd = sgd_pipeline.predict(DataPrep.test_news['Statement']) -np.mean(predicted_sgd == DataPrep.test_news['Label']) - - -#random forest -random_forest = Pipeline([ - ('rfCV',FeatureSelection.countV), - ('rf_clf',RandomForestClassifier(n_estimators=200,n_jobs=3)) - ]) - -random_forest.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label']) -predicted_rf = random_forest.predict(DataPrep.test_news['Statement']) -np.mean(predicted_rf == DataPrep.test_news['Label']) - - -#User defined functon for K-Fold cross validatoin -def build_confusion_matrix(classifier): - - k_fold = KFold(n_splits=5) - scores = [] - confusion = np.array([[0,0],[0,0]]) - - for train_ind, test_ind in k_fold.split(DataPrep.train_news): - train_text = DataPrep.train_news.iloc[train_ind]['Statement'] - train_y = DataPrep.train_news.iloc[train_ind]['Label'] - - test_text = DataPrep.train_news.iloc[test_ind]['Statement'] - test_y = DataPrep.train_news.iloc[test_ind]['Label'] - - classifier.fit(train_text,train_y) - predictions = classifier.predict(test_text) - - confusion += confusion_matrix(test_y,predictions) - score = f1_score(test_y,predictions) - scores.append(score) - - return (print('Total statements classified:', len(DataPrep.train_news)), - print('Score:', sum(scores)/len(scores)), - print('score length', len(scores)), - print('Confusion matrix:'), - print(confusion)) - -#K-fold cross validation for all classifiers -build_confusion_matrix(nb_pipeline) -build_confusion_matrix(logR_pipeline) -build_confusion_matrix(svm_pipeline) -build_confusion_matrix(sgd_pipeline) -build_confusion_matrix(random_forest) - -#======================================================================================== -#Bag of words confusion matrix and F1 scores - -#Naive bayes -# [2118 2370] -# [1664 4088] -# f1-Score: 0.669611539651 - -#Logistic regression -# [2252 2236] -# [1933 3819] -# f1-Score: 0.646909097798 - -#svm -# [2260 2228] -# [2246 3506] -#f1-score: 0.610468748792 - -#sgdclassifier -# [2414 2074] -# [2042 3710] -# f1-Score: 0.640874558778 - -#random forest classifier -# [1821 
2667] -# [1192 4560] -# f1-Score: 0.702651511011 -#========================================================================================= - - -"""So far we have used bag of words technique to extract the features and passed those featuers into classifiers. We have also seen the -f1 scores of these classifiers. now lets enhance these features using term frequency weights with various n-grams -""" - -##Now using n-grams -#naive-bayes classifier -nb_pipeline_ngram = Pipeline([ - ('nb_tfidf',FeatureSelection.tfidf_ngram), - ('nb_clf',MultinomialNB())]) - -nb_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label']) -predicted_nb_ngram = nb_pipeline_ngram.predict(DataPrep.test_news['Statement']) -np.mean(predicted_nb_ngram == DataPrep.test_news['Label']) - - -#logistic regression classifier -logR_pipeline_ngram = Pipeline([ - ('LogR_tfidf',FeatureSelection.tfidf_ngram), - ('LogR_clf',LogisticRegression(penalty="l2",C=1)) - ]) - -logR_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label']) -predicted_LogR_ngram = logR_pipeline_ngram.predict(DataPrep.test_news['Statement']) -np.mean(predicted_LogR_ngram == DataPrep.test_news['Label']) - - -#linear SVM classifier -svm_pipeline_ngram = Pipeline([ - ('svm_tfidf',FeatureSelection.tfidf_ngram), - ('svm_clf',svm.LinearSVC()) - ]) - -svm_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label']) -predicted_svm_ngram = svm_pipeline_ngram.predict(DataPrep.test_news['Statement']) -np.mean(predicted_svm_ngram == DataPrep.test_news['Label']) - - -#sgd classifier -sgd_pipeline_ngram = Pipeline([ - ('sgd_tfidf',FeatureSelection.tfidf_ngram), - ('sgd_clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5)) - ]) - -sgd_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label']) -predicted_sgd_ngram = sgd_pipeline_ngram.predict(DataPrep.test_news['Statement']) -np.mean(predicted_sgd_ngram == DataPrep.test_news['Label']) - - -#random forest classifier -random_forest_ngram = Pipeline([ - ('rf_tfidf',FeatureSelection.tfidf_ngram), - ('rf_clf',RandomForestClassifier(n_estimators=300,n_jobs=3)) - ]) - -random_forest_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label']) -predicted_rf_ngram = random_forest_ngram.predict(DataPrep.test_news['Statement']) -np.mean(predicted_rf_ngram == DataPrep.test_news['Label']) - - -#K-fold cross validation for all classifiers -build_confusion_matrix(nb_pipeline_ngram) -build_confusion_matrix(logR_pipeline_ngram) -build_confusion_matrix(svm_pipeline_ngram) -build_confusion_matrix(sgd_pipeline_ngram) -build_confusion_matrix(random_forest_ngram) - -#======================================================================================== -#n-grams & tfidf confusion matrix and F1 scores - -#Naive bayes -# [841 3647] -# [427 5325] -# f1-Score: 0.723262051071 - -#Logistic regression -# [1617 2871] -# [1097 4655] -# f1-Score: 0.70113000531 - -#svm -# [2016 2472] -# [1524 4228] -# f1-Score: 0.67909201429 - -#sgdclassifier -# [ 10 4478] -# [ 13 5739] -# f1-Score: 0.718731637053 - -#random forest -# [1979 2509] -# [1630 4122] -# f1-Score: 0.665720333284 -#========================================================================================= - -print(classification_report(DataPrep.test_news['Label'], predicted_nb_ngram)) -print(classification_report(DataPrep.test_news['Label'], predicted_LogR_ngram)) -print(classification_report(DataPrep.test_news['Label'], predicted_svm_ngram)) 
-print(classification_report(DataPrep.test_news['Label'], predicted_sgd_ngram)) -print(classification_report(DataPrep.test_news['Label'], predicted_rf_ngram)) - -DataPrep.test_news['Label'].shape - -""" -Out of all the models fitted, we would take 2 best performing model. we would call them candidate models -from the confusion matrix, we can see that random forest and logistic regression are best performing -in terms of precision and recall (take a look into false positive and true negative counts which appeares -to be low compared to rest of the models) -""" - -#grid-search parameter optimization -#random forest classifier parameters -parameters = {'rf_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)], - 'rf_tfidf__use_idf': (True, False), - 'rf_clf__max_depth': (1,2,3,4,5,6,7,8,9,10,11,12,13,14,15) -} - -gs_clf = GridSearchCV(random_forest_ngram, parameters, n_jobs=-1) -gs_clf = gs_clf.fit(DataPrep.train_news['Statement'][:10000],DataPrep.train_news['Label'][:10000]) - -gs_clf.best_score_ -gs_clf.best_params_ -gs_clf.cv_results_ - -#logistic regression parameters -parameters = {'LogR_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)], - 'LogR_tfidf__use_idf': (True, False), - 'LogR_tfidf__smooth_idf': (True, False) -} - -gs_clf = GridSearchCV(logR_pipeline_ngram, parameters, n_jobs=-1) -gs_clf = gs_clf.fit(DataPrep.train_news['Statement'][:10000],DataPrep.train_news['Label'][:10000]) - -gs_clf.best_score_ -gs_clf.best_params_ -gs_clf.cv_results_ - -#Linear SVM -parameters = {'svm_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)], - 'svm_tfidf__use_idf': (True, False), - 'svm_tfidf__smooth_idf': (True, False), - 'svm_clf__penalty': ('l1','l2'), -} - -gs_clf = GridSearchCV(svm_pipeline_ngram, parameters, n_jobs=-1) -gs_clf = gs_clf.fit(DataPrep.train_news['Statement'][:10000],DataPrep.train_news['Label'][:10000]) - -gs_clf.best_score_ -gs_clf.best_params_ -gs_clf.cv_results_ - -#by running above commands we can find the model with best performing parameters - - -#running both random forest and logistic regression models again with best parameter found with GridSearch method -random_forest_final = Pipeline([ - ('rf_tfidf',TfidfVectorizer(stop_words='english',ngram_range=(1,3),use_idf=True,smooth_idf=True)), - ('rf_clf',RandomForestClassifier(n_estimators=300,n_jobs=3,max_depth=10)) - ]) - -random_forest_final.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label']) -predicted_rf_final = random_forest_final.predict(DataPrep.test_news['Statement']) -np.mean(predicted_rf_final == DataPrep.test_news['Label']) -print(metrics.classification_report(DataPrep.test_news['Label'], predicted_rf_final)) - -logR_pipeline_final = Pipeline([ - #('LogRCV',countV_ngram), - ('LogR_tfidf',TfidfVectorizer(stop_words='english',ngram_range=(1,5),use_idf=True,smooth_idf=False)), - ('LogR_clf',LogisticRegression(penalty="l2",C=1)) - ]) - -logR_pipeline_final.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label']) -predicted_LogR_final = logR_pipeline_final.predict(DataPrep.test_news['Statement']) -np.mean(predicted_LogR_final == DataPrep.test_news['Label']) -#accuracy = 0.62 -print(metrics.classification_report(DataPrep.test_news['Label'], predicted_LogR_final)) - - -""" -by running both random forest and logistic regression with GridSearch's best parameter estimation, we found that for random -forest model with n-gram has better accuracty than with the parameter estimated. 
The logistic regression model with best parameter -has almost similar performance as n-gram model so logistic regression will be out choice of model for prediction. -""" - -#saving best model to the disk -model_file = 'final_model.sav' -pickle.dump(logR_pipeline_ngram,open(model_file,'wb')) - - -#Plotting learing curve -def plot_learing_curve(pipeline,title): - size = 10000 - cv = KFold(size, shuffle=True) - - X = DataPrep.train_news["Statement"] - y = DataPrep.train_news["Label"] - - pl = pipeline - pl.fit(X,y) - - train_sizes, train_scores, test_scores = learning_curve(pl, X, y, n_jobs=-1, cv=cv, train_sizes=np.linspace(.1, 1.0, 5), verbose=0) - - train_scores_mean = np.mean(train_scores, axis=1) - train_scores_std = np.std(train_scores, axis=1) - test_scores_mean = np.mean(test_scores, axis=1) - test_scores_std = np.std(test_scores, axis=1) - - plt.figure() - plt.title(title) - plt.legend(loc="best") - plt.xlabel("Training examples") - plt.ylabel("Score") - plt.gca().invert_yaxis() - - # box-like grid - plt.grid() - - # plot the std deviation as a transparent range at each training set size - plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r") - plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g") - - # plot the average training and test score lines at each training set size - plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score") - plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score") - - # sizes the window for readability and displays the plot - # shows error from 0 to 1.1 - plt.ylim(-.1,1.1) - plt.show() - - -#below command will plot learing curves for each of the classifiers -plot_learing_curve(logR_pipeline_ngram,"Naive-bayes Classifier") -plot_learing_curve(nb_pipeline_ngram,"LogisticRegression Classifier") -plot_learing_curve(svm_pipeline_ngram,"SVM Classifier") -plot_learing_curve(sgd_pipeline_ngram,"SGD Classifier") -plot_learing_curve(random_forest_ngram,"RandomForest Classifier") - -""" -by plotting the learning cureve for logistic regression, it can be seen that cross-validation score is stagnating throughout and it -is unable to learn from data. Also we see that there are high errors that indicates model is simple and we may want to increase the -model complexity. -""" - - -#plotting Precision-Recall curve -def plot_PR_curve(classifier): - - precision, recall, thresholds = precision_recall_curve(DataPrep.test_news['Label'], classifier) - average_precision = average_precision_score(DataPrep.test_news['Label'], classifier) - - plt.step(recall, precision, color='b', alpha=0.2, - where='post') - plt.fill_between(recall, precision, step='post', alpha=0.2, - color='b') - - plt.xlabel('Recall') - plt.ylabel('Precision') - plt.ylim([0.0, 1.05]) - plt.xlim([0.0, 1.0]) - plt.title('2-class Random Forest Precision-Recall curve: AP={0:0.2f}'.format( - average_precision)) - -plot_PR_curve(predicted_LogR_ngram) -plot_PR_curve(predicted_rf_ngram) - - -""" -Now let's extract the most informative feature from ifidf vectorizer for all fo the classifiers and see of there are any common -words that we can identify i.e. are these most informative feature acorss the classifiers are same? we will create a function that -will extract top 50 features. 
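The mechanism, in brief (variable names below are illustrative): for a fitted linear model, classifier.coef_[0][i] is the learned weight of the i-th term in the TF-IDF vocabulary, so

    ranked = sorted(zip(classifier.coef_[0], vectorizer.get_feature_names()), reverse=True)

puts the strongest positive-class indicators at the head of the list and the strongest negative-class indicators at the tail, which is what the function below prints in two columns.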
-""" - -def show_most_informative_features(model, vect, clf, text=None, n=50): - # Extract the vectorizer and the classifier from the pipeline - vectorizer = model.named_steps[vect] - classifier = model.named_steps[clf] - - # Check to make sure that we can perform this computation - if not hasattr(classifier, 'coef_'): - raise TypeError( - "Cannot compute most informative features on {}.".format( - classifier.__class__.__name__ - ) - ) - - if text is not None: - # Compute the coefficients for the text - tvec = model.transform([text]).toarray() - else: - # Otherwise simply use the coefficients - tvec = classifier.coef_ - - # Zip the feature names with the coefs and sort - coefs = sorted( - zip(tvec[0], vectorizer.get_feature_names()), - reverse=True - ) - - # Get the top n and bottom n coef, name pairs - topn = zip(coefs[:n], coefs[:-(n+1):-1]) - - # Create the output string to return - output = [] - - # If text, add the predicted value to the output. - if text is not None: - output.append("\"{}\"".format(text)) - output.append( - "Classified as: {}".format(model.predict([text])) - ) - output.append("") - - # Create two columns with most negative and most positive features. - for (cp, fnp), (cn, fnn) in topn: - output.append( - "{:0.4f}{: >15} {:0.4f}{: >15}".format( - cp, fnp, cn, fnn - ) - ) - #return "\n".join(output) - print(output) - -show_most_informative_features(logR_pipeline_ngram,vect='LogR_tfidf',clf='LogR_clf') -show_most_informative_features(nb_pipeline_ngram,vect='nb_tfidf',clf='nb_clf') -show_most_informative_features(svm_pipeline_ngram,vect='svm_tfidf',clf='svm_clf') -show_most_informative_features(sgd_pipeline_ngram,vect='sgd_tfidf',clf='sgd_clf') From b4463d18962e5eb6f1d3ec2569659cbe4ab34268 Mon Sep 17 00:00:00 2001 From: yasarshaikh78657 <56046615+yasarshaikh78657@users.noreply.github.com> Date: Thu, 1 Oct 2020 17:21:53 +0530 Subject: [PATCH 4/4] Add files via upload --- modelcreation.py | 606 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 606 insertions(+) create mode 100644 modelcreation.py diff --git a/modelcreation.py b/modelcreation.py new file mode 100644 index 0000000..0d8707b --- /dev/null +++ b/modelcreation.py @@ -0,0 +1,606 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Apr 20 20:00:49 2020 + +@author: AHTESHAM SANANDWALA +""" + +import numpy as np +import pandas as pd +import pickle +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_extraction.text import TfidfTransformer +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.pipeline import Pipeline +from sklearn.naive_bayes import MultinomialNB +from sklearn.linear_model import LogisticRegression +from sklearn.linear_model import SGDClassifier +from sklearn import svm +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import KFold +from sklearn.metrics import confusion_matrix, f1_score, classification_report +from sklearn.model_selection import GridSearchCV +from sklearn.model_selection import learning_curve +import matplotlib.pyplot as plt +from sklearn.metrics import precision_recall_curve +from sklearn.metrics import average_precision_score +import nltk +import nltk.corpus +from nltk.tokenize import word_tokenize +from gensim.models.word2vec import Word2Vec +import csv +import numpy as np +import nltk +from nltk.stem import SnowballStemmer +from nltk.stem.porter import PorterStemmer +from nltk.tokenize import word_tokenize +import seaborn as sb + + +test_filename = 'test.csv' +train_filename = 
'train.csv' +valid_filename = 'valid.csv' + +train_news = pd.read_csv(train_filename) +test_news = pd.read_csv(test_filename) +valid_news = pd.read_csv(valid_filename) + +def data_obs(): + print("training dataset size:") + print(train_news.shape) + print(train_news.head(10)) + + #below dataset were used for testing and validation purposes + print(test_news.shape) + print(test_news.head(10)) + + print(valid_news.shape) + print(valid_news.head(10)) + +def create_distribution(dataFile): + create_distribution(train_news) + create_distribution(test_news) + create_distribution(valid_news) + return sb.countplot(x='Label', data=dataFile, palette='hls') + +def data_qualityCheck(): + + print("Checking data qualitites...") + train_news.isnull().sum() + train_news.info() + + print("check finished.") + + #below datasets were used to + test_news.isnull().sum() + test_news.info() + + valid_news.isnull().sum() + valid_news.info() + +#run the below function call to see the quality check results +#data_qualityCheck() + + + +#eng_stemmer = SnowballStemmer('english') +#stopwords = set(nltk.corpus.stopwords.words('english')) + +#Stemming +def stem_tokens(tokens, stemmer): + stemmed = [] + for token in tokens: + stemmed.append(stemmer.stem(token)) + return stemmed + +#process the data +def process_data(data,exclude_stopword=True,stem=True): + tokens = [w.lower() for w in data] + tokens_stemmed = tokens + tokens_stemmed = stem_tokens(tokens, eng_stemmer) + tokens_stemmed = [w for w in tokens_stemmed if w not in stopwords ] + return tokens_stemmed + + +#creating ngrams +#unigram +def create_unigram(words): + assert type(words) == list + return words + +#bigram +def create_bigrams(words): + assert type(words) == list + skip = 0 + join_str = " " + Len = len(words) + if Len > 1: + lst = [] + for i in range(Len-1): + for k in range(1,skip+2): + if i+k < Len: + lst.append(join_str.join([words[i],words[i+k]])) + else: + #set it as unigram + lst = create_unigram(words) + return lst + +countV = CountVectorizer() +train_count = countV.fit_transform(train_news['Statement'].values) + +print(countV) +print(train_count) + +def get_countVectorizer_stats(): + + #vocab size + train_count.shape + + #check vocabulary using below command + print(countV.vocabulary_) + + #get feature names + print(countV.get_feature_names()[:25]) + +#create tf-df frequency features +#tf-idf +tfidfV = TfidfTransformer() +train_tfidf = tfidfV.fit_transform(train_count) + +def get_tfidf_stats(): + train_tfidf.shape + #get train data feature names + print(train_tfidf.A[:10]) + +tfidf_ngram = TfidfVectorizer(stop_words='english',ngram_range=(1,4),use_idf=True,smooth_idf=True) + + +#POS Tagging +tagged_sentences = nltk.corpus.treebank.tagged_sents() + +cutoff = int(.75 * len(tagged_sentences)) +training_sentences = train_news['Statement'] + +print(training_sentences) + +#training POS tagger based on words +def features(sentence, index): + """ sentence: [w1, w2, ...], index: the index of the word """ + return { + 'word': sentence[index], + 'is_first': index == 0, + 'is_last': index == len(sentence) - 1, + 'is_capitalized': sentence[index][0].upper() == sentence[index][0], + 'is_all_caps': sentence[index].upper() == sentence[index], + 'is_all_lower': sentence[index].lower() == sentence[index], + 'prefix-1': sentence[index][0], + 'prefix-2': sentence[index][:2], + 'prefix-3': sentence[index][:3], + 'suffix-1': sentence[index][-1], + 'suffix-2': sentence[index][-2:], + 'suffix-3': sentence[index][-3:], + 'prev_word': '' if index == 0 else sentence[index - 1], 
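+        #the remaining features give the tagger context and word-shape clues: the following word,
+        #hyphenation, pure digits, and capital letters inside the token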
+ 'next_word': '' if index == len(sentence) - 1 else sentence[index + 1], + 'has_hyphen': '-' in sentence[index], + 'is_numeric': sentence[index].isdigit(), + 'capitals_inside': sentence[index][1:].lower() != sentence[index][1:] + } + + +#helper function to strip tags from tagged corpus +def untag(tagged_sentence): + return [w for w, t in tagged_sentence] + +class MeanEmbeddingVectorizer(object): + def __init__(self, word2vec): + self.word2vec = word2vec + # if a text is empty we should return a vector of zeros + # with the same dimensionality as all the other vectors + self.dim = len(word2vec.itervalues().next()) + + def fit(self, X, y): + return self + + def transform(self, X): + return np.array([ + np.mean([self.word2vec[w] for w in words if w in self.word2vec] + or [np.zeros(self.dim)], axis=0) + for words in X + ]) + +class TfidfEmbeddingVectorizer(object): + def __init__(self, word2vec): + self.word2vec = word2vec + self.word2weight = None + self.dim = len(word2vec.itervalues().next()) + + def fit(self, X, y): + tfidf = TfidfVectorizer(analyzer=lambda x: x) + tfidf.fit(X) + # if a word was never seen - it must be at least as infrequent + # as any of the known words - so the default idf is the max of + # known idf's + max_idf = max(tfidf.idf_) + self.word2weight = defaultdict( + lambda: max_idf, + [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()]) + + return self + + def transform(self, X): + return np.array([ + np.mean([self.word2vec[w] * self.word2weight[w] + for w in words if w in self.word2vec] or + [np.zeros(self.dim)], axis=0) + for words in X + ]) + +doc_new = ['Corona is virus'] + +nb_pipeline = Pipeline([('NBCV',countV), ('nb_clf',MultinomialNB())]) + +nb_pipeline.fit(train_news['Statement'],train_news['Label']) +predicted_nb = nb_pipeline.predict(test_news['Statement']) +np.mean(predicted_nb == test_news['Label']) + +#building classifier using logistic regression +logR_pipeline = Pipeline([ + ('LogRCV', countV), + ('LogR_clf',LogisticRegression()) + ]) + +logR_pipeline.fit(train_news['Statement'],train_news['Label']) +predicted_LogR = logR_pipeline.predict(test_news['Statement']) +np.mean(predicted_LogR == test_news['Label']) + +#building Linear SVM classfier +svm_pipeline = Pipeline([ + ('svmCV', countV), + ('svm_clf',svm.LinearSVC()) + ]) + +svm_pipeline.fit(train_news['Statement'],train_news['Label']) +predicted_svm = svm_pipeline.predict(test_news['Statement']) +np.mean(predicted_svm == test_news['Label']) + + +#using SVM Stochastic Gradient Descent on hinge loss +sgd_pipeline = Pipeline([ + ('svm2CV', countV), + ('svm2_clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3)) + ]) + +sgd_pipeline.fit(train_news['Statement'], train_news['Label']) +predicted_sgd = sgd_pipeline.predict(test_news['Statement']) +np.mean(predicted_sgd == test_news['Label']) + + +#random forest +random_forest = Pipeline([ + ('rfCV', countV), + ('rf_clf',RandomForestClassifier(n_estimators=200,n_jobs=3)) + ]) + +random_forest.fit(train_news['Statement'], train_news['Label']) +predicted_rf = random_forest.predict(test_news['Statement']) +np.mean(predicted_rf == test_news['Label']) + + +def build_confusion_matrix(classifier): + + k_fold = KFold(n_splits=5) + scores = [] + confusion = np.array([[0,0],[0,0]]) + + for train_ind, test_ind in k_fold.split(train_news): + train_text = train_news.iloc[train_ind]['Statement'] + train_y = train_news.iloc[train_ind]['Label'] + + test_text = train_news.iloc[test_ind]['Statement'] + test_y = train_news.iloc[test_ind]['Label'] + + 
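+        #train on the k-1 folds selected above, score on the held-out fold, and accumulate
+        #the confusion matrix and F1 score across all 5 folds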
classifier.fit(train_text,train_y) + predictions = classifier.predict(test_text) + + confusion += confusion_matrix(test_y,predictions) + score = f1_score(test_y,predictions) + scores.append(score) + + return (print('Total statements classified:', len(train_news)), + print('Score:', sum(scores)/len(scores)), + print('score length', len(scores)), + print('Confusion matrix:'), + print(confusion)) + + +#K-fold cross validation for all classifiers +build_confusion_matrix(nb_pipeline) +build_confusion_matrix(logR_pipeline) +build_confusion_matrix(svm_pipeline) +build_confusion_matrix(sgd_pipeline) +build_confusion_matrix(random_forest) + +##Now using n-grams +#naive-bayes classifier +nb_pipeline_ngram = Pipeline([ + ('nb_tfidf', tfidf_ngram), + ('nb_clf',MultinomialNB())]) + +nb_pipeline_ngram.fit(train_news['Statement'],train_news['Label']) +predicted_nb_ngram = nb_pipeline_ngram.predict(test_news['Statement']) +np.mean(predicted_nb_ngram == test_news['Label']) + + +#logistic regression classifier +logR_pipeline_ngram = Pipeline([ + ('LogR_tfidf', tfidf_ngram), + ('LogR_clf',LogisticRegression(penalty="l2",C=1)) + ]) + +logR_pipeline_ngram.fit(train_news['Statement'], train_news['Label']) +predicted_LogR_ngram = logR_pipeline_ngram.predict(test_news['Statement']) +np.mean(predicted_LogR_ngram == test_news['Label']) + + +#linear SVM classifier +svm_pipeline_ngram = Pipeline([ + ('svm_tfidf', tfidf_ngram), + ('svm_clf',svm.LinearSVC()) + ]) + +svm_pipeline_ngram.fit(train_news['Statement'], train_news['Label']) +predicted_svm_ngram = svm_pipeline_ngram.predict(test_news['Statement']) +np.mean(predicted_svm_ngram == test_news['Label']) + + +#sgd classifier +sgd_pipeline_ngram = Pipeline([ + ('sgd_tfidf', tfidf_ngram), + ('sgd_clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3)) + ]) + +sgd_pipeline_ngram.fit(train_news['Statement'], train_news['Label']) +predicted_sgd_ngram = sgd_pipeline_ngram.predict(test_news['Statement']) +np.mean(predicted_sgd_ngram == test_news['Label']) + + +#random forest classifier +random_forest_ngram = Pipeline([ + ('rf_tfidf', tfidf_ngram), + ('rf_clf',RandomForestClassifier(n_estimators=300,n_jobs=3)) + ]) + +random_forest_ngram.fit(train_news['Statement'], train_news['Label']) +predicted_rf_ngram = random_forest_ngram.predict(test_news['Statement']) +np.mean(predicted_rf_ngram == test_news['Label']) + + +#K-fold cross validation for all classifiers +build_confusion_matrix(nb_pipeline_ngram) +build_confusion_matrix(logR_pipeline_ngram) +build_confusion_matrix(svm_pipeline_ngram) +build_confusion_matrix(sgd_pipeline_ngram) +build_confusion_matrix(random_forest_ngram) + +print(classification_report(test_news['Label'], predicted_nb_ngram)) +print(classification_report(test_news['Label'], predicted_LogR_ngram)) +print(classification_report(test_news['Label'], predicted_svm_ngram)) +print(classification_report(test_news['Label'], predicted_sgd_ngram)) +print(classification_report(test_news['Label'], predicted_rf_ngram)) + +test_news['Label'].shape + +""" +Out of all the models fitted, we would take 2 best performing model. 
we would call them candidate models +from the confusion matrix, we can see that random forest and logistic regression are best performing +in terms of precision and recall (take a look into false positive and true negative counts which appeares +to be low compared to rest of the models) +""" + +#grid-search parameter optimization +#random forest classifier parameters +parameters = {'rf_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)], + 'rf_tfidf__use_idf': (True, False), + 'rf_clf__max_depth': (1,2,3,4,5,6,7,8,9,10,11,12,13,14,15) +} + +gs_clf = GridSearchCV(random_forest_ngram, parameters, n_jobs=-1) +gs_clf = gs_clf.fit(train_news['Statement'][:10000],train_news['Label'][:10000]) + +gs_clf.best_score_ +gs_clf.best_params_ +gs_clf.cv_results_ + + +#logistic regression parameters +parameters = {'LogR_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)], + 'LogR_tfidf__use_idf': (True, False), + 'LogR_tfidf__smooth_idf': (True, False) +} + +gs_clf = GridSearchCV(logR_pipeline_ngram, parameters, n_jobs=-1) +gs_clf = gs_clf.fit(train_news['Statement'][:10000],train_news['Label'][:10000]) + +gs_clf.best_score_ +gs_clf.best_params_ +gs_clf.cv_results_ + +''' +#Linear SVM +parameters = {'svm_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)], + 'svm_tfidf__use_idf': (True, False), + 'svm_tfidf__smooth_idf': (True, False), + 'svm_clf__penalty': ('l1','l2'), +} + +gs_clf = GridSearchCV(svm_pipeline_ngram, parameters, n_jobs=-1) +gs_clf = gs_clf.fit(train_news['Statement'][:10000],train_news['Label'][:10000]) + +gs_clf.best_score_ +gs_clf.best_params_ +gs_clf.cv_results_ +''' + +#by running above commands we can find the model with best performing parameters + + +#running both random forest and logistic regression models again with best parameter found with GridSearch method +random_forest_final = Pipeline([ + ('rf_tfidf',TfidfVectorizer(stop_words='english',ngram_range=(1,3),use_idf=True,smooth_idf=True)), + ('rf_clf',RandomForestClassifier(n_estimators=300,n_jobs=3,max_depth=10)) + ]) + +random_forest_final.fit(train_news['Statement'],train_news['Label']) +predicted_rf_final = random_forest_final.predict(test_news['Statement']) +np.mean(predicted_rf_final == test_news['Label']) +print(classification_report(test_news['Label'], predicted_rf_final)) + +logR_pipeline_final = Pipeline([ + #('LogRCV',countV_ngram), + ('LogR_tfidf',TfidfVectorizer(stop_words='english',ngram_range=(1,5),use_idf=True,smooth_idf=False)), + ('LogR_clf',LogisticRegression(penalty="l2",C=1)) + ]) + +logR_pipeline_final.fit(train_news['Statement'],train_news['Label']) +predicted_LogR_final = logR_pipeline_final.predict(test_news['Statement']) +np.mean(predicted_LogR_final == test_news['Label']) +#accuracy = 0.699 +print(classification_report(test_news['Label'], predicted_LogR_final)) + + +model_file = 'final_model.sav' +pickle.dump(logR_pipeline_ngram,open(model_file,'wb')) + + +#Plotting learing curve +def plot_learing_curve(pipeline,title): + size = 159 + cv = KFold(size, shuffle=True) + + X = train_news["Statement"] + y = train_news["Label"] + + pl = pipeline + pl.fit(X,y) + + train_sizes, train_scores, test_scores = learning_curve(pl, X, y, n_jobs=-1, cv=cv, train_sizes=np.linspace(.1, 1.0, 5), verbose=0) + + train_scores_mean = np.mean(train_scores, axis=1) + train_scores_std = np.std(train_scores, axis=1) + test_scores_mean = np.mean(test_scores, axis=1) + test_scores_std = np.std(test_scores, axis=1) + + plt.figure() + plt.title(title) + plt.legend(loc="best") + plt.xlabel("Training examples") + 
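+    #x-axis: number of training examples used; y-axis: score. The gap between the training and
+    #cross-validation curves plotted below is the usual over-/under-fitting diagnostic.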
plt.ylabel("Score") + plt.gca().invert_yaxis() + + # box-like grid + plt.grid() + + # plot the std deviation as a transparent range at each training set size + plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r") + plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g") + + # plot the average training and test score lines at each training set size + plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score") + plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score") + + # sizes the window for readability and displays the plot + # shows error from 0 to 1.1 + plt.ylim(-.1,1.1) + plt.show() + + +#below command will plot learing curves for each of the classifiers +plot_learing_curve(logR_pipeline_ngram,"Naive-bayes Classifier") +plot_learing_curve(nb_pipeline_ngram,"LogisticRegression Classifier") +plot_learing_curve(svm_pipeline_ngram,"SVM Classifier") +plot_learing_curve(sgd_pipeline_ngram,"SGD Classifier") +plot_learing_curve(random_forest_ngram,"RandomForest Classifier") + +#plotting Precision-Recall curve +def plot_PR_curve(classifier): + + precision, recall, thresholds = precision_recall_curve(test_news['Label'], classifier) + average_precision = average_precision_score(test_news['Label'], classifier) + + plt.step(recall, precision, color='b', alpha=0.2, + where='post') + plt.fill_between(recall, precision, step='post', alpha=0.2, + color='b') + + plt.xlabel('Recall') + plt.ylabel('Precision') + plt.ylim([0.0, 1.05]) + plt.xlim([0.0, 1.0]) + plt.title('2-class Random Forest Precision-Recall curve: AP={0:0.2f}'.format( + average_precision)) + +plot_PR_curve(predicted_LogR_ngram) +plot_PR_curve(predicted_rf_ngram) + +""" +Now let's extract the most informative feature from ifidf vectorizer for all fo the classifiers and see of there are any common +words that we can identify i.e. are these most informative feature acorss the classifiers are same? we will create a function that +will extract top 50 features. +""" + +def show_most_informative_features(model, vect, clf, text=None, n=50): + # Extract the vectorizer and the classifier from the pipeline + vectorizer = model.named_steps[vect] + classifier = model.named_steps[clf] + + # Check to make sure that we can perform this computation + if not hasattr(classifier, 'coef_'): + raise TypeError( + "Cannot compute most informative features on {}.".format( + classifier.__class__.__name__ + ) + ) + + if text is not None: + # Compute the coefficients for the text + tvec = model.transform([text]).toarray() + else: + # Otherwise simply use the coefficients + tvec = classifier.coef_ + + # Zip the feature names with the coefs and sort + coefs = sorted( + zip(tvec[0], vectorizer.get_feature_names()), + reverse=True + ) + + # Get the top n and bottom n coef, name pairs + topn = zip(coefs[:n], coefs[:-(n+1):-1]) + + # Create the output string to return + output = [] + + # If text, add the predicted value to the output. + if text is not None: + output.append("\"{}\"".format(text)) + output.append( + "Classified as: {}".format(model.predict([text])) + ) + output.append("") + + # Create two columns with most negative and most positive features. 
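+    # each printed row pairs the i-th most positive (coefficient, term) with the i-th most negative one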
+    for (cp, fnp), (cn, fnn) in topn:
+        output.append(
+            "{:0.4f}{: >15} {:0.4f}{: >15}".format(
+                cp, fnp, cn, fnn
+            )
+        )
+    print("\n".join(output))
+
+
+show_most_informative_features(logR_pipeline_ngram,vect='LogR_tfidf',clf='LogR_clf')
+show_most_informative_features(nb_pipeline_ngram,vect='nb_tfidf',clf='nb_clf')
+show_most_informative_features(svm_pipeline_ngram,vect='svm_tfidf',clf='svm_clf')
+show_most_informative_features(sgd_pipeline_ngram,vect='sgd_tfidf',clf='sgd_clf')
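+
+#A minimal usage sketch (an editor's illustration, not part of the original pipeline): reload the
+#pipeline pickled above and classify the sample statement defined earlier. 'final_model.sav' and
+#doc_new come from this script; loaded_model and prediction are illustrative names only.
+with open(model_file, 'rb') as f:
+    loaded_model = pickle.load(f)
+prediction = loaded_model.predict(doc_new)
+print("Predicted label for doc_new:", prediction[0])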