# Code.py

# -*- coding: utf-8 -*-
"""Qassim Summer Training.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1HU0fJntEAnck120HjwRq3R5kpx-ZN8hR

Import the dataset from Kaggle
"""

!pip install kaggle

!ls

import json
token = {"username":"sarah6suliman","key":"23e6dbf3a23c4ad1feffe137900df93d"}
with open('/content/kaggle.json', 'w') as file:
    json.dump(token, file)

!cp /content/kaggle.json ~/.kaggle/kaggle.json

!kaggle config set -n path -v{/content}

!kaggle datasets list

!chmod 600 /root/.kaggle/kaggle.json

!kaggle datasets download -d mlg-ulb/creditcardfraud

import pandas as pd

# Column names applied to the CSV: Time, V1..V28, Amount and the Class label.
features_names = (
    ['Time']
    + ['V{}'.format(idx) for idx in range(1, 29)]
    + ['Amount', 'Class']
)
# header=0 makes pandas consume the file's own header row and replace it
# with the names given above.
dataset = pd.read_csv(
    '/content/{/content}/datasets/mlg-ulb/creditcardfraud/creditcardfraud.zip',
    header=0,
    names=features_names,
)
dataset.head()

"""Load packages"""

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import  DataFrame

"""**train_test_split**"""

# Feature matrix: every column except the last; target: the last column
# ('Class'). Negative indexing avoids hard-coding the column count.
X = dataset.iloc[:, :-1].values  # independent
y = dataset.iloc[:, -1].values   # dependent

from sklearn.model_selection import train_test_split

# stratify=y keeps the class proportions identical in the train and test
# partitions, so the rare positive class is not over- or under-represented
# in the 30% hold-out by chance. random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)

# Registry of (name, estimator) pairs that the algorithm sections append to.
models = []

"""Preprocessing and Scale"""

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Standardise-then-SVC pipeline. It is only constructed here; nothing in the
# visible script fits or evaluates it.
clf = make_pipeline(StandardScaler(), SVC())

from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training split only, then apply the
# same mapping (training range -> [-1, 1]) to both splits.
scaling = MinMaxScaler(feature_range=(-1, 1))
scaling.fit(X_train)
X_train, X_test = scaling.transform(X_train), scaling.transform(X_test)

"""Handle Missing Data """

from sklearn.impute import SimpleImputer  # Missing data

# Replace NaNs in columns 1 and 2 of X with the respective column mean.
# NOTE(review): X_train / X_test were produced from X before this point, so
# this presumably does not reach the data the models are trained on - confirm.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Per-column count and percentage of null cells (the original author notes
# that there are no missing values).
total = dataset.isnull().sum().sort_values(ascending=False)
percent = (
    dataset.isnull().sum()
    .div(dataset.isnull().count())
    .mul(100)
    .sort_values(ascending=False)
)
pd.concat([total, percent], axis=1, keys=['Total', 'Percent']).T

"""**Class percentage**"""

def check_balance(dataset, target):
    """Print the dataset size and the percentage of rows in class 0 and 1.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains the label column.
    target : str
        Name of the label column whose 0/1 distribution is reported.

    Returns
    -------
    None
        The figures are written to standard output only.
    """
    n_rows = dataset.shape[0]
    print('size of dataset is:', n_rows)
    # Count once instead of recomputing the frequency table for every class.
    counts = dataset[target].value_counts()
    for i in [0, 1]:
        print('for target  {} ='.format(i))
        # A class that never occurs is reported as 0% instead of raising
        # KeyError; an empty frame is reported as 0% instead of dividing by 0.
        share = counts.get(i, 0) / n_rows * 100 if n_rows else 0.0
        print(share, '%')

# Absolute number of rows in each class (Class == 0 is reported as valid,
# Class == 1 as fraud).
print("Number of valid data", dataset.loc[dataset["Class"] == 0].shape[0])
print("Number of fraud  data", dataset.loc[dataset["Class"] == 1].shape[0])

"""Distribution """

# Relative share of each class.
check_balance(dataset, target='Class')

# Commented out IPython magic to ensure Python compatibility.
import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline

# Bar chart of the row count per class, to make the imbalance visible.
plt.title('Distribution of Frauds', fontdict=dict(size=16, color='brown'))
sns.countplot(x='Class', data=dataset)

# Tick positions 0 and 1 are relabelled with readable class names.
vals, labels = [0, 1], ['Non-Fraud', 'Fraud']
plt.xticks(vals, labels)

plt.xlabel('Class', fontdict=dict(size=14, color='green'))
plt.ylabel('Number of transactions', fontdict=dict(size=12, color='green'))

"""Class percentage after SMOTE

**Handling Duplicate**
"""

# Identify duplicate records in the data (vectorised sum instead of the
# element-by-element builtin sum).
dupes = dataset.duplicated()
int(dupes.sum())

# Number of rows before the duplicates are removed
length1 = len(dataset)
print(length1)

# Select duplicate rows except last occurrence based on all columns
duplicateRowsDF = dataset[dataset.duplicated(keep='last')]
print("Duplicate Rows except last occurrence based on all columns are :")
print(duplicateRowsDF)

"""Remove Duplicate"""

# Remove duplicates from the dataset. keep='first' retains one copy of every
# duplicated record; the previous keep=False discarded *all* copies, which
# also deleted the genuine transaction each duplicate was a copy of.
# NOTE(review): X, y and the train/test split were extracted from `dataset`
# before this point, so this de-duplication presumably does not affect the
# data the models see - consider moving it ahead of the split; confirm.
dataset.drop_duplicates(keep='first', inplace=True)

# Duplicate count after the removal (expected to be 0)
dupes = dataset.duplicated()
int(dupes.sum())

# Number of rows after the duplicates are removed
length1 = len(dataset)
print(length1)

"""# **SMOTE**"""

print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

# NOTE(review): X_train_res / y_train_res are not assigned anywhere in the
# visible source - the step that over-samples (X_train, y_train) appears to
# be missing and has to be restored before this point.

# Number of samples in the training features / labels after SMOTE
print(len(X_train_res))
print(len(y_train_res.ravel()))

valid_sm = len(y_train_res[y_train_res == 0])  # number of valid rows after SMOTE
fraud_sm = len(y_train_res[y_train_res == 1])  # number of fraud rows after SMOTE
length = len(y_train_res)                      # number of all transactions after SMOTE
preValid = (valid_sm / length) * 100           # percentage of valid rows after SMOTE
preFraud = (fraud_sm / length) * 100           # percentage of fraud rows after SMOTE

print('Size of data :', length)
print('for target 0 = \n {}% '.format(round(preValid, 2)))
# Fixed: the class-1 line used to print preValid a second time.
print('for target 1 = \n {}%'.format(round(preFraud, 2)))

"""Plot the resampled data"""

def plot_2d_space(X, y, label='Classes'):
    """Scatter columns 0 and 1 of X, one colour/marker per distinct value of y.

    X is indexed as X[mask, column]; y supplies the class of every row;
    label becomes the plot title. Two colour/marker pairs are available,
    so at most the first two distinct classes are drawn (zip stops at the
    shortest input). The figure is shown before returning.
    """
    palette = ['#1F77B4', '#FF7F0E']
    shapes = ['o', 's']
    for cls, colour, shape in zip(np.unique(y), palette, shapes):
        members = y == cls
        plt.scatter(X[members, 0], X[members, 1], c=colour, label=cls, marker=shape)
    plt.title(label)
    plt.legend(loc='upper right')
    plt.show()

# First two feature columns of the training split as it is ...
plot_2d_space(X_train, y_train, label='DATA_SET BEFORE SMOTE')

# ... and of the resampled training data.
plot_2d_space(X_train_res, y_train_res.ravel(), label='SMOTE over-sampling')

"""# **Algorithms**"""

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score , recall_score, f1_score 
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

"""Stochastic gradient descent (SGD) """

from sklearn.linear_model import SGDClassifier

# Default-configured SGD classifier, registered for the evaluation loops.
# (An earlier single-model variant used loss='hinge', max_iter=100.)
SGD = SGDClassifier()
models.extend([('SGDClassifier', SGD)])

import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.metrics import roc_auc_score


from sklearn.linear_model import  SGDClassifier

#SGD = SGDClassifier()
#SGD.fit(X_train_res, y_train_res)

#y_pred_SGD = SGD.decision_function(X_test)

#SGD_fpr, SGD_tpr, threshold = roc_curve(y_test, y_pred_SGD)
#auc_SGD = auc(SGD_fpr, SGD_tpr)

#plt.figure(figsize=(5, 5), dpi=100)
#plt.plot(SGD_fpr, SGD_tpr, linestyle='-', label='SGD (auc = %0.3f)' % auc_SGD)
#plt.plot(SGD_fpr, SGD_tpr, linestyle='-')


#plt.plot([0, 1], ls="--")
#plt.plot([0, 0], [1, 0] , c=".7"), plt.plot([1, 1] , c=".7")

#plt.title('Receiver Operating Characteristic - SGD')
#plt.xlabel('False Positive Rate')
#plt.ylabel('True Positive Rate')

#plt.legend()

#plt.show()

"""Linear Discriminant Analysis (LDA)"""

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Default-configured linear discriminant analysis classifier, registered for
# the evaluation loops.
lda = LDA()
models.extend([('LDA', lda)])

"""**SVM Kernel**

linear
"""

#from sklearn.svm import SVC
#classifier = SVC(kernel='linear')
#name='Linear'

"""Polynomial kernel"""

#from sklearn.svm import SVC
#classifier = SVC(kernel='poly')
#name='poly'

"""Radial Basis Function (RBF)"""

#from sklearn.svm import SVC
#svmclassifier = SVC(kernel='rbf')
#name='rbf'

"""MLP"""

from sklearn.neural_network import MLPClassifier

# Default-configured multi-layer perceptron, registered for the evaluation
# loops. The registry name no longer carries the stray leading space that
# showed up in every printed report header.
MLP = MLPClassifier()
models.append(('MLPClassifier', MLP))

"""**Naïve Bayes**

BernoulliNB
"""

from sklearn.naive_bayes import BernoulliNB

# `binarize` is a numeric threshold: values above it become 1, the rest 0.
# The previous binarize=True is interpreted as a threshold of 1.0, and the
# features were scaled to [-1, 1] with MinMaxScaler, so no training value
# exceeds it and every feature collapsed to 0. A threshold of 0.0 (the
# middle of the scaled range) keeps the features informative.
bernoulli = BernoulliNB(binarize=0.0)
models.append(('BernoulliNB', bernoulli))

"""AdaBoost"""

from sklearn import ensemble

# Default-configured AdaBoost ensemble, registered for the evaluation loops.
ada = ensemble.AdaBoostClassifier()
models.extend([('AdaBoost', ada)])

"""Decision Trees"""

from sklearn import tree

# Default-configured decision tree, registered for the evaluation loops.
dt = tree.DecisionTreeClassifier()
models.extend([('DecisionTree', dt)])

#for name,model in models:
 # model.fit(X_train_res, y_train_res)
 # y_pred= model.predict(X_test)

#from sklearn import model_selection

#results=[] 
#names=['SGD','LDA','MLP','Bernoulli','Adabost','DT'] 
#for name, model in models:
 # cv_results=model_selection.cross_val_score(model,X_train_res, y_train_res,scoring='accuracy')
 # results.append(cv_results) 
 # names.append(name)

#import matplotlib.pyplot as plt # to plot graph
#fig=plt.figure() 
#fig.suptitle('Algorithm Comparison') 
#ax=fig.add_subplot(111) 
#plt.boxplot(results) 
#ax.set_xticklabels(names)

"""**Evaluation the Algorithm**"""

#print(name)
#classifier.fit(X_train, y_train)
#y_pred = classifier.predict(X_test)
#fpr, tpr, thresholds = metrics.roc_curve(y_test,y_pred)
#print("AUC Score is %s"%(metrics.auc(fpr, tpr)),"%")
#print("Confusion Matrix is \n %s"%(confusion_matrix(y_test, y_pred)))
#print("The Classification report is \n %s"%(classification_report(y_test, y_pred)))
#print("%s: %f"%("The Precision Score is", precision_score(y_test,y_pred, average='weighted')))
#print("%s: %f"%("The Recall Score is", recall_score(y_test,y_pred, average='weighted')))
#print("%s: %f"%("The F1 Score is", f1_score(y_test,y_pred, average='weighted')))
#print("Accuracy Score is %s"%(accuracy_score(y_test,y_pred)),"%")

def _positive_class_scores(model, X_eval):
    """Return continuous positive-class scores of a fitted model for ROC use.

    Prefers decision_function, then the second column of predict_proba, and
    only falls back to hard predictions when neither is available.
    """
    if hasattr(model, "decision_function"):
        return model.decision_function(X_eval)
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X_eval)[:, 1]
    return model.predict(X_eval)


def _report_model(header, model, X_fit, y_fit, X_eval, y_eval):
    """Fit `model` on (X_fit, y_fit) and print its metrics on (X_eval, y_eval).

    `header` is printed first. AUC is computed from continuous scores rather
    than from thresholded predictions, and is printed as a plain 0-1 value;
    accuracy is scaled to an actual percentage before the "%" sign.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    fpr, tpr, _ = metrics.roc_curve(y_eval, _positive_class_scores(model, X_eval))
    print(header)
    print("AUC Score is %s" % (metrics.auc(fpr, tpr)))
    print("Confusion Matrix is \n %s" % (confusion_matrix(y_eval, y_pred)))
    print("The Classification report is \n %s" % (classification_report(y_eval, y_pred)))
    print("%s: %f" % ("The Precision Score is", precision_score(y_eval, y_pred, average='weighted')))
    print("%s: %f" % ("The Recall Score is", recall_score(y_eval, y_pred, average='weighted')))
    print("%s: %f" % ("The F1 Score is", f1_score(y_eval, y_pred, average='weighted')))
    print("Accuracy Score is %s" % (accuracy_score(y_eval, y_pred) * 100), "%")
    print("\n \n")


# Baseline: every registered model trained on the original training split.
for name, model in models:
    _report_model("%s:" % (name), model, X_train, y_train, X_test, y_test)

"""After SMOTE"""

# Same models retrained on the resampled training data, same test split.
for name, model in models:
    _report_model("%s: After SMOTE" % (name), model, X_train_res, y_train_res, X_test, y_test)

# Cross-validation scores and the matching model names, index-aligned.
# `names` starts empty: the former seed value 'ada' shifted every name by
# one position relative to `results`.
results = []
names = []

from sklearn import model_selection
from sklearn.model_selection import KFold

# The splitter does not depend on the model, so it is built once.
# NOTE(review): KFold is used without shuffling; if the resampled training
# set is ordered by class the folds will be unbalanced - consider
# StratifiedKFold(shuffle=True); confirm against how X_train_res is built.
kfold = model_selection.KFold(n_splits=10)

for name, model in models:
    cv_results = model_selection.cross_val_score(
        model, X_train_res, y_train_res, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)

print(results)

import matplotlib.pyplot as plt # to plot graph
import seaborn as sns

# `results` holds one array of fold accuracies per registered model, so the
# figure is titled accordingly and each series is labelled with its model
# name (taken from `models`, whose order `results` follows).
fig = plt.figure()
fig.suptitle('Cross-validation accuracy per model')
ax = fig.add_subplot(111)
ax.hist(results, label=[model_name for model_name, _ in models])
ax.set_xlabel('Accuracy')
ax.set_ylabel('Number of folds')
ax.legend()
plt.show()

from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report

# `classifier` was never assigned in the script. The plot is titled
# "DecisionTree", so the decision tree registered in `models` is used.
name = 'DecisionTree'
classifier = dict(models)[name]
classifier.fit(X_train, y_train)

# roc_curve needs a score per test sample. fit() returns the estimator
# itself, so the positive-class probability is taken explicitly.
y_score = classifier.predict_proba(X_test)[:, 1]

false_positive_rate1, true_positive_rate1, threshold1 = roc_curve(y_test, y_score)
print('roc_auc_score for', (name), roc_auc_score(y_test, y_score))

# These statements were indented without an enclosing block, which is a
# syntax error; they belong at top level.
plt.subplots(1, figsize=(10,10))
plt.title('Receiver Operating Characteristic - DecisionTree')
# The label gives plt.legend() something to show.
plt.plot(false_positive_rate1, true_positive_rate1, label=name)
plt.plot([0, 1], ls="--")
plt.plot([0, 0], [1, 0], c=".7")
plt.plot([1, 1], c=".7")
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

"""bagging"""

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# The original section imported BaggingClassifier but never built one, and
# `classifier` was not assigned anywhere, so nothing here could run.
# NOTE(review): the ensemble below is a reconstruction - an entropy-based
# decision tree is assumed from the name of the results list, and training on
# the resampled data from the accuracy that is reported on it; confirm both.
name = 'DecisionTree'
classifier = BaggingClassifier(DecisionTreeClassifier(criterion='entropy'), random_state=0)
classifier.fit(X_train_res, y_train_res)

# Predict each data set once and reuse the result below.
predicted_bagging = classifier.predict(X_test)
predicted_train_bagging = classifier.predict(X_train_res)

predicted_bagging

from sklearn.metrics import accuracy_score
accuracy_score(y_test, predicted_bagging)

# One record per evaluated ensemble:
# [estimator, test confusion matrix, test accuracy,
#  training confusion matrix, training accuracy]
final_accuracy_scores_DecisionTree_entropy = []
# The printed figure is the accuracy on the (resampled) training data.
print(name + " Accuracy score after bagging ", accuracy_score(y_train_res, predicted_train_bagging))
final_accuracy_scores_DecisionTree_entropy.append([
    classifier,
    confusion_matrix(y_test, predicted_bagging),
    accuracy_score(y_test, predicted_bagging),
    confusion_matrix(y_train_res, predicted_train_bagging),
    accuracy_score(y_train_res, predicted_train_bagging),
])