diff --git a/README.md b/README.md
index 70c76a1..a3b4bac 100644
--- a/README.md
+++ b/README.md
@@ -1,62 +1,3 @@
-# Classify and Cluster SMS Messages
+The code (and the data journey) is all included in the IPython notebook titled "sms spam". The supporting functions are also collected in sms_spam.py.
 
-## Description
-
-Use Bayesian classification and K-means clustering to analyze SMS messages.
-
-## Objectives
-
-### Learning Objectives
-
-After completing this assignment, you should understand:
-
-* The uses of classification and clustering
-* Good feature extraction
-
-### Performance Objectives
-
-After completing this assignment, you should be able to:
-
-* Parse text into features
-* Classify textual data
-* Cluster textual data
-
-## Details
-
-### Deliverables
-
-* A Git repo called sms-spam containing at least:
-    * `README.md` file explaining how to run your project
-    * a `requirements.txt` file
-
-### Requirements
-
-* No PEP8 or Pyflakes warnings or errors
-
-## Normal Mode
-
-Download the [SMS Spam collection](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection) from the UCI Machine Learning Repository.
-
-Choose a set of features to use in order to separate SMS ham from spam.
-
-Write a program to extract the features you want from each SMS message and then classify each SMS as ham or spam. Iterate on your feature extraction until you
-have a classification success level you are comfortable with (> 75% minimum.)
-
-## Hard Mode
-
-In addition to the requirements from **Normal Mode**:
-
-Write a program to cluster the SMS messages in differing numbers of groups. Examine each cluster to see if they have meaning. Write up your findings.
-
-## Notes
-
-Some features you might want to try:
-
-* Presence of the following words: claim, winner
-* Presence of money symbols
-* Presence of numbers
-* Presence of first-person words
-
-## Additional Resources
-
-* [TextBlob](http://textblob.readthedocs.org/en/dev/)
\ No newline at end of file
+Run `pip install -r requirements.txt` to make sure that all necessary packages are installed. The notebook expects the SMSSpamCollection data file (from the UCI SMS Spam Collection) to sit in the same directory.
diff --git a/requirements.txt b/requirements.txt
index 73e7a6e..c9706f0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
+ipython[all]
 scikit-learn
 pandas
 numpy
 matplotlib
-textblob
\ No newline at end of file
+textblob
diff --git a/sms spam.ipynb b/sms spam.ipynb
new file mode 100644
index 0000000..53632c3
--- /dev/null
+++ b/sms spam.ipynb
@@ -0,0 +1,994 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:6866a1ca8f2d1885f51260e6e326c96132ab4c385a0b846b1565f5cd9b6de1a4"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import numpy as np\n",
+      "from textblob import TextBlob\n",
+      "import re\n",
+      "from sklearn.feature_extraction.text import TfidfVectorizer as TfidV"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 255
+    },
+    {
+     "cell_type": "heading",
+     "level": 3,
+     "metadata": {},
+     "source": [
+      "Let's read in this text:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "with open(\"SMSSpamCollection\") as file:\n",
+      "    all_lines = file.readlines()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 34
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "#all_lines"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 319
+    },
+    {
+     "cell_type": "heading",
+     "level": 3,
+     "metadata": {},
+     "source": [
+      "How 
does TextBlob work?" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "blob = TextBlob(all_lines[0])" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 28 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "blob" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 126, + "text": [ + "TextBlob(\"ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n", + "\")" + ] + } + ], + "prompt_number": 126 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "MAKE EVERYTHING A TEXTBLOB!!" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "bloblist = [TextBlob(x) for x in all_lines]" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 12 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "len(bloblist)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 19, + "text": [ + "5574" + ] + } + ], + "prompt_number": 19 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "hamlist = [x for x in bloblist if x.startswith('ham')]" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 20 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "len(hamlist)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 21, + "text": [ + "4827" + ] + } + ], + "prompt_number": 21 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "spamlist= [x for x in bloblist if x.startswith('spam')]" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 22 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "len(spamlist)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 23, + "text": [ + "747" + ] + } + ], + "prompt_number": 23 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "I'm glad I wrote such a great word_frequency program once upon a time that can easily be tweaked for this:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def word_frequency(bloblist):\n", + " a_dict = {}\n", + " for blob in bloblist:\n", + " xlist = [x for x in blob.words]\n", + " for item in xlist:\n", + " if item.lower() in a_dict:\n", + " a_dict[item.lower()]+=1\n", + " else:\n", + " a_dict[item.lower()]=1\n", + " return a_dict\n", + "\n", + "def top_dict(a_dict, num):\n", + " sorted_list = sorted(a_dict.items(), key=lambda x: x[1], reverse=True)\n", + " return sorted_list[0:num]" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 317 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "ham_word_dict = word_frequency(hamlist)\n", + "top_dict(ham_word_dict, 3)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 88, + "text": [ + "[('ham', 4828), ('i', 2851), ('you', 1938)]" + ] + } + ], + "prompt_number": 88 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "spam_word_dict = word_frequency(spamlist)\n", + "top_dict(spam_word_dict, 3)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { 
+ "metadata": {}, + "output_type": "pyout", + "prompt_number": 278, + "text": [ + "[('spam', 748), ('to', 690), ('a', 378)]" + ] + } + ], + "prompt_number": 278 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "I had the idea to make a set of ham words and a set of spam words to cross reference them. I don't end up getting a ton out of it, partly because I had wanted to limit all of my features to True/False so I could do the Bernoulli Classifier. John Mitsch put no such constraints on himself and ended up using the idea of two sets to make a powerful spam filter." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def set_maker(a_dict):\n", + " aset = set()\n", + " for key in a_dict:\n", + " aset.add(key)\n", + " return aset" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 84 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "hamset = set_maker(ham_word_dict)\n", + "spamset = set_maker(spam_word_dict)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 90 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#hamset.difference(spamset)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 93 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#spamset.difference(hamset)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 94 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#hamset.union(spamset)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 96 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "I made myself a nifty little function that shows how many times a word appears in each category of sms, and what percentage of the texts it is in for that category" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def word_checker(string):\n", + " print(\"Ham: {} {}\".format(ham_word_dict[string], (ham_word_dict[string]/len(hamlist))))\n", + " print(\"Spam: {} {}\".format(spam_word_dict[string], (spam_word_dict[string]/len(spamlist))))" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 107 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "word_checker('txt')" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "Ham: 13 0.0026931841723637872\n", + "Spam: 156 0.20883534136546184\n" + ] + } + ], + "prompt_number": 277 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "ham_word_dict['call']" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 98, + "text": [ + "36" + ] + } + ], + "prompt_number": 98 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "spam_word_dict['reply']" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 274, + "text": [ + "104" + ] + } + ], + "prompt_number": 274 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "Here are all my functions for assembling my array of vectors/features. 
My features have 3 main categories:" + ] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "1: Whether or not a particular keyword is in the text" + ] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "2. How many words are in the text. I created 5 buckets of word lengths" + ] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "3. Whether they had numbers, and whether they had consecutive numbers." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def main_function(bloblist):\n", + " lista = []\n", + " for blobstring in bloblist:\n", + " a = findword(blobstring.lower(), 'i')\n", + " b = findword(blobstring.lower(), 'free')\n", + " c = findword(blobstring.lower(), 'mobile')\n", + " d = findword(blobstring.lower(), 'call')\n", + " e = findword(blobstring.lower(), 'cash')\n", + " f = findword(blobstring.lower(), 'cust')\n", + " g = findword(blobstring.lower(), 'cash')\n", + " h = findword(blobstring.lower(), 'stop')\n", + " i = findword(blobstring.lower(), 'reply')\n", + " j = findword(blobstring.lower(), 'txt')\n", + " l1 = wordlength(blobstring, 0, 10)\n", + " l2 = wordlength(blobstring, 10, 20)\n", + " l3 = wordlength(blobstring, 20, 30)\n", + " l4 = wordlength(blobstring, 30, 40)\n", + " l5 = wordlength(blobstring, 40, 999999)\n", + " nc1 = numbercheck1(blobstring)\n", + " nc5 = numbercheck5(blobstring)\n", + " nc10 = numbercheck10(blobstring)\n", + " z = startword(blobstring)\n", + " lista.append([a, b, c, d, e, f, g, h, i, j, l1, l2, l3, l4, l5, nc1, nc5, nc10, z])\n", + " return lista\n", + "\n", + "\n", + "def findword(blobstring, word):\n", + " if blobstring.find(word) == -1:\n", + " return 0\n", + " else:\n", + " return 1\n", + "\n", + "def startword(blobstring):\n", + " if blobstring.starts_with('spam'):\n", + " return 1\n", + " else:\n", + " return 0\n", + "\n", + "def wordlength(blobstring, num1, num2):\n", + " if num1 < len(blobstring.words) <= num2:\n", + " return 1\n", + " else:\n", + " return 0\n", + "\n", + "def numbercheck1(blobstring):\n", + " astring = str(blobstring)\n", + " if re.search(r\"[0-9]{1}\", astring):\n", + " return 1\n", + " else:\n", + " return 0\n", + "\n", + "def numbercheck5(blobstring):\n", + " astring = str(blobstring)\n", + " if re.search(r\"[0-9]{5}\", astring):\n", + " return 1\n", + " else:\n", + " return 0\n", + "\n", + "def numbercheck10(blobstring):\n", + " astring = str(blobstring)\n", + " if re.search(r\"[0-9]{10}\", astring):\n", + " return 1\n", + " else:\n", + " return 0" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 320 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "y = main_function(bloblist)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 321 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "vector_array = np.array(y)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 322 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "vector_array.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 323, + "text": [ + "(5574, 19)" + ] + } + ], + "prompt_number": 323 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "We got our array of vectors! This is cool, let's train and run some classifiers!" 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.cross_validation import train_test_split\n", + "from sklearn import metrics" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 308 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "xdata = vector_array[:,:-1 ]" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 297 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "xdata.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 298, + "text": [ + "(5574, 18)" + ] + } + ], + "prompt_number": 298 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "ydata = vector_array[:, -1]" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 300 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "ydata.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 303, + "text": [ + "(5574,)" + ] + } + ], + "prompt_number": 303 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "x_train, x_test, y_train, y_test = train_test_split(xdata, ydata, test_size=0.4, random_state=0)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 304 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def run_classifier(clf, x_train, x_test, y_train, y_test, x_data, y_data):\n", + " clf.fit(x_train, y_train)\n", + " predicted = clf.predict(x_test)\n", + " print(metrics.classification_report(y_test, predicted))\n", + " print(metrics.confusion_matrix(y_test, predicted))\n", + " print(metrics.f1_score(y_test, predicted))\n", + " scores = cross_val_score(clf, xdata, ydata, cv=5)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 305 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.naive_bayes import BernoulliNB\n", + "from sklearn.cross_validation import cross_val_score\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.ensemble import RandomForestClassifier" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 312 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "Bernoulli - f1 score: 0.912" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "bernoulli = BernoulliNB()\n", + "run_classifier(bernoulli, x_train, x_test, y_train, y_test, xdata, ydata)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.98 0.99 0.99 1928\n", + " 1 0.94 0.89 0.91 302\n", + "\n", + "avg / total 0.98 0.98 0.98 2230\n", + "\n", + "[[1910 18]\n", + " [ 34 268]]\n", + "0.91156462585\n" + ] + } + ], + "prompt_number": 311 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "Gaussian Naive Bayesian = f1 score: 0.853" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "gauss = GaussianNB()\n", + "run_classifier(gauss, x_train, x_test, y_train, y_test, xdata, ydata)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + 
"\n", + " 0 0.98 0.97 0.98 1928\n", + " 1 0.81 0.90 0.85 302\n", + "\n", + "avg / total 0.96 0.96 0.96 2230\n", + "\n", + "[[1863 65]\n", + " [ 29 273]]\n", + "0.853125\n" + ] + } + ], + "prompt_number": 314 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "Decision Tree - f1 score: 0.907" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "tree = DecisionTreeClassifier()\n", + "run_classifier(tree, x_train, x_test, y_train, y_test, xdata, ydata)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.98 0.99 0.99 1928\n", + " 1 0.96 0.86 0.91 302\n", + "\n", + "avg / total 0.98 0.98 0.98 2230\n", + "\n", + "[[1918 10]\n", + " [ 43 259]]\n", + "0.907180385289\n" + ] + } + ], + "prompt_number": 315 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "Random Forest - f1 score: 0.907" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "forest = RandomForestClassifier()\n", + "run_classifier(forest, x_train, x_test, y_train, y_test, xdata, ydata)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.98 0.99 0.99 1928\n", + " 1 0.95 0.87 0.91 302\n", + "\n", + "avg / total 0.98 0.98 0.98 2230\n", + "\n", + "[[1914 14]\n", + " [ 40 262]]\n", + "0.906574394464\n" + ] + } + ], + "prompt_number": 316 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "Let's see if we can figure anything out with the clusters:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.cluster import KMeans" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 324 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "kmeans = KMeans(6)\n", + "kmeans.fit(xdata)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 350, + "text": [ + "KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=6, n_init=10,\n", + " n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,\n", + " verbose=0)" + ] + } + ], + "prompt_number": 350 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clusters = kmeans.predict(xdata)\n", + "clusters" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stderr", + "text": [ + "/Users/bretrunestad/.pyenv/versions/charting/lib/python3.4/site-packages/sklearn/cluster/k_means_.py:797: RuntimeWarning: Got data type int64, converted to float to avoid overflows\n", + " X = self._check_test_data(X)\n" + ] + }, + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 351, + "text": [ + "array([3, 4, 5, ..., 1, 3, 4], dtype=int32)" + ] + } + ], + "prompt_number": 351 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#kmeans.cluster_centers_" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 356 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "kmeans.cluster_centers_.round()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 353, + "text": [ + "array([[ 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., -0., -0., -0.,\n", + " 1., 
0., 0., 0., -0.],\n", + " [ 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,\n", + " -0., -0., 0., 0., -0.],\n", + " [-0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., -0., -0.,\n", + " 0., -0., 0., 0., 0.],\n", + " [ 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,\n", + " -0., 0., 0., 0., -0.],\n", + " [ 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,\n", + " -0., -0., 0., 0., -0.],\n", + " [ 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,\n", + " 0., 0., 1., 1., 1.]])" + ] + } + ], + "prompt_number": 353 + }, + { + "cell_type": "heading", + "level": 6, + "metadata": {}, + "source": [ + "I had to get up to six clusters before there was a group that had a 0 for that initial \"i\" group. There seems to always be a goup that clusters around the number conditions at the end, as the last cluster does here. Other than that they seem to cluster around different lengths of the text message(by number of words). That seems to indicate that most of my keyword features weren't particularly beneficial." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# a = findword(blobstring.lower(), 'i')\n", + "# b = findword(blobstring.lower(), 'free')\n", + "# c = findword(blobstring.lower(), 'mobile')\n", + "# d = findword(blobstring.lower(), 'call')\n", + "# e = findword(blobstring.lower(), 'cash')\n", + "# f = findword(blobstring.lower(), 'cust')\n", + "# g = findword(blobstring.lower(), 'cash')\n", + "# h = findword(blobstring.lower(), 'stop')\n", + "# i = findword(blobstring.lower(), 'reply')\n", + "# j = findword(blobstring.lower(), 'txt')\n", + "# l1 = wordlength(blobstring, 0, 10)\n", + "# l2 = wordlength(blobstring, 10, 20)\n", + "# l3 = wordlength(blobstring, 20, 30)\n", + "# l4 = wordlength(blobstring, 30, 40)\n", + "# l5 = wordlength(blobstring, 40, 999999)\n", + "# nc1 = numbercheck1(blobstring)\n", + "# nc5 = numbercheck5(blobstring)\n", + "# nc10 = numbercheck10(blobstring)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 354 + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/sms_spam.py b/sms_spam.py new file mode 100644 index 0000000..b7e71af --- /dev/null +++ b/sms_spam.py @@ -0,0 +1,105 @@ +import numpy as np +from textblob import TextBlob +import re + + +def word_frequency(bloblist): + """Creates a dictionary of all the words that appear in the list + and the number of times they appear""" + a_dict = {} + for blob in bloblist: + xlist = [x for x in blob.words] + for item in xlist: + if item.lower() in a_dict: + a_dict[item.lower()] += 1 + else: + a_dict[item.lower()] = 1 + return a_dict + + +def top_dict(a_dict, num): + """Returns the top 'num' most frequently appearing words""" + sorted_list = sorted(a_dict.items(), key=lambda x: x[1], reverse=True) + return sorted_list[0:num] + + +def main_function(bloblist, word1, word2): + """Makes a list of lists. 
Each list is a list of the 0/1 values
+    for each feature"""
+    lista = []
+    for blobstring in bloblist:
+        # keyword-presence features (1 if the substring appears, else 0)
+        a = findword(blobstring.lower(), 'i')
+        b = findword(blobstring.lower(), 'free')
+        c = findword(blobstring.lower(), 'mobile')
+        d = findword(blobstring.lower(), 'call')
+        e = findword(blobstring.lower(), 'cash')
+        f = findword(blobstring.lower(), 'cust')
+        g = findword(blobstring.lower(), 'cash')  # duplicates feature e
+        h = findword(blobstring.lower(), 'stop')
+        i = findword(blobstring.lower(), 'reply')
+        j = findword(blobstring.lower(), 'txt')
+        # message-length buckets (number of words)
+        l1 = wordlength(blobstring, 0, 10)
+        l2 = wordlength(blobstring, 10, 20)
+        l3 = wordlength(blobstring, 20, 30)
+        l4 = wordlength(blobstring, 30, 40)
+        l5 = wordlength(blobstring, 40, 999999)
+        # digit features: any digit, five in a row, ten in a row
+        nc1 = numbercheck1(blobstring)
+        nc5 = numbercheck5(blobstring)
+        nc10 = numbercheck10(blobstring)
+        z = startword(blobstring)  # label: 1 = spam, 0 = ham
+        lista.append([a, b, c, d, e, f, g, h, i, j, l1, l2, l3, l4, l5,
+                      nc1, nc5, nc10, z])
+    return lista
+
+
+def findword(blobstring, word):
+    """If the word is in the string, returns 1. Otherwise returns 0."""
+    if blobstring.find(word) == -1:
+        return 0
+    else:
+        return 1
+
+
+def startword(blobstring):
+    """Checks whether the string starts with the word 'spam'. Returns 1
+    for spam, 0 for ham."""
+    if blobstring.starts_with('spam'):
+        return 1
+    else:
+        return 0
+
+
+def wordlength(blobstring, num1, num2):
+    """Checks whether the number of words in the string is between
+    num1 and num2."""
+    if num1 < len(blobstring.words) <= num2:
+        return 1
+    else:
+        return 0
+
+
+def numbercheck1(blobstring):
+    """Checks whether there is a digit in the string."""
+    astring = str(blobstring)
+    if re.search(r"[0-9]{1}", astring):
+        return 1
+    else:
+        return 0
+
+
+def numbercheck5(blobstring):
+    """Checks whether there are five digits in a row in the string."""
+    astring = str(blobstring)
+    if re.search(r"[0-9]{5}", astring):
+        return 1
+    else:
+        return 0
+
+
+def numbercheck10(blobstring):
+    """Checks whether there are ten digits in a row in the string."""
+    astring = str(blobstring)
+    if re.search(r"[0-9]{10}", astring):
+        return 1
+    else:
+        return 0
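+
+
+if __name__ == "__main__":
+    # Illustrative usage sketch (an assumption, not part of the original
+    # notebook run): it expects the UCI "SMSSpamCollection" file in the
+    # working directory and scikit-learn installed. train_test_split is
+    # imported from sklearn.cross_validation to match the notebook's
+    # scikit-learn version; newer releases moved it to sklearn.model_selection.
+    from sklearn.cross_validation import train_test_split
+    from sklearn.naive_bayes import BernoulliNB
+    from sklearn import metrics
+
+    with open("SMSSpamCollection") as f:
+        bloblist = [TextBlob(line) for line in f.readlines()]
+
+    # main_function ignores word1/word2, so placeholders are passed here.
+    vectors = np.array(main_function(bloblist, None, None))
+    xdata = vectors[:, :-1]  # feature columns
+    ydata = vectors[:, -1]   # label column: 1 = spam, 0 = ham
+
+    x_train, x_test, y_train, y_test = train_test_split(
+        xdata, ydata, test_size=0.4, random_state=0)
+
+    clf = BernoulliNB()
+    clf.fit(x_train, y_train)
+    predicted = clf.predict(x_test)
+    print(metrics.classification_report(y_test, predicted))
+    print(metrics.confusion_matrix(y_test, predicted))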