Dependencies

In [1]:

import os
import io
import numpy
import pandas as pd
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

Helper Functions

In [2]:

def readFiles(path):
    """Yield ``(file_path, message_body)`` for every file found under ``path``.

    The directory tree rooted at ``path`` is walked recursively. Within each
    file, every line up to and including the first blank line is skipped
    (treated as the header block); the lines after it make up the body.

    Args:
        path: Root directory to walk.

    Yields:
        tuple[str, str]: The joined path of the file and its body text. The
        body is an empty string when the file contains no blank line.
    """
    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Use a separate name so the ``path`` argument is not rebound
            # while the walk over it is still in progress.
            file_path = os.path.join(root, filename)

            in_body = False
            body_lines = []
            # ``with`` closes the handle even if reading raises part-way
            # through, which the manual open()/close() pair did not.
            with io.open(file_path, 'r', encoding='latin1') as f:
                for line in f:
                    if in_body:
                        body_lines.append(line)
                    elif line == '\n':
                        # First blank line: everything after it is body.
                        in_body = True

            # NOTE: each kept line still ends with its own newline, so this
            # join puts an extra '\n' between body lines. Left unchanged so
            # the produced messages stay identical for existing callers.
            yield file_path, '\n'.join(body_lines)


def dataFrameFromDirectory(path, classification):
    """Build a DataFrame with one row per file read from ``path``.

    Args:
        path: Directory handed to ``readFiles``.
        classification: Label stored in the ``class`` column of every row.

    Returns:
        DataFrame with columns ``message`` and ``class``, indexed by the
        file path that ``readFiles`` reported for each message.
    """
    # Materialise the generator once so rows and index stay aligned.
    entries = list(readFiles(path))
    records = [
        {'message': body, 'class': classification}
        for _, body in entries
    ]
    file_paths = [file_path for file_path, _ in entries]
    return DataFrame(records, index=file_paths)

Data Setup

In [3]:

# Load both labelled folders and combine them in a single concat.
# Seeding with an empty DataFrame and concatenating onto it repeatedly is
# unnecessary, and concatenating empty frames is deprecated behaviour in
# recent pandas releases.
data = pd.concat([
    dataFrameFromDirectory("emails/spam", "spam"),
    dataFrameFromDirectory("emails/ham", "ham"),
])
# INSPECT the dataframe
data.head()

Out [3]:

	message	class
emails/spam/00249.5f45607c1bffe89f60ba1ec9f878039a	Dear Homeowner,\n\n \n\nInterest Rates are at ...	spam
emails/spam/00373.ebe8670ac56b04125c25100a36ab0510	ATTENTION: This is a MUST for ALL Computer Use...	spam
emails/spam/00214.1367039e50dc6b7adb0f2aa8aba83216	This is a multi-part message in MIME format.\n...	spam
emails/spam/00210.050ffd105bd4e006771ee63cabc59978	IMPORTANT INFORMATION:\n\n\n\nThe new domain n...	spam
emails/spam/00033.9babb58d9298daa2963d4f514193d7d6	This is the bottom line. If you can GIVE AWAY...	spam

Vectorize Text with CountVectorizer

SkLearn CountVectorizer DOCS
Split each message into its individual words (tokens) and count how often each one occurs.
Pass the resulting word counts, together with each message's spam/ham label, to a MultinomialNB classifier.
Call fit() and voilà — a trained spam filter.

In [4]:

# Learn the vocabulary from every message and turn each message into a row
# of per-word counts; the labels are kept in the same row order.
vectorizer = CountVectorizer()
messages = data['message'].values
counts = vectorizer.fit_transform(messages)
targets = data['class'].values

# Look the vocabulary up once and reuse it for both printouts below.
feature_names = vectorizer.get_feature_names_out()

print(f'first target: {targets[:1]}\n')
print(f'counts: {counts}\n')
print(f'get_feature_names_out: {feature_names}\n')
print(f'first 5 feature names...: {feature_names[:5]}')
# NOTE: toarray() expands the sparse count matrix into a full dense array,
# which is memory-hungry for a large vocabulary; shown here for inspection.
print(counts.toarray())

first target: ['spam']

counts:   (0, 18974)	1
  (0, 28546)	1
  (0, 30958)	1
  (0, 46065)	1
  (0, 11302)	1
  (0, 11640)	1
  (0, 53242)	1
  (0, 35476)	1
  (0, 43408)	1
  (0, 30375)	1
  (0, 4138)	1
  (0, 60488)	1
  (0, 58215)	1
  (0, 27983)	1
  (0, 60793)	1
  (0, 24338)	1
  (0, 53220)	1
  (0, 13116)	1
  (0, 46062)	1
  (0, 24806)	1
  (0, 60804)	3
  (0, 50029)	1
  (0, 14755)	1
  (0, 36465)	1
  (0, 38861)	1
  :	:
  (2999, 11667)	1
  (2999, 24048)	1
  (2999, 58322)	1
  (2999, 43557)	1
  (2999, 38269)	1
  (2999, 6449)	1
  (2999, 24914)	3
  (2999, 21921)	1
  (2999, 16018)	1
  (2999, 48790)	3
  (2999, 15083)	5
  (2999, 15084)	3
  (2999, 51051)	1
  (2999, 33821)	1
  (2999, 40201)	1
  (2999, 53080)	2
  (2999, 25831)	1
  (2999, 49617)	1
  (2999, 15301)	1
  (2999, 17127)	1
  (2999, 38952)	1
  (2999, 16507)	1
  (2999, 7860)	1
  (2999, 52206)	1
  (2999, 37101)	1

get_feature_names_out: ['00' '000' '0000' ... 'þõµ' 'þüg'
 'ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿó']

first 5 feature names...: ['00' '000' '0000' '000000' '000000000']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 4 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Train A Classifier With The Vectorized Results

In [5]:

# Fit a multinomial naive Bayes model on the word counts and their labels.
# fit() stays as the final expression so the cell still displays the
# estimator.
classifier = MultinomialNB()
classifier.fit(X=counts, y=targets)

Out [5]:

MultinomialNB()

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [6]:

# Two hand-written probe messages to sanity-check the trained model.
examples = [
    'Free Viagra now!!!',
    "Hi Bob, how about a game of golf tomorrow?",
]
# transform() (not fit_transform) so the probes are encoded with the
# vocabulary the vectorizer already learned from the training messages.
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)

# Print each predicted label next to the text it belongs to.
for label, text in zip(predictions, examples):
    print(f'{label}\t"{text}"')

spam	"Free Viagra now!!!"
ham	"Hi Bob, how about a game of golf tomorrow?"

Page Tags:

python

data-science

jupyter

learning

numpy