Naive Bayes

Table Of Contents

sklearn.naive_bayes can be used to train something like a spam classifier.

In [1]:
import os
import io
import numpy
import pandas as pd
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
In [2]:
def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)

            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)
In [3]:
data = DataFrame({'message': [], 'class': []})

data = pd.concat([data, dataFrameFromDirectory("emails/spam", "spam")]);
data = pd.concat([data, dataFrameFromDirectory("emails/ham", "ham")])
# INSPECT the dataframe
data.head()
Out [3]:
message class
emails/spam/00249.5f45607c1bffe89f60ba1ec9f878039a Dear Homeowner,\n\n \n\nInterest Rates are at ... spam
emails/spam/00373.ebe8670ac56b04125c25100a36ab0510 ATTENTION: This is a MUST for ALL Computer Use... spam
emails/spam/00214.1367039e50dc6b7adb0f2aa8aba83216 This is a multi-part message in MIME format.\n... spam
emails/spam/00210.050ffd105bd4e006771ee63cabc59978 IMPORTANT INFORMATION:\n\n\n\nThe new domain n... spam
emails/spam/00033.9babb58d9298daa2963d4f514193d7d6 This is the bottom line. If you can GIVE AWAY... spam

Vectorize Text with CountVectorize

SkLearn CountVectorizer DOCS
Split up each message into its list of words.
Store the vectorized data in a MultinomialNB classifier.
Call fit() and voila - a trained spam filter.

In [4]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)
targets = data['class'].values

print(f'first target: {targets[:1]}\n')
print(f'counts: {counts}\n')
print(f'get_feature_names_out: {vectorizer.get_feature_names_out()}\n')
print(f'first 5 feature names...: {vectorizer.get_feature_names_out()[:5]}')
print(counts.toarray())
first target: ['spam']

counts:   (0, 18974)	1
  (0, 28546)	1
  (0, 30958)	1
  (0, 46065)	1
  (0, 11302)	1
  (0, 11640)	1
  (0, 53242)	1
  (0, 35476)	1
  (0, 43408)	1
  (0, 30375)	1
  (0, 4138)	1
  (0, 60488)	1
  (0, 58215)	1
  (0, 27983)	1
  (0, 60793)	1
  (0, 24338)	1
  (0, 53220)	1
  (0, 13116)	1
  (0, 46062)	1
  (0, 24806)	1
  (0, 60804)	3
  (0, 50029)	1
  (0, 14755)	1
  (0, 36465)	1
  (0, 38861)	1
  :	:
  (2999, 11667)	1
  (2999, 24048)	1
  (2999, 58322)	1
  (2999, 43557)	1
  (2999, 38269)	1
  (2999, 6449)	1
  (2999, 24914)	3
  (2999, 21921)	1
  (2999, 16018)	1
  (2999, 48790)	3
  (2999, 15083)	5
  (2999, 15084)	3
  (2999, 51051)	1
  (2999, 33821)	1
  (2999, 40201)	1
  (2999, 53080)	2
  (2999, 25831)	1
  (2999, 49617)	1
  (2999, 15301)	1
  (2999, 17127)	1
  (2999, 38952)	1
  (2999, 16507)	1
  (2999, 7860)	1
  (2999, 52206)	1
  (2999, 37101)	1

get_feature_names_out: ['00' '000' '0000' ... 'þõµ' 'þüg'
 'ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿó']

first 5 feature names...: ['00' '000' '0000' '000000' '000000000']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 4 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
In [5]:
classifier = MultinomialNB()
classifier.fit(counts, targets)
Out [5]:
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
In [6]:
examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)

for idx, exampleText in enumerate(examples):
    # print(f'Text: {exampleText} is probably a {predictions[exampleIdx]}')
    print(f'{predictions[idx]}\t"{exampleText}"')
spam	"Free Viagra now!!!"
ham	"Hi Bob, how about a game of golf tomorrow?"
Page Tags:
python
data-science
jupyter
learning
numpy