Email Spam Detection
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
df = pd.read_csv("data.csv")
df.head()
|   | text | spam |
|---|------|------|
| 0 | Subject: naturally irresistible your corporate... | 1 |
| 1 | Subject: the stock trading gunslinger fanny i... | 1 |
| 2 | Subject: unbelievable new homes made easy im ... | 1 |
| 3 | Subject: 4 color printing special request add... | 1 |
| 4 | Subject: do not have money , get software cds ... | 1 |
df.shape
(5728, 2)
df.columns
Index(['text', 'spam'], dtype='object')
df.drop_duplicates(inplace=True)
print(df.shape)
(5695, 2)
print(df.isnull().sum())
text    0
spam    0
dtype: int64
# download the stopwords package
nltk.download("stopwords")
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/webtunix/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
True
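The stopword list that gets filtered out below is just a list of very common English words. A quick peek (an illustrative snippet, not part of the original notebook; the exact contents depend on your NLTK version):

# first entries of NLTK's English stopword list, e.g. 'i', 'me', 'my', 'myself', ...
print(stopwords.words('english')[:10])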
def process(text):
    # strip punctuation characters
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    # drop English stopwords and keep the remaining tokens
    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    return clean

# to show the tokenization
df['text'].head().apply(process)
0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object
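A quick sanity check of process() on a made-up sentence (an illustrative example, not in the original notebook) shows punctuation being stripped and stopwords such as "a", "to" and "this" being dropped; the exact output depends on the NLTK stopword list version:

sample = "Subject: Win a FREE prize now! Reply to this email today."
print(process(sample))
# roughly: ['Subject', 'Win', 'FREE', 'prize', 'Reply', 'email', 'today']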
from sklearn.feature_extraction.text import CountVectorizer

message = CountVectorizer(analyzer=process).fit_transform(df['text'])
# split the data into 80% training and 20% testing
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(message, df['spam'], test_size=0.20, random_state=0)

# to see the shape of the data
print(message.shape)
(5695, 37229)
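The shape means CountVectorizer built a bag-of-words (document-term) matrix: one row per email (5695) and one column per distinct token found by process() (37229), where each cell holds a token count. A toy sketch on a made-up three-document corpus illustrates the idea (the variable names here are only for illustration; get_feature_names_out requires scikit-learn >= 1.0, older releases use get_feature_names):

toy = ["free money now", "meeting notes attached", "free meeting"]   # made-up mini corpus
cv = CountVectorizer()
bow = cv.fit_transform(toy)
print(cv.get_feature_names_out())   # the learned vocabulary = the matrix columns
print(bow.toarray())                # one row per document, one count per vocabulary word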
# create and train the Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB().fit(xtrain, ytrain)
print(classifier.predict(xtrain))
print(ytrain.values)
[0 0 0 ... 0 0 0] [0 0 0 ... 0 0 0]
# Evaluating the model on the training data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

pred = classifier.predict(xtrain)
print(classification_report(ytrain, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytrain, pred))
print("Accuracy: \n", accuracy_score(ytrain, pred))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       0.99      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556


Confusion Matrix:
 [[3445   12]
 [   1 1098]]
Accuracy:
 0.9971466198419666
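The scores in the report can be re-derived by hand from the confusion matrix. A small check (an illustrative snippet, not part of the original notebook) for the spam class (label 1) on the training split:

# confusion matrix layout: rows = actual class, columns = predicted class
# [[3445   12]   -> 3445 hams kept, 12 hams flagged as spam
#  [   1 1098]]  ->    1 spam missed, 1098 spams caught
tp, fp, fn = 1098, 12, 1
precision = tp / (tp + fp)                            # 1098 / 1110 ≈ 0.989 (reported as 0.99)
recall    = tp / (tp + fn)                            # 1098 / 1099 ≈ 0.999 (reported as 1.00)
accuracy  = (3445 + 1098) / (3445 + 12 + 1 + 1098)    # 4543 / 4556 ≈ 0.9971
print(precision, recall, accuracy)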
# print the predictions
print(classifier.predict(xtest))
# print the actual values
print(ytest.values)
[1 0 0 ... 0 0 0] [1 0 0 ... 0 0 0]
# Evaluating the model on the test data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

pred = classifier.predict(xtest)
print(classification_report(ytest, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytest, pred))
print("Accuracy: \n", accuracy_score(ytest, pred))
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139


Confusion Matrix:
 [[862   8]
 [  1 268]]
Accuracy:
 0.9920983318700615
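One practical note: because the features were built with a one-shot fit_transform, the fitted CountVectorizer object was not kept, so the model cannot score a brand-new email as written. A minimal sketch of how that could be done, assuming you keep the fitted vectorizer around (the names vectorizer, model and new_email are illustrative, not from the original notebook; training on the full labelled set is one possible choice for a deployed model):

# refit the same bag-of-words features, but keep the vectorizer object this time
vectorizer = CountVectorizer(analyzer=process)
features = vectorizer.fit_transform(df['text'])
model = MultinomialNB().fit(features, df['spam'])

new_email = ["Subject: claim your free reward today"]   # made-up example
print(model.predict(vectorizer.transform(new_email)))   # 1 = spam, 0 = not spam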