This is the final project of the Udacity class "Intro to Machine Learning". The goal is to build an algorithm that identifies Enron employees who may have committed fraud, based on the public Enron financial and email dataset.
This notebook relies on starter code provided by the class. Put this notebook in the final_project folder and it should work.
import pickle
import sys
sys.path.append("../tools/")
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
### Also convert to dataframe for easy manipulation.
import pandas as pd
import numpy as np
df = pd.DataFrame.from_dict(data_dict, orient='index')
for column in df.columns:
    if column == 'email_address':
        continue
    # The raw data marks missing values with the string 'NaN'; convert them to 0.
    df[column] = df[column].apply(lambda x: 0 if x == 'NaN' else float(x))
I chose to select all the available features at this step.
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi',
                 'salary', 'deferral_payments', 'total_payments',
                 'exercised_stock_options', 'bonus', 'restricted_stock',
                 'total_stock_value', 'expenses', 'loan_advances',
                 'director_fees', 'deferred_income', 'long_term_incentive',
                 'to_messages', 'shared_receipt_with_poi', 'from_messages',
                 'from_this_person_to_poi', 'from_poi_to_this_person']
Use histograms to identify outliers, starting with salary.
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
df['salary'].plot.hist()
There seems to be an outlier: a salary of about 25 million dollars is far too high for a single person. Let's find it.
df['salary'].idxmax()
# Drop the 'TOTAL' row (a spreadsheet aggregate, not a person) and replot.
df = df.drop('TOTAL')
df['salary'].plot.hist()
# Let's check some high salaries.
df[df['salary'] > 800000]
The results seem reasonable: Kenneth Lay was CEO of Enron, Mark Frevert was CEO of Enron Wholesale Services, and Jeffrey Skilling was a former CEO of Enron. Let's continue checking the other compensation data.
df[['deferral_payments', 'total_payments',
    'exercised_stock_options', 'bonus', 'restricted_stock',
    'total_stock_value', 'expenses', 'loan_advances',
    'director_fees', 'deferred_income', 'long_term_incentive']].hist(figsize=(20, 15))
# Let's double-check the entries with very high total payments.
df[df['total_payments'] > 80000000]
That CEO Kenneth Lay received a high total payment seems reasonable. Continue with the email data.
df[['to_messages', 'shared_receipt_with_poi', 'from_messages',
    'from_this_person_to_poi', 'from_poi_to_this_person']].hist(figsize=(20, 15))
No obvious outliers in the email data. Continue with feature selection and engineering.
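Before engineering new features, a quick univariate check gives a rough sense of which existing features carry signal on their own. This is a minimal sketch of my own (not part of the class starter code) using sklearn's SelectKBest with the default ANOVA F-test; it assumes the df and features_list built above.
from sklearn.feature_selection import SelectKBest, f_classif
# Score each feature against the poi label; a higher F-score means a stronger univariate signal.
X = df[features_list[1:]].fillna(0).values
y = df['poi'].astype(int).values
selector = SelectKBest(f_classif, k='all').fit(X, y)
for name, score in sorted(zip(features_list[1:], selector.scores_),
                          key=lambda pair: -pair[1]):
    print name, score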
# My first thought was to build a TF-IDF matrix from the email text. However, it turns
# out the email data is very incomplete: we only have emails for 19 of the 145 people
# in the dataset.
#
# There are two sets of email data provided:
#
# 1. Email folders. For each person whose email we have, there is a folder named
#    LastName-FirstLetterOfFirstName. For example:
#        maildir/
#            allen-p/
#                inbox/
#                sent/
#
# 2. Email index files: for each person, we might have files named
#    from_first.last@enron.com.txt and to_first.last@enron.com.txt under
#    emails_by_address/, listing the messages they sent and received.
#
# The code below counts how many people actually have each kind of data.
def name_to_folder_name(last_first_middle):
    """Map a dataset name like 'ALLEN PHILLIP K' to a maildir folder name like 'allen-p'."""
    parts = last_first_middle.split(' ')
    if len(parts) < 2:
        return ''
    return '-'.join([parts[0].lower(), parts[1][0].lower()])

def name_to_file_name(last_first_middle):
    """Map a dataset name like 'ALLEN PHILLIP K' to an index file suffix like 'phillip.allen@enron.com.txt'."""
    parts = last_first_middle.split(' ')
    if len(parts) < 2:
        return ''
    return parts[1].lower() + '.' + parts[0].lower() + '@enron.com.txt'
import os

name_count = 0
folder_count = 0
file_count = 0
for name in df.index.values:
    if os.path.isdir('../maildir/' + name_to_folder_name(name)):
        folder_count = folder_count + 1
    file_name_suffix = name_to_file_name(name)
    if (os.path.exists('emails_by_address/' + 'from_' + file_name_suffix) or
            os.path.exists('emails_by_address/' + 'to_' + file_name_suffix)):
        file_count = file_count + 1
    name_count = name_count + 1
print folder_count, file_count, name_count
## My second idea is to scale the number of POI-related emails by the total number of emails.
# People with no email data have zero messages, so the ratio is NaN here; those NaNs are
# filled with 0 when my_dataset is built below.
df['ratio_of_poi_related_email'] = ((df['from_poi_to_this_person'] + df['from_this_person_to_poi'])
                                    / (df['to_messages'] + df['from_messages']))
# Also add to feature_list.
if 'ratio_of_poi_related_email' not in features_list:
    features_list.append('ratio_of_poi_related_email')
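A quick sanity check on the new feature: if it is useful, POIs should show a noticeably higher ratio on average than non-POIs. A one-line sketch, assuming the df built above.
# Compare the mean ratio for POIs vs. non-POIs (NaN rows are skipped by mean()).
print df.groupby('poi')['ratio_of_poi_related_email'].mean()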
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
import tester
### Store to my_dataset for easy export below.
my_dataset = df.fillna(0).to_dict(orient='index')
from sklearn import naive_bayes
clf = naive_bayes.GaussianNB()
tester.test_classifier(clf, my_dataset, features_list)
from sklearn import tree
clf = tree.DecisionTreeClassifier()
tester.test_classifier(clf, my_dataset, features_list)
from sklearn import ensemble
clf = ensemble.RandomForestClassifier()
tester.test_classifier(clf, my_dataset, features_list)
from sklearn import ensemble
clf = ensemble.RandomForestClassifier(criterion='entropy')
tester.test_classifier(clf, my_dataset, features_list)
from sklearn import ensemble
clf = ensemble.RandomForestClassifier(min_samples_split=20)
tester.test_classifier(clf, my_dataset, features_list)
from sklearn.pipeline import Pipeline
from sklearn import ensemble
from sklearn.decomposition import RandomizedPCA  # renamed PCA(svd_solver='randomized') in sklearn >= 0.18
estimators = [('reduce_dim', RandomizedPCA(n_components=5)),
              ('rf', ensemble.RandomForestClassifier(min_samples_split=20))]
clf = Pipeline(estimators)
tester.test_classifier(clf, my_dataset, features_list)
from sklearn.pipeline import Pipeline
from sklearn import ensemble
from sklearn.decomposition import RandomizedPCA
estimators = [('reduce_dim', RandomizedPCA(n_components=5)),
              ('ada', ensemble.AdaBoostClassifier())]
clf = Pipeline(estimators)
tester.test_classifier(clf, my_dataset, features_list)
from sklearn import ensemble
clf = ensemble.AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                  learning_rate=2.0, n_estimators=50, random_state=None)
tester.test_classifier(clf, my_dataset, features_list)
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
from sklearn import ensemble
clf = ensemble.AdaBoostClassifier(n_estimators=50)
tester.test_classifier(clf, my_dataset, features_list)
from sklearn import ensemble
clf = ensemble.AdaBoostClassifier(n_estimators=100)
tester.test_classifier(clf, my_dataset, features_list)
from sklearn import ensemble
clf = ensemble.AdaBoostClassifier(n_estimators=200)
tester.test_classifier(clf, my_dataset, features_list)
from sklearn import ensemble
clf = ensemble.AdaBoostClassifier(n_estimators=75)
tester.test_classifier(clf, my_dataset, features_list)
So n_estimators=75 gives precision 0.39 and recall 0.30, which clears the 0.3 bar for both metrics.
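The manual sweep above can also be phrased as a grid search using the same stratified shuffle split validation that tester.py relies on. This is a sketch under the notebook's pre-0.18 sklearn (sklearn.cross_validation / sklearn.grid_search) and uses the featureFormat and targetFeatureSplit helpers from the course's feature_format module; its F1-based scores will differ slightly from test_classifier's numbers.
from feature_format import featureFormat, targetFeatureSplit
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV

# Same data preparation that tester.test_classifier performs internally.
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
cv = StratifiedShuffleSplit(labels, n_iter=100, random_state=42)
search = GridSearchCV(ensemble.AdaBoostClassifier(),
                      param_grid={'n_estimators': [50, 75, 100, 200]},
                      scoring='f1', cv=cv)
search.fit(np.array(features), np.array(labels))
print search.best_params_, search.best_score_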
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
from tester import dump_classifier_and_data
dump_classifier_and_data(clf, my_dataset, features_list)