Created
February 23, 2017 01:40
-
-
Save jad2192/23ca08fa2053f2b8c8344fd0b03be6ed to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import numpy as np\n", | |
| "import matplotlib as mpl\n", | |
| "from matplotlib import pyplot as plt\n", | |
| "import pandas as pd\n", | |
| "import sklearn\n", | |
| "import seaborn as sns\n", | |
| "%matplotlib inline" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "data_df = pd.read_csv('/home/james/anaconda3/data/censusdata.csv',header=None)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Using the Census Income data set from the UCI Machine Learning Repository.\n", | |
| "https://archive.ics.uci.edu/ml/datasets/Adult\n", | |
| "\n", | |
| "The data is a mix of continuous and categorical features, described at the link above, and the goal is to predict whether an adult makes over $50,000 annually.\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>age</th>\n", | |
| " <th>workclass</th>\n", | |
| " <th>fnlwgt</th>\n", | |
| " <th>education</th>\n", | |
| " <th>education_num</th>\n", | |
| " <th>marital_status</th>\n", | |
| " <th>occupation</th>\n", | |
| " <th>relationship</th>\n", | |
| " <th>race</th>\n", | |
| " <th>sex</th>\n", | |
| " <th>capital_gain</th>\n", | |
| " <th>capital_loss</th>\n", | |
| " <th>hours_per_week</th>\n", | |
| " <th>native_country</th>\n", | |
| " <th>label</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>39</td>\n", | |
| " <td>State-gov</td>\n", | |
| " <td>77516</td>\n", | |
| " <td>Bachelors</td>\n", | |
| " <td>13</td>\n", | |
| " <td>Never-married</td>\n", | |
| " <td>Adm-clerical</td>\n", | |
| " <td>Not-in-family</td>\n", | |
| " <td>White</td>\n", | |
| " <td>Male</td>\n", | |
| " <td>2174</td>\n", | |
| " <td>0</td>\n", | |
| " <td>40</td>\n", | |
| " <td>United-States</td>\n", | |
| " <td><=50K</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>50</td>\n", | |
| " <td>Self-emp-not-inc</td>\n", | |
| " <td>83311</td>\n", | |
| " <td>Bachelors</td>\n", | |
| " <td>13</td>\n", | |
| " <td>Married-civ-spouse</td>\n", | |
| " <td>Exec-managerial</td>\n", | |
| " <td>Husband</td>\n", | |
| " <td>White</td>\n", | |
| " <td>Male</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>13</td>\n", | |
| " <td>United-States</td>\n", | |
| " <td><=50K</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>38</td>\n", | |
| " <td>Private</td>\n", | |
| " <td>215646</td>\n", | |
| " <td>HS-grad</td>\n", | |
| " <td>9</td>\n", | |
| " <td>Divorced</td>\n", | |
| " <td>Handlers-cleaners</td>\n", | |
| " <td>Not-in-family</td>\n", | |
| " <td>White</td>\n", | |
| " <td>Male</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>40</td>\n", | |
| " <td>United-States</td>\n", | |
| " <td><=50K</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>53</td>\n", | |
| " <td>Private</td>\n", | |
| " <td>234721</td>\n", | |
| " <td>11th</td>\n", | |
| " <td>7</td>\n", | |
| " <td>Married-civ-spouse</td>\n", | |
| " <td>Handlers-cleaners</td>\n", | |
| " <td>Husband</td>\n", | |
| " <td>Black</td>\n", | |
| " <td>Male</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>40</td>\n", | |
| " <td>United-States</td>\n", | |
| " <td><=50K</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>28</td>\n", | |
| " <td>Private</td>\n", | |
| " <td>338409</td>\n", | |
| " <td>Bachelors</td>\n", | |
| " <td>13</td>\n", | |
| " <td>Married-civ-spouse</td>\n", | |
| " <td>Prof-specialty</td>\n", | |
| " <td>Wife</td>\n", | |
| " <td>Black</td>\n", | |
| " <td>Female</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>40</td>\n", | |
| " <td>Cuba</td>\n", | |
| " <td><=50K</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " age workclass fnlwgt education education_num \\\n", | |
| "0 39 State-gov 77516 Bachelors 13 \n", | |
| "1 50 Self-emp-not-inc 83311 Bachelors 13 \n", | |
| "2 38 Private 215646 HS-grad 9 \n", | |
| "3 53 Private 234721 11th 7 \n", | |
| "4 28 Private 338409 Bachelors 13 \n", | |
| "\n", | |
| " marital_status occupation relationship race sex \\\n", | |
| "0 Never-married Adm-clerical Not-in-family White Male \n", | |
| "1 Married-civ-spouse Exec-managerial Husband White Male \n", | |
| "2 Divorced Handlers-cleaners Not-in-family White Male \n", | |
| "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", | |
| "4 Married-civ-spouse Prof-specialty Wife Black Female \n", | |
| "\n", | |
| " capital_gain capital_loss hours_per_week native_country label \n", | |
| "0 2174 0 40 United-States <=50K \n", | |
| "1 0 0 13 United-States <=50K \n", | |
| "2 0 0 40 United-States <=50K \n", | |
| "3 0 0 40 United-States <=50K \n", | |
| "4 0 0 40 Cuba <=50K " | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "data_df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "data_df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',\n", | |
| " 'occupation', 'relationship', 'race','sex', 'capital_gain', 'capital_loss', \n", | |
| " 'hours_per_week', 'native_country', 'label']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship',\n", | |
| " 'race', 'sex', 'native_country', 'label']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "data_df_dum = pd.get_dummies(data_df, columns = categorical_features, drop_first=1 )" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Using pandas' `get_dummies` method to one-hot encode the categorical features for use in the classification models." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 92, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>age</th>\n", | |
| " <th>fnlwgt</th>\n", | |
| " <th>education_num</th>\n", | |
| " <th>capital_gain</th>\n", | |
| " <th>capital_loss</th>\n", | |
| " <th>hours_per_week</th>\n", | |
| " <th>workclass_ Federal-gov</th>\n", | |
| " <th>workclass_ Local-gov</th>\n", | |
| " <th>workclass_ Never-worked</th>\n", | |
| " <th>workclass_ Private</th>\n", | |
| " <th>...</th>\n", | |
| " <th>native_country_ Puerto-Rico</th>\n", | |
| " <th>native_country_ Scotland</th>\n", | |
| " <th>native_country_ South</th>\n", | |
| " <th>native_country_ Taiwan</th>\n", | |
| " <th>native_country_ Thailand</th>\n", | |
| " <th>native_country_ Trinadad&Tobago</th>\n", | |
| " <th>native_country_ United-States</th>\n", | |
| " <th>native_country_ Vietnam</th>\n", | |
| " <th>native_country_ Yugoslavia</th>\n", | |
| " <th>label_ >50K</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>39</td>\n", | |
| " <td>77516</td>\n", | |
| " <td>13</td>\n", | |
| " <td>2174</td>\n", | |
| " <td>0</td>\n", | |
| " <td>40</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>...</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>50</td>\n", | |
| " <td>83311</td>\n", | |
| " <td>13</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>13</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>...</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>38</td>\n", | |
| " <td>215646</td>\n", | |
| " <td>9</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>40</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>...</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>53</td>\n", | |
| " <td>234721</td>\n", | |
| " <td>7</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>40</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>...</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>28</td>\n", | |
| " <td>338409</td>\n", | |
| " <td>13</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>40</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>...</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>5 rows × 101 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " age fnlwgt education_num capital_gain capital_loss hours_per_week \\\n", | |
| "0 39 77516 13 2174 0 40 \n", | |
| "1 50 83311 13 0 0 13 \n", | |
| "2 38 215646 9 0 0 40 \n", | |
| "3 53 234721 7 0 0 40 \n", | |
| "4 28 338409 13 0 0 40 \n", | |
| "\n", | |
| " workclass_ Federal-gov workclass_ Local-gov workclass_ Never-worked \\\n", | |
| "0 0.0 0.0 0.0 \n", | |
| "1 0.0 0.0 0.0 \n", | |
| "2 0.0 0.0 0.0 \n", | |
| "3 0.0 0.0 0.0 \n", | |
| "4 0.0 0.0 0.0 \n", | |
| "\n", | |
| " workclass_ Private ... native_country_ Puerto-Rico \\\n", | |
| "0 0.0 ... 0.0 \n", | |
| "1 0.0 ... 0.0 \n", | |
| "2 1.0 ... 0.0 \n", | |
| "3 1.0 ... 0.0 \n", | |
| "4 1.0 ... 0.0 \n", | |
| "\n", | |
| " native_country_ Scotland native_country_ South native_country_ Taiwan \\\n", | |
| "0 0.0 0.0 0.0 \n", | |
| "1 0.0 0.0 0.0 \n", | |
| "2 0.0 0.0 0.0 \n", | |
| "3 0.0 0.0 0.0 \n", | |
| "4 0.0 0.0 0.0 \n", | |
| "\n", | |
| " native_country_ Thailand native_country_ Trinadad&Tobago \\\n", | |
| "0 0.0 0.0 \n", | |
| "1 0.0 0.0 \n", | |
| "2 0.0 0.0 \n", | |
| "3 0.0 0.0 \n", | |
| "4 0.0 0.0 \n", | |
| "\n", | |
| " native_country_ United-States native_country_ Vietnam \\\n", | |
| "0 1.0 0.0 \n", | |
| "1 1.0 0.0 \n", | |
| "2 1.0 0.0 \n", | |
| "3 1.0 0.0 \n", | |
| "4 0.0 0.0 \n", | |
| "\n", | |
| " native_country_ Yugoslavia label_ >50K \n", | |
| "0 0.0 0.0 \n", | |
| "1 0.0 0.0 \n", | |
| "2 0.0 0.0 \n", | |
| "3 0.0 0.0 \n", | |
| "4 0.0 0.0 \n", | |
| "\n", | |
| "[5 rows x 101 columns]" | |
| ] | |
| }, | |
| "execution_count": 92, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "data_df_dum.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.model_selection import train_test_split\n", | |
| "from sklearn.ensemble import GradientBoostingClassifier\n", | |
| "from sklearn.ensemble import RandomForestClassifier\n", | |
| "from sklearn.metrics import accuracy_score\n", | |
| "from sklearn.metrics import confusion_matrix" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 34, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "X = data_df_dum.ix[:,:100].as_matrix()\n", | |
| "Y = data_df_dum['label_ >50K'].as_matrix()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 37, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.33, random_state=5 )" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 117, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "clf_1 = GradientBoostingClassifier(learning_rate=0.009, n_estimators=1200, subsample=0.95, verbose=1,\n", | |
| " max_depth=6)\n", | |
| "clf_2 = RandomForestClassifier(n_estimators=500,verbose=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 118, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " Iter Train Loss OOB Improve Remaining Time \n", | |
| " 1 1.0966 0.0077 3.39m\n", | |
| " 2 1.0883 0.0079 3.42m\n", | |
| " 3 1.0783 0.0077 3.36m\n", | |
| " 4 1.0722 0.0074 3.37m\n", | |
| " 5 1.0652 0.0077 3.34m\n", | |
| " 6 1.0585 0.0073 3.35m\n", | |
| " 7 1.0519 0.0064 3.32m\n", | |
| " 8 1.0445 0.0065 3.32m\n", | |
| " 9 1.0394 0.0061 3.31m\n", | |
| " 10 1.0330 0.0058 3.31m\n", | |
| " 20 0.9723 0.0046 3.27m\n", | |
| " 30 0.9252 0.0039 3.23m\n", | |
| " 40 0.8870 0.0035 3.21m\n", | |
| " 50 0.8493 0.0032 3.18m\n", | |
| " 60 0.8220 0.0026 3.15m\n", | |
| " 70 0.7944 0.0026 3.12m\n", | |
| " 80 0.7741 0.0020 3.09m\n", | |
| " 90 0.7561 0.0018 3.06m\n", | |
| " 100 0.7357 0.0016 3.04m\n", | |
| " 200 0.6282 0.0007 2.79m\n", | |
| " 300 0.5797 0.0002 2.50m\n", | |
| " 400 0.5538 0.0002 2.18m\n", | |
| " 500 0.5334 0.0001 1.88m\n", | |
| " 600 0.5174 0.0001 1.58m\n", | |
| " 700 0.5040 0.0000 1.30m\n", | |
| " 800 0.4923 -0.0000 1.02m\n", | |
| " 900 0.4838 -0.0001 45.49s\n", | |
| " 1000 0.4747 -0.0000 29.95s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "GradientBoostingClassifier(criterion='friedman_mse', init=None,\n", | |
| " learning_rate=0.009, loss='deviance', max_depth=6,\n", | |
| " max_features=None, max_leaf_nodes=None,\n", | |
| " min_impurity_split=1e-07, min_samples_leaf=1,\n", | |
| " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", | |
| " n_estimators=1200, presort='auto', random_state=None,\n", | |
| " subsample=0.95, verbose=1, warm_start=False)" | |
| ] | |
| }, | |
| "execution_count": 118, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "clf_1.fit(X_train, y_train)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 119, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "The error rate on the hold out set is : 12.58 %\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print('The error rate on the hold out set is :', \n", | |
| " round(100 - 100 * accuracy_score(y_test, clf_1.predict(X_test)),2), '%')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 83, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 8.6s finished\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", | |
| " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", | |
| " min_impurity_split=1e-07, min_samples_leaf=1,\n", | |
| " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", | |
| " n_estimators=500, n_jobs=1, oob_score=False, random_state=None,\n", | |
| " verbose=1, warm_start=False)" | |
| ] | |
| }, | |
| "execution_count": 83, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "clf_2.fit(X_train, y_train)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 84, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "The error rate on the hold out set is : 14.41 %\n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 0.8s finished\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print('The error rate on the hold out set is :', \n", | |
| " round(100 - 100 * accuracy_score(y_test, clf_2.predict(X_test)),2), '%')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 120, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "confusion_grad = pd.DataFrame(confusion_matrix(y_test, clf_1.predict(X_test)))\n", | |
| "confusion_grad.columns = ['Predicted: Under 50K', 'Predicted: Over 50K']\n", | |
| "confusion_grad.index = ['Actual: Under 50K', 'Actual: Over 50K']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 121, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>Predicted: Under 50K</th>\n", | |
| " <th>Predicted: Over 50K</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>Actual: Under 50K</th>\n", | |
| " <td>7736</td>\n", | |
| " <td>421</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>Actual: Over 50K</th>\n", | |
| " <td>931</td>\n", | |
| " <td>1658</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " Predicted: Under 50K Predicted: Over 50K\n", | |
| "Actual: Under 50K 7736 421\n", | |
| "Actual: Over 50K 931 1658" | |
| ] | |
| }, | |
| "execution_count": 121, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "confusion_grad" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 85, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 0.8s finished\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "confusion_rf = pd.DataFrame(confusion_matrix(y_test, clf_2.predict(X_test)))\n", | |
| "confusion_rf.columns = ['Predicted: Under 50K', 'Predicted: Over 50K']\n", | |
| "confusion_rf.index = ['Actual: Under 50K', 'Actual: Over 50K']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 86, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>Predicted: Under 50K</th>\n", | |
| " <th>Predicted: Over 50K</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>Actual: Under 50K</th>\n", | |
| " <td>7596</td>\n", | |
| " <td>561</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>Actual: Over 50K</th>\n", | |
| " <td>988</td>\n", | |
| " <td>1601</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " Predicted: Under 50K Predicted: Over 50K\n", | |
| "Actual: Under 50K 7596 561\n", | |
| "Actual: Over 50K 988 1601" | |
| ] | |
| }, | |
| "execution_count": 86, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "confusion_rf" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "anaconda-cloud": {}, | |
| "kernelspec": { | |
| "display_name": "Python [conda root]", | |
| "language": "python", | |
| "name": "conda-root-py" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 1 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment