jad2192 · February 23, 2017 01:40
diff --git a/Census_Income_Model.ipynb b/Census_Income_Model.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib as mpl\n",
    "from matplotlib import pyplot as plt\n",
    "import pandas as pd\n",
    "import sklearn\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_df = pd.read_csv('/home/james/anaconda3/data/censusdata.csv',header=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using the Census Income data set from UCI Machine learning repository.\n",
    "https://archive.ics.uci.edu/ml/datasets/Adult\n",
    "\n",
    "The data is a mix of various continuous and categorical features to be described in the above link and the goal is predict whether the adult makes over $50,000 annually.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>workclass</th>\n",
       "      <th>fnlwgt</th>\n",
       "      <th>education</th>\n",
       "      <th>education_num</th>\n",
       "      <th>marital_status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>sex</th>\n",
       "      <th>capital_gain</th>\n",
       "      <th>capital_loss</th>\n",
       "      <th>hours_per_week</th>\n",
       "      <th>native_country</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>39</td>\n",
       "      <td>State-gov</td>\n",
       "      <td>77516</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>2174</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>50</td>\n",
       "      <td>Self-emp-not-inc</td>\n",
       "      <td>83311</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>13</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38</td>\n",
       "      <td>Private</td>\n",
       "      <td>215646</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Handlers-cleaners</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>53</td>\n",
       "      <td>Private</td>\n",
       "      <td>234721</td>\n",
       "      <td>11th</td>\n",
       "      <td>7</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Handlers-cleaners</td>\n",
       "      <td>Husband</td>\n",
       "      <td>Black</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>28</td>\n",
       "      <td>Private</td>\n",
       "      <td>338409</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Wife</td>\n",
       "      <td>Black</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>Cuba</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   age          workclass  fnlwgt   education  education_num  \\\n",
       "0   39          State-gov   77516   Bachelors             13   \n",
       "1   50   Self-emp-not-inc   83311   Bachelors             13   \n",
       "2   38            Private  215646     HS-grad              9   \n",
       "3   53            Private  234721        11th              7   \n",
       "4   28            Private  338409   Bachelors             13   \n",
       "\n",
       "        marital_status          occupation    relationship    race      sex  \\\n",
       "0        Never-married        Adm-clerical   Not-in-family   White     Male   \n",
       "1   Married-civ-spouse     Exec-managerial         Husband   White     Male   \n",
       "2             Divorced   Handlers-cleaners   Not-in-family   White     Male   \n",
       "3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   \n",
       "4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   \n",
       "\n",
       "   capital_gain  capital_loss  hours_per_week  native_country   label  \n",
       "0          2174             0              40   United-States   <=50K  \n",
       "1             0             0              13   United-States   <=50K  \n",
       "2             0             0              40   United-States   <=50K  \n",
       "3             0             0              40   United-States   <=50K  \n",
       "4             0             0              40            Cuba   <=50K  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data_df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',\n",
    "                   'occupation', 'relationship', 'race','sex', 'capital_gain', 'capital_loss', \n",
    "                   'hours_per_week', 'native_country', 'label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship',\n",
    "                       'race', 'sex', 'native_country', 'label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_df_dum = pd.get_dummies(data_df, columns = categorical_features, drop_first=1 )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using the pandas' method get_dummies to do a 'one-hot-encoding' of the categorical features to use in the classification model. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>fnlwgt</th>\n",
       "      <th>education_num</th>\n",
       "      <th>capital_gain</th>\n",
       "      <th>capital_loss</th>\n",
       "      <th>hours_per_week</th>\n",
       "      <th>workclass_ Federal-gov</th>\n",
       "      <th>workclass_ Local-gov</th>\n",
       "      <th>workclass_ Never-worked</th>\n",
       "      <th>workclass_ Private</th>\n",
       "      <th>...</th>\n",
       "      <th>native_country_ Puerto-Rico</th>\n",
       "      <th>native_country_ Scotland</th>\n",
       "      <th>native_country_ South</th>\n",
       "      <th>native_country_ Taiwan</th>\n",
       "      <th>native_country_ Thailand</th>\n",
       "      <th>native_country_ Trinadad&amp;Tobago</th>\n",
       "      <th>native_country_ United-States</th>\n",
       "      <th>native_country_ Vietnam</th>\n",
       "      <th>native_country_ Yugoslavia</th>\n",
       "      <th>label_ &gt;50K</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>39</td>\n",
       "      <td>77516</td>\n",
       "      <td>13</td>\n",
       "      <td>2174</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>50</td>\n",
       "      <td>83311</td>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>13</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38</td>\n",
       "      <td>215646</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>53</td>\n",
       "      <td>234721</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>28</td>\n",
       "      <td>338409</td>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 101 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   age  fnlwgt  education_num  capital_gain  capital_loss  hours_per_week  \\\n",
       "0   39   77516             13          2174             0              40   \n",
       "1   50   83311             13             0             0              13   \n",
       "2   38  215646              9             0             0              40   \n",
       "3   53  234721              7             0             0              40   \n",
       "4   28  338409             13             0             0              40   \n",
       "\n",
       "   workclass_ Federal-gov  workclass_ Local-gov  workclass_ Never-worked  \\\n",
       "0                     0.0                   0.0                      0.0   \n",
       "1                     0.0                   0.0                      0.0   \n",
       "2                     0.0                   0.0                      0.0   \n",
       "3                     0.0                   0.0                      0.0   \n",
       "4                     0.0                   0.0                      0.0   \n",
       "\n",
       "   workclass_ Private     ...       native_country_ Puerto-Rico  \\\n",
       "0                 0.0     ...                               0.0   \n",
       "1                 0.0     ...                               0.0   \n",
       "2                 1.0     ...                               0.0   \n",
       "3                 1.0     ...                               0.0   \n",
       "4                 1.0     ...                               0.0   \n",
       "\n",
       "   native_country_ Scotland  native_country_ South  native_country_ Taiwan  \\\n",
       "0                       0.0                    0.0                     0.0   \n",
       "1                       0.0                    0.0                     0.0   \n",
       "2                       0.0                    0.0                     0.0   \n",
       "3                       0.0                    0.0                     0.0   \n",
       "4                       0.0                    0.0                     0.0   \n",
       "\n",
       "   native_country_ Thailand  native_country_ Trinadad&Tobago  \\\n",
       "0                       0.0                              0.0   \n",
       "1                       0.0                              0.0   \n",
       "2                       0.0                              0.0   \n",
       "3                       0.0                              0.0   \n",
       "4                       0.0                              0.0   \n",
       "\n",
       "   native_country_ United-States  native_country_ Vietnam  \\\n",
       "0                            1.0                      0.0   \n",
       "1                            1.0                      0.0   \n",
       "2                            1.0                      0.0   \n",
       "3                            1.0                      0.0   \n",
       "4                            0.0                      0.0   \n",
       "\n",
       "   native_country_ Yugoslavia  label_ >50K  \n",
       "0                         0.0          0.0  \n",
       "1                         0.0          0.0  \n",
       "2                         0.0          0.0  \n",
       "3                         0.0          0.0  \n",
       "4                         0.0          0.0  \n",
       "\n",
       "[5 rows x 101 columns]"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_df_dum.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import confusion_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X = data_df_dum.ix[:,:100].as_matrix()\n",
    "Y = data_df_dum['label_ >50K'].as_matrix()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.33, random_state=5 )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "clf_1 = GradientBoostingClassifier(learning_rate=0.009, n_estimators=1200, subsample=0.95, verbose=1,\n",
    "                                  max_depth=6)\n",
    "clf_2 = RandomForestClassifier(n_estimators=500,verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      Iter       Train Loss      OOB Improve   Remaining Time \n",
      "         1           1.0966           0.0077            3.39m\n",
      "         2           1.0883           0.0079            3.42m\n",
      "         3           1.0783           0.0077            3.36m\n",
      "         4           1.0722           0.0074            3.37m\n",
      "         5           1.0652           0.0077            3.34m\n",
      "         6           1.0585           0.0073            3.35m\n",
      "         7           1.0519           0.0064            3.32m\n",
      "         8           1.0445           0.0065            3.32m\n",
      "         9           1.0394           0.0061            3.31m\n",
      "        10           1.0330           0.0058            3.31m\n",
      "        20           0.9723           0.0046            3.27m\n",
      "        30           0.9252           0.0039            3.23m\n",
      "        40           0.8870           0.0035            3.21m\n",
      "        50           0.8493           0.0032            3.18m\n",
      "        60           0.8220           0.0026            3.15m\n",
      "        70           0.7944           0.0026            3.12m\n",
      "        80           0.7741           0.0020            3.09m\n",
      "        90           0.7561           0.0018            3.06m\n",
      "       100           0.7357           0.0016            3.04m\n",
      "       200           0.6282           0.0007            2.79m\n",
      "       300           0.5797           0.0002            2.50m\n",
      "       400           0.5538           0.0002            2.18m\n",
      "       500           0.5334           0.0001            1.88m\n",
      "       600           0.5174           0.0001            1.58m\n",
      "       700           0.5040           0.0000            1.30m\n",
      "       800           0.4923          -0.0000            1.02m\n",
      "       900           0.4838          -0.0001           45.49s\n",
      "      1000           0.4747          -0.0000           29.95s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
       "              learning_rate=0.009, loss='deviance', max_depth=6,\n",
       "              max_features=None, max_leaf_nodes=None,\n",
       "              min_impurity_split=1e-07, min_samples_leaf=1,\n",
       "              min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "              n_estimators=1200, presort='auto', random_state=None,\n",
       "              subsample=0.95, verbose=1, warm_start=False)"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf_1.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The error rate on the hold out set is : 12.58 %\n"
     ]
    }
   ],
   "source": [
    "print('The error rate on the hold out set is :', \n",
    "      round(100 - 100 * accuracy_score(y_test, clf_1.predict(X_test)),2), '%')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    8.6s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
       "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
       "            min_impurity_split=1e-07, min_samples_leaf=1,\n",
       "            min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,\n",
       "            verbose=1, warm_start=False)"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf_2.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The error rate on the hold out set is : 14.41 %\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.8s finished\n"
     ]
    }
   ],
   "source": [
    "print('The error rate on the hold out set is :', \n",
    "      round(100 - 100 * accuracy_score(y_test, clf_2.predict(X_test)),2), '%')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "confusion_grad = pd.DataFrame(confusion_matrix(y_test, clf_1.predict(X_test)))\n",
    "confusion_grad.columns = ['Predicted: Under 50K', 'Predicted: Over 50K']\n",
    "confusion_grad.index = ['Actual: Under 50K', 'Actual: Over 50K']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Predicted: Under 50K</th>\n",
       "      <th>Predicted: Over 50K</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Actual: Under 50K</th>\n",
       "      <td>7736</td>\n",
       "      <td>421</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Actual: Over 50K</th>\n",
       "      <td>931</td>\n",
       "      <td>1658</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   Predicted: Under 50K  Predicted: Over 50K\n",
       "Actual: Under 50K                  7736                  421\n",
       "Actual: Over 50K                    931                 1658"
      ]
     },
     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "confusion_grad"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.8s finished\n"
     ]
    }
   ],
   "source": [
    "confusion_rf = pd.DataFrame(confusion_matrix(y_test, clf_2.predict(X_test)))\n",
    "confusion_rf.columns = ['Predicted: Under 50K', 'Predicted: Over 50K']\n",
    "confusion_rf.index = ['Actual: Under 50K', 'Actual: Over 50K']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Predicted: Under 50K</th>\n",
       "      <th>Predicted: Over 50K</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Actual: Under 50K</th>\n",
       "      <td>7596</td>\n",
       "      <td>561</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Actual: Over 50K</th>\n",
       "      <td>988</td>\n",
       "      <td>1601</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   Predicted: Under 50K  Predicted: Over 50K\n",
       "Actual: Under 50K                  7596                  561\n",
       "Actual: Over 50K                    988                 1601"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "confusion_rf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import matplotlib as mpl\n",
	"from matplotlib import pyplot as plt\n",
	"import pandas as pd\n",
	"import sklearn\n",
	"import seaborn as sns\n",
	"%matplotlib inline"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"data_df = pd.read_csv('/home/james/anaconda3/data/censusdata.csv',header=None)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Using the Census Income data set from UCI Machine learning repository.\n",
	"https://archive.ics.uci.edu/ml/datasets/Adult\n",
	"\n",
	"The data is a mix of various continuous and categorical features to be described in the above link and the goal is predict whether the adult makes over $50,000 annually.\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>age</th>\n",
	" <th>workclass</th>\n",
	" <th>fnlwgt</th>\n",
	" <th>education</th>\n",
	" <th>education_num</th>\n",
	" <th>marital_status</th>\n",
	" <th>occupation</th>\n",
	" <th>relationship</th>\n",
	" <th>race</th>\n",
	" <th>sex</th>\n",
	" <th>capital_gain</th>\n",
	" <th>capital_loss</th>\n",
	" <th>hours_per_week</th>\n",
	" <th>native_country</th>\n",
	" <th>label</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>39</td>\n",
	" <td>State-gov</td>\n",
	" <td>77516</td>\n",
	" <td>Bachelors</td>\n",
	" <td>13</td>\n",
	" <td>Never-married</td>\n",
	" <td>Adm-clerical</td>\n",
	" <td>Not-in-family</td>\n",
	" <td>White</td>\n",
	" <td>Male</td>\n",
	" <td>2174</td>\n",
	" <td>0</td>\n",
	" <td>40</td>\n",
	" <td>United-States</td>\n",
	" <td><=50K</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>50</td>\n",
	" <td>Self-emp-not-inc</td>\n",
	" <td>83311</td>\n",
	" <td>Bachelors</td>\n",
	" <td>13</td>\n",
	" <td>Married-civ-spouse</td>\n",
	" <td>Exec-managerial</td>\n",
	" <td>Husband</td>\n",
	" <td>White</td>\n",
	" <td>Male</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>13</td>\n",
	" <td>United-States</td>\n",
	" <td><=50K</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>38</td>\n",
	" <td>Private</td>\n",
	" <td>215646</td>\n",
	" <td>HS-grad</td>\n",
	" <td>9</td>\n",
	" <td>Divorced</td>\n",
	" <td>Handlers-cleaners</td>\n",
	" <td>Not-in-family</td>\n",
	" <td>White</td>\n",
	" <td>Male</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>40</td>\n",
	" <td>United-States</td>\n",
	" <td><=50K</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>53</td>\n",
	" <td>Private</td>\n",
	" <td>234721</td>\n",
	" <td>11th</td>\n",
	" <td>7</td>\n",
	" <td>Married-civ-spouse</td>\n",
	" <td>Handlers-cleaners</td>\n",
	" <td>Husband</td>\n",
	" <td>Black</td>\n",
	" <td>Male</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>40</td>\n",
	" <td>United-States</td>\n",
	" <td><=50K</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>28</td>\n",
	" <td>Private</td>\n",
	" <td>338409</td>\n",
	" <td>Bachelors</td>\n",
	" <td>13</td>\n",
	" <td>Married-civ-spouse</td>\n",
	" <td>Prof-specialty</td>\n",
	" <td>Wife</td>\n",
	" <td>Black</td>\n",
	" <td>Female</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>40</td>\n",
	" <td>Cuba</td>\n",
	" <td><=50K</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" age workclass fnlwgt education education_num \\\n",
	"0 39 State-gov 77516 Bachelors 13 \n",
	"1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
	"2 38 Private 215646 HS-grad 9 \n",
	"3 53 Private 234721 11th 7 \n",
	"4 28 Private 338409 Bachelors 13 \n",
	"\n",
	" marital_status occupation relationship race sex \\\n",
	"0 Never-married Adm-clerical Not-in-family White Male \n",
	"1 Married-civ-spouse Exec-managerial Husband White Male \n",
	"2 Divorced Handlers-cleaners Not-in-family White Male \n",
	"3 Married-civ-spouse Handlers-cleaners Husband Black Male \n",
	"4 Married-civ-spouse Prof-specialty Wife Black Female \n",
	"\n",
	" capital_gain capital_loss hours_per_week native_country label \n",
	"0 2174 0 40 United-States <=50K \n",
	"1 0 0 13 United-States <=50K \n",
	"2 0 0 40 United-States <=50K \n",
	"3 0 0 40 United-States <=50K \n",
	"4 0 0 40 Cuba <=50K "
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"data_df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# The CSV is read without a header row, so name the 15 columns explicitly (file order).\n",
	"data_df.columns = [\n",
	"    'age',\n",
	"    'workclass',\n",
	"    'fnlwgt',\n",
	"    'education',\n",
	"    'education_num',\n",
	"    'marital_status',\n",
	"    'occupation',\n",
	"    'relationship',\n",
	"    'race',\n",
	"    'sex',\n",
	"    'capital_gain',\n",
	"    'capital_loss',\n",
	"    'hours_per_week',\n",
	"    'native_country',\n",
	"    'label',\n",
	"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Non-numeric columns to one-hot encode. 'label' is listed too, so the target\n",
	"# is turned into an indicator column along with the features.\n",
	"categorical_features = [\n",
	"    'workclass',\n",
	"    'education',\n",
	"    'marital_status',\n",
	"    'occupation',\n",
	"    'relationship',\n",
	"    'race',\n",
	"    'sex',\n",
	"    'native_country',\n",
	"    'label',\n",
	"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# One-hot encode the categorical columns. drop_first=True omits the first level of\n",
	"# each feature, so a feature with k levels yields k-1 indicator columns.\n",
	"data_df_dum = pd.get_dummies(data_df, columns=categorical_features, drop_first=True)"
	]
	},
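	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Use pandas' `get_dummies` to one-hot encode the categorical features so they can be used by the classification models. Because `drop_first` is enabled, the first level of each feature is dropped; the binary `label` column therefore becomes the single indicator column `label_ >50K`."
	]
	},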
	{
	"cell_type": "code",
	"execution_count": 92,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>age</th>\n",
	" <th>fnlwgt</th>\n",
	" <th>education_num</th>\n",
	" <th>capital_gain</th>\n",
	" <th>capital_loss</th>\n",
	" <th>hours_per_week</th>\n",
	" <th>workclass_ Federal-gov</th>\n",
	" <th>workclass_ Local-gov</th>\n",
	" <th>workclass_ Never-worked</th>\n",
	" <th>workclass_ Private</th>\n",
	" <th>...</th>\n",
	" <th>native_country_ Puerto-Rico</th>\n",
	" <th>native_country_ Scotland</th>\n",
	" <th>native_country_ South</th>\n",
	" <th>native_country_ Taiwan</th>\n",
	" <th>native_country_ Thailand</th>\n",
	" <th>native_country_ Trinadad&Tobago</th>\n",
	" <th>native_country_ United-States</th>\n",
	" <th>native_country_ Vietnam</th>\n",
	" <th>native_country_ Yugoslavia</th>\n",
	" <th>label_ >50K</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>39</td>\n",
	" <td>77516</td>\n",
	" <td>13</td>\n",
	" <td>2174</td>\n",
	" <td>0</td>\n",
	" <td>40</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>...</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>1.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>50</td>\n",
	" <td>83311</td>\n",
	" <td>13</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>13</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>...</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>1.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>38</td>\n",
	" <td>215646</td>\n",
	" <td>9</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>40</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>1.0</td>\n",
	" <td>...</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>1.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>53</td>\n",
	" <td>234721</td>\n",
	" <td>7</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>40</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>1.0</td>\n",
	" <td>...</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>1.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>28</td>\n",
	" <td>338409</td>\n",
	" <td>13</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>40</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>1.0</td>\n",
	" <td>...</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>5 rows × 101 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" age fnlwgt education_num capital_gain capital_loss hours_per_week \\\n",
	"0 39 77516 13 2174 0 40 \n",
	"1 50 83311 13 0 0 13 \n",
	"2 38 215646 9 0 0 40 \n",
	"3 53 234721 7 0 0 40 \n",
	"4 28 338409 13 0 0 40 \n",
	"\n",
	" workclass_ Federal-gov workclass_ Local-gov workclass_ Never-worked \\\n",
	"0 0.0 0.0 0.0 \n",
	"1 0.0 0.0 0.0 \n",
	"2 0.0 0.0 0.0 \n",
	"3 0.0 0.0 0.0 \n",
	"4 0.0 0.0 0.0 \n",
	"\n",
	" workclass_ Private ... native_country_ Puerto-Rico \\\n",
	"0 0.0 ... 0.0 \n",
	"1 0.0 ... 0.0 \n",
	"2 1.0 ... 0.0 \n",
	"3 1.0 ... 0.0 \n",
	"4 1.0 ... 0.0 \n",
	"\n",
	" native_country_ Scotland native_country_ South native_country_ Taiwan \\\n",
	"0 0.0 0.0 0.0 \n",
	"1 0.0 0.0 0.0 \n",
	"2 0.0 0.0 0.0 \n",
	"3 0.0 0.0 0.0 \n",
	"4 0.0 0.0 0.0 \n",
	"\n",
	" native_country_ Thailand native_country_ Trinadad&Tobago \\\n",
	"0 0.0 0.0 \n",
	"1 0.0 0.0 \n",
	"2 0.0 0.0 \n",
	"3 0.0 0.0 \n",
	"4 0.0 0.0 \n",
	"\n",
	" native_country_ United-States native_country_ Vietnam \\\n",
	"0 1.0 0.0 \n",
	"1 1.0 0.0 \n",
	"2 1.0 0.0 \n",
	"3 1.0 0.0 \n",
	"4 0.0 0.0 \n",
	"\n",
	" native_country_ Yugoslavia label_ >50K \n",
	"0 0.0 0.0 \n",
	"1 0.0 0.0 \n",
	"2 0.0 0.0 \n",
	"3 0.0 0.0 \n",
	"4 0.0 0.0 \n",
	"\n",
	"[5 rows x 101 columns]"
	]
	},
	"execution_count": 92,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"data_df_dum.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from sklearn.model_selection import train_test_split\n",
	"from sklearn.ensemble import GradientBoostingClassifier\n",
	"from sklearn.ensemble import RandomForestClassifier\n",
	"from sklearn.metrics import accuracy_score\n",
	"from sklearn.metrics import confusion_matrix"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Target: the indicator column that get_dummies derived from `label`.\n",
	"label_column = 'label_ >50K'\n",
	"\n",
	"# Features: every column except the target. Selecting by name avoids depending on the\n",
	"# target sitting at position 100, and `.drop(...)` / `.values` replace the deprecated\n",
	"# `.ix` / `.as_matrix()` accessors (removed in pandas 1.0) while working on old versions too.\n",
	"X = data_df_dum.drop(label_column, axis=1).values\n",
	"Y = data_df_dum[label_column].values"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.\n",
	"X_train, X_test, y_train, y_test = train_test_split(\n",
	"    X,\n",
	"    Y,\n",
	"    test_size=0.33,\n",
	"    random_state=5,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 117,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Both ensembles are stochastic (row subsampling for boosting, bootstrapping for the\n",
	"# forest), so seed them: otherwise the error rates change on every fresh run even\n",
	"# though the train/test split itself is seeded.\n",
	"clf_1 = GradientBoostingClassifier(learning_rate=0.009, n_estimators=1200, subsample=0.95,\n",
	"                                   max_depth=6, verbose=1, random_state=5)\n",
	"clf_2 = RandomForestClassifier(n_estimators=500, verbose=1, random_state=5)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 118,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" Iter Train Loss OOB Improve Remaining Time \n",
	" 1 1.0966 0.0077 3.39m\n",
	" 2 1.0883 0.0079 3.42m\n",
	" 3 1.0783 0.0077 3.36m\n",
	" 4 1.0722 0.0074 3.37m\n",
	" 5 1.0652 0.0077 3.34m\n",
	" 6 1.0585 0.0073 3.35m\n",
	" 7 1.0519 0.0064 3.32m\n",
	" 8 1.0445 0.0065 3.32m\n",
	" 9 1.0394 0.0061 3.31m\n",
	" 10 1.0330 0.0058 3.31m\n",
	" 20 0.9723 0.0046 3.27m\n",
	" 30 0.9252 0.0039 3.23m\n",
	" 40 0.8870 0.0035 3.21m\n",
	" 50 0.8493 0.0032 3.18m\n",
	" 60 0.8220 0.0026 3.15m\n",
	" 70 0.7944 0.0026 3.12m\n",
	" 80 0.7741 0.0020 3.09m\n",
	" 90 0.7561 0.0018 3.06m\n",
	" 100 0.7357 0.0016 3.04m\n",
	" 200 0.6282 0.0007 2.79m\n",
	" 300 0.5797 0.0002 2.50m\n",
	" 400 0.5538 0.0002 2.18m\n",
	" 500 0.5334 0.0001 1.88m\n",
	" 600 0.5174 0.0001 1.58m\n",
	" 700 0.5040 0.0000 1.30m\n",
	" 800 0.4923 -0.0000 1.02m\n",
	" 900 0.4838 -0.0001 45.49s\n",
	" 1000 0.4747 -0.0000 29.95s\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
	" learning_rate=0.009, loss='deviance', max_depth=6,\n",
	" max_features=None, max_leaf_nodes=None,\n",
	" min_impurity_split=1e-07, min_samples_leaf=1,\n",
	" min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
	" n_estimators=1200, presort='auto', random_state=None,\n",
	" subsample=0.95, verbose=1, warm_start=False)"
	]
	},
	"execution_count": 118,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"clf_1.fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 119,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The error rate on the hold out set is : 12.58 %\n"
	]
	}
	],
	"source": [
	"# Hold-out error rate in percent: 100 minus the percentage accuracy, rounded to 2 places.\n",
	"gb_test_accuracy = accuracy_score(y_test, clf_1.predict(X_test))\n",
	"gb_error_pct = round(100 - 100 * gb_test_accuracy, 2)\n",
	"print('The error rate on the hold out set is :', gb_error_pct, '%')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 83,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"[Parallel(n_jobs=1)]: Done 500 out of 500 \| elapsed: 8.6s finished\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
	" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
	" min_impurity_split=1e-07, min_samples_leaf=1,\n",
	" min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
	" n_estimators=500, n_jobs=1, oob_score=False, random_state=None,\n",
	" verbose=1, warm_start=False)"
	]
	},
	"execution_count": 83,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"clf_2.fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 84,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The error rate on the hold out set is : 14.41 %\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"[Parallel(n_jobs=1)]: Done 500 out of 500 \| elapsed: 0.8s finished\n"
	]
	}
	],
	"source": [
	"# Hold-out error rate in percent: 100 minus the percentage accuracy, rounded to 2 places.\n",
	"rf_test_accuracy = accuracy_score(y_test, clf_2.predict(X_test))\n",
	"rf_error_pct = round(100 - 100 * rf_test_accuracy, 2)\n",
	"print('The error rate on the hold out set is :', rf_error_pct, '%')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 120,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Confusion matrix of the gradient boosting model on the hold-out set,\n",
	"# labelled so rows read as actual classes and columns as predicted classes.\n",
	"confusion_grad = pd.DataFrame(\n",
	"    confusion_matrix(y_test, clf_1.predict(X_test)),\n",
	"    index=['Actual: Under 50K', 'Actual: Over 50K'],\n",
	"    columns=['Predicted: Under 50K', 'Predicted: Over 50K'],\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 121,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Predicted: Under 50K</th>\n",
	" <th>Predicted: Over 50K</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>Actual: Under 50K</th>\n",
	" <td>7736</td>\n",
	" <td>421</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Actual: Over 50K</th>\n",
	" <td>931</td>\n",
	" <td>1658</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Predicted: Under 50K Predicted: Over 50K\n",
	"Actual: Under 50K 7736 421\n",
	"Actual: Over 50K 931 1658"
	]
	},
	"execution_count": 121,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"confusion_grad"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 85,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"[Parallel(n_jobs=1)]: Done 500 out of 500 \| elapsed: 0.8s finished\n"
	]
	}
	],
	"source": [
	"# Confusion matrix of the random forest model on the hold-out set,\n",
	"# labelled so rows read as actual classes and columns as predicted classes.\n",
	"confusion_rf = pd.DataFrame(\n",
	"    confusion_matrix(y_test, clf_2.predict(X_test)),\n",
	"    index=['Actual: Under 50K', 'Actual: Over 50K'],\n",
	"    columns=['Predicted: Under 50K', 'Predicted: Over 50K'],\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 86,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Predicted: Under 50K</th>\n",
	" <th>Predicted: Over 50K</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>Actual: Under 50K</th>\n",
	" <td>7596</td>\n",
	" <td>561</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Actual: Over 50K</th>\n",
	" <td>988</td>\n",
	" <td>1601</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Predicted: Under 50K Predicted: Over 50K\n",
	"Actual: Under 50K 7596 561\n",
	"Actual: Over 50K 988 1601"
	]
	},
	"execution_count": 86,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"confusion_rf"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"anaconda-cloud": {},
	"kernelspec": {
	"display_name": "Python [conda root]",
	"language": "python",
	"name": "conda-root-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}
No results found