Last active
January 24, 2017 05:30
-
-
Save awajundesu/6a3180c611246ee7467fdf07897b1596 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "import re\n", | |
| "import pandas as pd\n", | |
| "import pandas_td as td\n", | |
| "import numpy as np\n", | |
| "\n", | |
| "con = td.connect(apikey=os.environ['TD_API_KEY'], endpoint='https://api.treasuredata.com')\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "presto = td.create_engine('presto:awa_lead_scoring', con=con)\n", | |
| "hive = td.create_engine('hive:awa_lead_scoring', con=con)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df = td.read_td_table('lead_scoring',presto,limit=100000)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "query = '''\n", | |
| "WITH jobtitle AS(\n", | |
| " SELECT\n", | |
| " *,\n", | |
| " case\n", | |
| " when LOWER(job_title) LIKE '%manager%' then 1\n", | |
| " when LOWER(job_title) LIKE '%director%' then 1\n", | |
| " when LOWER(job_title) LIKE '%supervisor%' then 1\n", | |
| " else 0\n", | |
| " end as is_manager2\n", | |
| " FROM\n", | |
| " lead_scoring \n", | |
| "),\n", | |
| "quantified AS(\n", | |
| " SELECT\n", | |
| " quantify(true,converted,days_since_signup,completed_form,visited_pricing,registered_for_webinar,attended_webinar,is_manager2,acquisition_channel,company_size,industry)\n", | |
| " as (converted,days_since_signup,completed_form,visited_pricing,registered_for_webinar,attended_webinar,is_manager2,acquisition_channel,company_size,industry)\n", | |
| " FROM\n", | |
| " jobtitle\n", | |
| ")\n", | |
| "SELECT\n", | |
| " rowid() as rowid,\n", | |
| " converted as label,\n", | |
| " array(acquisition_channel,company_size,industry,is_manager2,days_since_signup,completed_form,visited_pricing,registered_for_webinar,attended_webinar)\n", | |
| " as features\n", | |
| "FROM\n", | |
| " quantified\n", | |
| "'''" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "td_quantified = td.read_td(query, hive)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>rowid</th>\n", | |
| " <th>label</th>\n", | |
| " <th>features</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1-1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>[0, 0, 0, 1, 8, 1, 1, 0, 0]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1-2</td>\n", | |
| " <td>0</td>\n", | |
| " <td>[1, 1, 0, 1, 2, 1, 1, 0, 0]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1-3</td>\n", | |
| " <td>0</td>\n", | |
| " <td>[1, 2, 1, 1, 28, 1, 0, 0, 0]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1-4</td>\n", | |
| " <td>0</td>\n", | |
| " <td>[0, 1, 0, 1, 20, 1, 1, 0, 0]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>1-5</td>\n", | |
| " <td>1</td>\n", | |
| " <td>[1, 2, 0, 1, 12, 1, 1, 0, 0]</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " rowid label features\n", | |
| "0 1-1 0 [0, 0, 0, 1, 8, 1, 1, 0, 0]\n", | |
| "1 1-2 0 [1, 1, 0, 1, 2, 1, 1, 0, 0]\n", | |
| "2 1-3 0 [1, 2, 1, 1, 28, 1, 0, 0, 0]\n", | |
| "3 1-4 0 [0, 1, 0, 1, 20, 1, 1, 0, 0]\n", | |
| "4 1-5 1 [1, 2, 0, 1, 12, 1, 1, 0, 0]" | |
| ] | |
| }, | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "td_quantified.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "td.to_td(td_quantified,'awa_lead_scoring.featured',con,if_exists='replace',index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 53, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "query2 = '''\n", | |
| "SELECT\n", | |
| " train_randomforest_classifier(features,label,'-attrs C,C,C,C,Q,C,C,C,C -trees 50')\n", | |
| " as (model_id,model_type,pred_model,var_importance,oob_errors,oob_tests)\n", | |
| "FROM\n", | |
| " featured\n", | |
| "'''" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 54, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "td_model = td.read_td(query2, hive)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 55, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>model_id</th>\n", | |
| " <th>model_type</th>\n", | |
| " <th>pred_model</th>\n", | |
| " <th>var_importance</th>\n", | |
| " <th>oob_errors</th>\n", | |
| " <th>oob_tests</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>ee167fbf-0473-4242-a4a6-b92ed22c4501</td>\n", | |
| " <td>-3</td>\n", | |
| " <td>I?8Q|t`*!J3d;N-[6r0ceOd{cMUu;Y</szdb*/Ot^rOr{`...</td>\n", | |
| " <td>[7.5508191889997835, 2.2270526498149352, 3.617...</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>d26891fd-318d-4fd5-ac61-458a95657f94</td>\n", | |
| " <td>-3</td>\n", | |
| " <td>I?Q^k2_YnKC;Z/x#B!=uAe6%K6nBP7~O?Pw>]c&3Jm/q)^...</td>\n", | |
| " <td>[1.3113520211354033, 4.716675080561591, 2.5454...</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>12959a81-1480-4e99-9071-f4cf9444d402</td>\n", | |
| " <td>-3</td>\n", | |
| " <td>I?GSl2:`eJ#r;yE`^E=Fyw6~(fscDY966FzFl;PI2(#t:X...</td>\n", | |
| " <td>[6.929184772569904, 4.883177786338777, 4.32007...</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>42c05f15-5844-4a5d-9534-009bda94d382</td>\n", | |
| " <td>-3</td>\n", | |
| " <td>I?8QA16+!J3d?3B_hRwB-~:iFDA+ziSWCH0pGo$$c2_F6N...</td>\n", | |
| " <td>[6.791954188778515, 4.564584380045808, 5.53491...</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>8640c6b1-9d48-4b1a-b0c7-a43b5a0915e1</td>\n", | |
| " <td>-3</td>\n", | |
| " <td>I?Q^~0zenK4/va<UFB2/S9a~=uiqrg6sk[QqgDSL<V&|6y...</td>\n", | |
| " <td>[3.6549179812841817, 7.925584410539383, 3.6769...</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " model_id model_type \\\n", | |
| "0 ee167fbf-0473-4242-a4a6-b92ed22c4501 -3 \n", | |
| "1 d26891fd-318d-4fd5-ac61-458a95657f94 -3 \n", | |
| "2 12959a81-1480-4e99-9071-f4cf9444d402 -3 \n", | |
| "3 42c05f15-5844-4a5d-9534-009bda94d382 -3 \n", | |
| "4 8640c6b1-9d48-4b1a-b0c7-a43b5a0915e1 -3 \n", | |
| "\n", | |
| " pred_model \\\n", | |
| "0 I?8Q|t`*!J3d;N-[6r0ceOd{cMUu;Y</szdb*/Ot^rOr{`... \n", | |
| "1 I?Q^k2_YnKC;Z/x#B!=uAe6%K6nBP7~O?Pw>]c&3Jm/q)^... \n", | |
| "2 I?GSl2:`eJ#r;yE`^E=Fyw6~(fscDY966FzFl;PI2(#t:X... \n", | |
| "3 I?8QA16+!J3d?3B_hRwB-~:iFDA+ziSWCH0pGo$$c2_F6N... \n", | |
| "4 I?Q^~0zenK4/va<UFB2/S9a~=uiqrg6sk[QqgDSL<V&|6y... \n", | |
| "\n", | |
| " var_importance oob_errors oob_tests \n", | |
| "0 [7.5508191889997835, 2.2270526498149352, 3.617... 0 0 \n", | |
| "1 [1.3113520211354033, 4.716675080561591, 2.5454... 0 0 \n", | |
| "2 [6.929184772569904, 4.883177786338777, 4.32007... 0 0 \n", | |
| "3 [6.791954188778515, 4.564584380045808, 5.53491... 0 0 \n", | |
| "4 [3.6549179812841817, 7.925584410539383, 3.6769... 0 0 " | |
| ] | |
| }, | |
| "execution_count": 55, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "td_model.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 56, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "td.to_td(td_model,'awa_lead_scoring.model',con,if_exists='replace',index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 57, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "query3 = '''\n", | |
| "WITH t1 AS(\n", | |
| " SELECT\n", | |
| " t.rowid,\n", | |
| " tree_predict(p.model_id,p.model_type,p.pred_model,t.features,true) as predicted\n", | |
| " FROM\n", | |
| " model as p\n", | |
| " LEFT OUTER JOIN featured as t\n", | |
| "),\n", | |
| "t2 AS(\n", | |
| " SELECT\n", | |
| " rowid,\n", | |
| " rf_ensemble(predicted) as predicted\n", | |
| " FROM\n", | |
| " t1\n", | |
| " GROUP BY\n", | |
| " rowid\n", | |
| ")\n", | |
| "SELECT\n", | |
| " rowid,\n", | |
| " predicted.label, predicted.probability, predicted.probabilities\n", | |
| "FROM\n", | |
| " t2\n", | |
| "'''" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 58, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "td_predicted = td.read_td(query3, hive)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 59, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>rowid</th>\n", | |
| " <th>label</th>\n", | |
| " <th>probability</th>\n", | |
| " <th>probabilities</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1-1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1.00</td>\n", | |
| " <td>[1.0, 0.0]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1-10</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0.98</td>\n", | |
| " <td>[0.98, 0.02]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1-100</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0.88</td>\n", | |
| " <td>[0.88, 0.12]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1-1000</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1.00</td>\n", | |
| " <td>[1.0, 0.0]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>1-10000</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1.00</td>\n", | |
| " <td>[1.0, 0.0]</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " rowid label probability probabilities\n", | |
| "0 1-1 0 1.00 [1.0, 0.0]\n", | |
| "1 1-10 0 0.98 [0.98, 0.02]\n", | |
| "2 1-100 0 0.88 [0.88, 0.12]\n", | |
| "3 1-1000 0 1.00 [1.0, 0.0]\n", | |
| "4 1-10000 0 1.00 [1.0, 0.0]" | |
| ] | |
| }, | |
| "execution_count": 59, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "td_predicted.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 60, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "td.to_td(td_predicted,'awa_lead_scoring.predicted',con,if_exists='replace',index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 61, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "query4 = '''\n", | |
| "WITH t1 AS(\n", | |
| " SELECT\n", | |
| " rowid,\n", | |
| " label,\n", | |
| " probabilities[2] as cv_probability\n", | |
| " FROM\n", | |
| " predicted\n", | |
| ")\n", | |
| "SELECT\n", | |
| " t.rowid,\n", | |
| " f.label as actual,\n", | |
| " t.label as predicted,\n", | |
| " t.cv_probability,\n", | |
| " case\n", | |
| " when (t.cv_probability < 0.2) then 'F'\n", | |
| " when (t.cv_probability < 0.4 AND t.cv_probability >= 0.2) then 'D'\n", | |
| " when (t.cv_probability < 0.6 AND t.cv_probability >= 0.4) then 'C'\n", | |
| " when (t.cv_probability < 0.8 AND t.cv_probability >= 0.6) then 'B'\n", | |
| " else 'A'\n", | |
| " end as grade\n", | |
| "FROM\n", | |
| " t1 as t\n", | |
| " LEFT OUTER JOIN featured as f ON (t.rowid = f.rowid)\n", | |
| "\n", | |
| "'''" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 62, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "td_grade = td.read_td(query4, presto)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 63, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>rowid</th>\n", | |
| " <th>actual</th>\n", | |
| " <th>predicted</th>\n", | |
| " <th>cv_probability</th>\n", | |
| " <th>grade</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1-63999</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>F</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1-64</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0.04</td>\n", | |
| " <td>F</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1-640</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0.02</td>\n", | |
| " <td>F</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1-6400</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1.00</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>1-64000</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>F</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " rowid actual predicted cv_probability grade\n", | |
| "0 1-63999 0 0 0.00 F\n", | |
| "1 1-64 0 0 0.04 F\n", | |
| "2 1-640 1 0 0.02 F\n", | |
| "3 1-6400 1 1 1.00 A\n", | |
| "4 1-64000 0 0 0.00 F" | |
| ] | |
| }, | |
| "execution_count": 63, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "td_grade.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 64, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "td.to_td(td_grade,'awa_lead_scoring.grade',con,if_exists='replace',index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 65, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "query5 = '''\n", | |
| "SELECT\n", | |
| " grade,\n", | |
| " count(*)\n", | |
| "FROM\n", | |
| " grade\n", | |
| "GROUP BY\n", | |
| " grade\n", | |
| "ORDER BY\n", | |
| " grade\n", | |
| "'''" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 66, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>grade</th>\n", | |
| " <th>_col1</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>A</td>\n", | |
| " <td>10458</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>B</td>\n", | |
| " <td>1962</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>C</td>\n", | |
| " <td>2085</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>D</td>\n", | |
| " <td>4578</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>F</td>\n", | |
| " <td>80917</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " grade _col1\n", | |
| "0 A 10458\n", | |
| "1 B 1962\n", | |
| "2 C 2085\n", | |
| "3 D 4578\n", | |
| "4 F 80917" | |
| ] | |
| }, | |
| "execution_count": 66, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "td.read_td(query5, presto)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 51, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# python\n", | |
| "df['is_manager'] = df.job_title.str.contains(\"manager|director|supervisor\", flags=re.IGNORECASE)\n", | |
| "\n", | |
| "\n", | |
| "# dummify our categorical columns (acquisition_channel, company_size, industry)\n", | |
| "dummies = pd.get_dummies(df.acquisition_channel, prefix=\"acquisition_channel=\")\n", | |
| "df[dummies.columns] = dummies\n", | |
| "\n", | |
| "dummies = pd.get_dummies(df.company_size, prefix=\"company_size=\")\n", | |
| "df[dummies.columns] = dummies\n", | |
| "\n", | |
| "dummies = pd.get_dummies(df.industry, prefix=\"industry=\")\n", | |
| "df[dummies.columns] = dummies\n", | |
| "\n", | |
| "features = [\"is_manager\",\n", | |
| " \"days_since_signup\",\n", | |
| " \"completed_form\",\n", | |
| " \"visited_pricing\",\n", | |
| " \"registered_for_webinar\",\n", | |
| " \"attended_webinar\",\n", | |
| " \"acquisition_channel=_Cold Call\",\n", | |
| " \"acquisition_channel=_Cold Email\",\n", | |
| " \"acquisition_channel=_Organic Search\",\n", | |
| " \"acquisition_channel=_Paid Leads\",\n", | |
| " # \"acquisition_channel=_Paid Search\",\n", | |
| " # \"company_size=_1-10\",\n", | |
| " \"company_size=_1000-10000\",\n", | |
| " \"company_size=_10001+\",\n", | |
| " \"company_size=_101-250\",\n", | |
| " \"company_size=_11-50\",\n", | |
| " \"company_size=_251-1000\",\n", | |
| " \"company_size=_51-100\",\n", | |
| " \"industry=_Financial Services\",\n", | |
| " \"industry=_Furniture\",\n", | |
| " \"industry=_Heavy Manufacturing\",\n", | |
| " \"industry=_Scandanavion Design\",\n", | |
| " # \"industry=_Transportation\",\n", | |
| " \"industry=_Web & Internet\"\n", | |
| "]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 52, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " index grade\n", | |
| "1 A 10938\n", | |
| "4 B 3219\n", | |
| "3 C 6337\n", | |
| "2 D 9727\n", | |
| "0 F 69779\n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/Users/tduser/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:12: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "#scikit-learn\n", | |
| "from sklearn.ensemble import RandomForestClassifier\n", | |
| "\n", | |
| "rf = RandomForestClassifier(n_estimators = 50)\n", | |
| "rf.fit(df[features], df.converted)\n", | |
| "\n", | |
| "probs = pd.Series(rf.predict_proba(df[features])[:,1])\n", | |
| "\n", | |
| "df['grade'] = pd.cut(probs, 5, labels=[\"F\",\"D\",\"C\",\"B\",\"A\"])\n", | |
| "\n", | |
| "lead_quality = df['grade'].value_counts()\n", | |
| "lead_quality = lead_quality.reset_index().sort(\"index\", ascending=False)\n", | |
| "print(lead_quality)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 77, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "query7 = '''\n", | |
| "SELECT\n", | |
| " count(1) / 100000\n", | |
| "FROM\n", | |
| " grade\n", | |
| "WHERE\n", | |
| " actual = predicted\n", | |
| "'''" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 78, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>_c0</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>0.86873</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " _c0\n", | |
| "0 0.86873" | |
| ] | |
| }, | |
| "execution_count": 78, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "td.read_td(query7, hive)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 76, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0.89812000000000003" | |
| ] | |
| }, | |
| "execution_count": 76, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "pred = rf.predict(df[features])\n", | |
| "\n", | |
| "sum((pred == df.converted)) / pred.size" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 121, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "query8 = '''\n", | |
| "SELECT\n", | |
| " train_randomforest_classifier(features,label,'-attrs C,C,C,C,Q,C,C,C,C -rule GINI -trees 50 ')\n", | |
| " as (model_id,model_type,pred_model,var_importance,oob_errors,oob_tests)\n", | |
| "FROM\n", | |
| " featured\n", | |
| "'''" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 109, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "td_model20 = td.read_td(query8, hive)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 110, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "td.to_td(td_model20,'awa_lead_scoring.model20',con,if_exists='replace',index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 111, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "query9 = '''\n", | |
| "WITH t1 AS(\n", | |
| " SELECT\n", | |
| " t.rowid,\n", | |
| " tree_predict(p.model_id,p.model_type,p.pred_model,t.features,true) as predicted\n", | |
| " FROM\n", | |
| " model20 as p\n", | |
| " LEFT OUTER JOIN featured as t\n", | |
| "),\n", | |
| "t2 AS(\n", | |
| " SELECT\n", | |
| " rowid,\n", | |
| " rf_ensemble(predicted) as predicted\n", | |
| " FROM\n", | |
| " t1\n", | |
| " GROUP BY\n", | |
| " rowid\n", | |
| ")\n", | |
| "SELECT\n", | |
| " rowid,\n", | |
| " predicted.label, predicted.probability, predicted.probabilities\n", | |
| "FROM\n", | |
| " t2\n", | |
| "'''" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 112, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "td_predicted20 = td.read_td(query9, hive)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 113, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "td.to_td(td_predicted20,'awa_lead_scoring.predicted20',con,if_exists='replace',index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 114, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "query10 = '''\n", | |
| "WITH t1 AS(\n", | |
| " SELECT\n", | |
| " rowid,\n", | |
| " label,\n", | |
| " probabilities[2] as cv_probability\n", | |
| " FROM\n", | |
| " predicted20\n", | |
| ")\n", | |
| "SELECT\n", | |
| " t.rowid,\n", | |
| " f.label as actual,\n", | |
| " t.label as predicted,\n", | |
| " t.cv_probability,\n", | |
| " case\n", | |
| " when (t.cv_probability < 0.2) then 'F'\n", | |
| " when (t.cv_probability < 0.4 AND t.cv_probability >= 0.2) then 'D'\n", | |
| " when (t.cv_probability < 0.6 AND t.cv_probability >= 0.4) then 'C'\n", | |
| " when (t.cv_probability < 0.8 AND t.cv_probability >= 0.6) then 'B'\n", | |
| " else 'A'\n", | |
| " end as grade\n", | |
| "FROM\n", | |
| " t1 as t\n", | |
| " LEFT OUTER JOIN featured as f ON (t.rowid = f.rowid)\n", | |
| "\n", | |
| "'''" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 115, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "td_grade20 = td.read_td(query10, presto)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 116, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "td.to_td(td_grade20,'awa_lead_scoring.grade20',con,if_exists='replace',index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 117, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "query11 = '''\n", | |
| "SELECT\n", | |
| " grade,\n", | |
| " count(*)\n", | |
| "FROM\n", | |
| " grade20\n", | |
| "GROUP BY\n", | |
| " grade\n", | |
| "ORDER BY\n", | |
| " grade\n", | |
| "'''" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 118, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "query12 = '''\n", | |
| "SELECT\n", | |
| " count(1) / 100000\n", | |
| "FROM\n", | |
| " grade20\n", | |
| "WHERE\n", | |
| " actual = predicted\n", | |
| "'''" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 119, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>grade</th>\n", | |
| " <th>_col1</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>A</td>\n", | |
| " <td>12035</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>B</td>\n", | |
| " <td>3992</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>C</td>\n", | |
| " <td>3747</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>D</td>\n", | |
| " <td>5744</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>F</td>\n", | |
| " <td>74482</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " grade _col1\n", | |
| "0 A 12035\n", | |
| "1 B 3992\n", | |
| "2 C 3747\n", | |
| "3 D 5744\n", | |
| "4 F 74482" | |
| ] | |
| }, | |
| "execution_count": 119, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "td.read_td(query11, presto)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 120, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>_c0</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>0.85616</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " _c0\n", | |
| "0 0.85616" | |
| ] | |
| }, | |
| "execution_count": 120, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "td.read_td(query12, hive)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "anaconda-cloud": {}, | |
| "kernelspec": { | |
| "display_name": "Python [conda root]", | |
| "language": "python", | |
| "name": "conda-root-py" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 1 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment