Skip to content

Instantly share code, notes, and snippets.

@awajundesu
Last active January 24, 2017 05:30
Show Gist options
  • Select an option

  • Save awajundesu/6a3180c611246ee7467fdf07897b1596 to your computer and use it in GitHub Desktop.

Select an option

Save awajundesu/6a3180c611246ee7467fdf07897b1596 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"import pandas as pd\n",
"import pandas_td as td\n",
"import numpy as np\n",
"\n",
"con = td.connect(apikey=os.environ['TD_API_KEY'], endpoint='https://api.treasuredata.com')\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"presto = td.create_engine('presto:awa_lead_scoring', con=con)\n",
"hive = td.create_engine('hive:awa_lead_scoring', con=con)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = td.read_td_table('lead_scoring',presto,limit=100000)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"query = '''\n",
"WITH jobtitle AS(\n",
" SELECT\n",
" *,\n",
" case\n",
" when LOWER(job_title) LIKE '%manager%' then 1\n",
" when LOWER(job_title) LIKE '%director%' then 1\n",
" when LOWER(job_title) LIKE '%supervisor%' then 1\n",
" else 0\n",
" end as is_manager2\n",
" FROM\n",
" lead_scoring \n",
"),\n",
"quantified AS(\n",
" SELECT\n",
" quantify(true,converted,days_since_signup,completed_form,visited_pricing,registered_for_webinar,attended_webinar,is_manager2,acquisition_channel,company_size,industry)\n",
" as (converted,days_since_signup,completed_form,visited_pricing,registered_for_webinar,attended_webinar,is_manager2,acquisition_channel,company_size,industry)\n",
" FROM\n",
" jobtitle\n",
")\n",
"SELECT\n",
" rowid() as rowid,\n",
" converted as label,\n",
" array(acquisition_channel,company_size,industry,is_manager2,days_since_signup,completed_form,visited_pricing,registered_for_webinar,attended_webinar)\n",
" as features\n",
"FROM\n",
" quantified\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"td_quantified = td.read_td(query, hive)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>rowid</th>\n",
" <th>label</th>\n",
" <th>features</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1-1</td>\n",
" <td>0</td>\n",
" <td>[0, 0, 0, 1, 8, 1, 1, 0, 0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1-2</td>\n",
" <td>0</td>\n",
" <td>[1, 1, 0, 1, 2, 1, 1, 0, 0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1-3</td>\n",
" <td>0</td>\n",
" <td>[1, 2, 1, 1, 28, 1, 0, 0, 0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1-4</td>\n",
" <td>0</td>\n",
" <td>[0, 1, 0, 1, 20, 1, 1, 0, 0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1-5</td>\n",
" <td>1</td>\n",
" <td>[1, 2, 0, 1, 12, 1, 1, 0, 0]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" rowid label features\n",
"0 1-1 0 [0, 0, 0, 1, 8, 1, 1, 0, 0]\n",
"1 1-2 0 [1, 1, 0, 1, 2, 1, 1, 0, 0]\n",
"2 1-3 0 [1, 2, 1, 1, 28, 1, 0, 0, 0]\n",
"3 1-4 0 [0, 1, 0, 1, 20, 1, 1, 0, 0]\n",
"4 1-5 1 [1, 2, 0, 1, 12, 1, 1, 0, 0]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"td_quantified.head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"td.to_td(td_quantified,'awa_lead_scoring.featured',con,if_exists='replace',index=False)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"query2 = '''\n",
"SELECT\n",
" train_randomforest_classifier(features,label,'-attrs C,C,C,C,Q,C,C,C,C -trees 50')\n",
" as (model_id,model_type,pred_model,var_importance,oob_errors,oob_tests)\n",
"FROM\n",
" featured\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"td_model = td.read_td(query2, hive)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>model_id</th>\n",
" <th>model_type</th>\n",
" <th>pred_model</th>\n",
" <th>var_importance</th>\n",
" <th>oob_errors</th>\n",
" <th>oob_tests</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ee167fbf-0473-4242-a4a6-b92ed22c4501</td>\n",
" <td>-3</td>\n",
" <td>I?8Q|t`*!J3d;N-[6r0ceOd{cMUu;Y&lt;/szdb*/Ot^rOr{`...</td>\n",
" <td>[7.5508191889997835, 2.2270526498149352, 3.617...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>d26891fd-318d-4fd5-ac61-458a95657f94</td>\n",
" <td>-3</td>\n",
" <td>I?Q^k2_YnKC;Z/x#B!=uAe6%K6nBP7~O?Pw&gt;]c&amp;3Jm/q)^...</td>\n",
" <td>[1.3113520211354033, 4.716675080561591, 2.5454...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>12959a81-1480-4e99-9071-f4cf9444d402</td>\n",
" <td>-3</td>\n",
" <td>I?GSl2:`eJ#r;yE`^E=Fyw6~(fscDY966FzFl;PI2(#t:X...</td>\n",
" <td>[6.929184772569904, 4.883177786338777, 4.32007...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>42c05f15-5844-4a5d-9534-009bda94d382</td>\n",
" <td>-3</td>\n",
" <td>I?8QA16+!J3d?3B_hRwB-~:iFDA+ziSWCH0pGo$$c2_F6N...</td>\n",
" <td>[6.791954188778515, 4.564584380045808, 5.53491...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8640c6b1-9d48-4b1a-b0c7-a43b5a0915e1</td>\n",
" <td>-3</td>\n",
" <td>I?Q^~0zenK4/va&lt;UFB2/S9a~=uiqrg6sk[QqgDSL&lt;V&amp;|6y...</td>\n",
" <td>[3.6549179812841817, 7.925584410539383, 3.6769...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" model_id model_type \\\n",
"0 ee167fbf-0473-4242-a4a6-b92ed22c4501 -3 \n",
"1 d26891fd-318d-4fd5-ac61-458a95657f94 -3 \n",
"2 12959a81-1480-4e99-9071-f4cf9444d402 -3 \n",
"3 42c05f15-5844-4a5d-9534-009bda94d382 -3 \n",
"4 8640c6b1-9d48-4b1a-b0c7-a43b5a0915e1 -3 \n",
"\n",
" pred_model \\\n",
"0 I?8Q|t`*!J3d;N-[6r0ceOd{cMUu;Y</szdb*/Ot^rOr{`... \n",
"1 I?Q^k2_YnKC;Z/x#B!=uAe6%K6nBP7~O?Pw>]c&3Jm/q)^... \n",
"2 I?GSl2:`eJ#r;yE`^E=Fyw6~(fscDY966FzFl;PI2(#t:X... \n",
"3 I?8QA16+!J3d?3B_hRwB-~:iFDA+ziSWCH0pGo$$c2_F6N... \n",
"4 I?Q^~0zenK4/va<UFB2/S9a~=uiqrg6sk[QqgDSL<V&|6y... \n",
"\n",
" var_importance oob_errors oob_tests \n",
"0 [7.5508191889997835, 2.2270526498149352, 3.617... 0 0 \n",
"1 [1.3113520211354033, 4.716675080561591, 2.5454... 0 0 \n",
"2 [6.929184772569904, 4.883177786338777, 4.32007... 0 0 \n",
"3 [6.791954188778515, 4.564584380045808, 5.53491... 0 0 \n",
"4 [3.6549179812841817, 7.925584410539383, 3.6769... 0 0 "
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"td_model.head()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"td.to_td(td_model,'awa_lead_scoring.model',con,if_exists='replace',index=False)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"query3 = '''\n",
"WITH t1 AS(\n",
" SELECT\n",
" t.rowid,\n",
" tree_predict(p.model_id,p.model_type,p.pred_model,t.features,true) as predicted\n",
" FROM\n",
" model as p\n",
" LEFT OUTER JOIN featured as t\n",
"),\n",
"t2 AS(\n",
" SELECT\n",
" rowid,\n",
" rf_ensemble(predicted) as predicted\n",
" FROM\n",
" t1\n",
" GROUP BY\n",
" rowid\n",
")\n",
"SELECT\n",
" rowid,\n",
" predicted.label, predicted.probability, predicted.probabilities\n",
"FROM\n",
" t2\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"td_predicted = td.read_td(query3, hive)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>rowid</th>\n",
" <th>label</th>\n",
" <th>probability</th>\n",
" <th>probabilities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1-1</td>\n",
" <td>0</td>\n",
" <td>1.00</td>\n",
" <td>[1.0, 0.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1-10</td>\n",
" <td>0</td>\n",
" <td>0.98</td>\n",
" <td>[0.98, 0.02]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1-100</td>\n",
" <td>0</td>\n",
" <td>0.88</td>\n",
" <td>[0.88, 0.12]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1-1000</td>\n",
" <td>0</td>\n",
" <td>1.00</td>\n",
" <td>[1.0, 0.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1-10000</td>\n",
" <td>0</td>\n",
" <td>1.00</td>\n",
" <td>[1.0, 0.0]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" rowid label probability probabilities\n",
"0 1-1 0 1.00 [1.0, 0.0]\n",
"1 1-10 0 0.98 [0.98, 0.02]\n",
"2 1-100 0 0.88 [0.88, 0.12]\n",
"3 1-1000 0 1.00 [1.0, 0.0]\n",
"4 1-10000 0 1.00 [1.0, 0.0]"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"td_predicted.head()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"td.to_td(td_predicted,'awa_lead_scoring.predicted',con,if_exists='replace',index=False)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"query4 = '''\n",
"WITH t1 AS(\n",
" SELECT\n",
" rowid,\n",
" label,\n",
" probabilities[2] as cv_probability\n",
" FROM\n",
" predicted\n",
")\n",
"SELECT\n",
" t.rowid,\n",
" f.label as actual,\n",
" t.label as predicted,\n",
" t.cv_probability,\n",
" case\n",
" when (t.cv_probability < 0.2) then 'F'\n",
" when (t.cv_probability < 0.4 AND t.cv_probability >= 0.2) then 'D'\n",
" when (t.cv_probability < 0.6 AND t.cv_probability >= 0.4) then 'C'\n",
" when (t.cv_probability < 0.8 AND t.cv_probability >= 0.6) then 'B'\n",
" else 'A'\n",
" end as grade\n",
"FROM\n",
" t1 as t\n",
" LEFT OUTER JOIN featured as f ON (t.rowid = f.rowid)\n",
"\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"td_grade = td.read_td(query4, presto)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>rowid</th>\n",
" <th>actual</th>\n",
" <th>predicted</th>\n",
" <th>cv_probability</th>\n",
" <th>grade</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1-63999</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1-64</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.04</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1-640</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.02</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1-6400</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.00</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1-64000</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>F</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" rowid actual predicted cv_probability grade\n",
"0 1-63999 0 0 0.00 F\n",
"1 1-64 0 0 0.04 F\n",
"2 1-640 1 0 0.02 F\n",
"3 1-6400 1 1 1.00 A\n",
"4 1-64000 0 0 0.00 F"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"td_grade.head()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"td.to_td(td_grade,'awa_lead_scoring.grade',con,if_exists='replace',index=False)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"query5 = '''\n",
"SELECT\n",
" grade,\n",
" count(*)\n",
"FROM\n",
" grade\n",
"GROUP BY\n",
" grade\n",
"ORDER BY\n",
" grade\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>grade</th>\n",
" <th>_col1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A</td>\n",
" <td>10458</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>B</td>\n",
" <td>1962</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>C</td>\n",
" <td>2085</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>D</td>\n",
" <td>4578</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>F</td>\n",
" <td>80917</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" grade _col1\n",
"0 A 10458\n",
"1 B 1962\n",
"2 C 2085\n",
"3 D 4578\n",
"4 F 80917"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"td.read_td(query5, presto)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# python\n",
"df['is_manager'] = df.job_title.str.contains(\"manager|director|supervisor\", flags=re.IGNORECASE)\n",
"\n",
"\n",
"# dummify our categorical columns (acquisition_channel, company_size, industry)\n",
"dummies = pd.get_dummies(df.acquisition_channel, prefix=\"acquisition_channel=\")\n",
"df[dummies.columns] = dummies\n",
"\n",
"dummies = pd.get_dummies(df.company_size, prefix=\"company_size=\")\n",
"df[dummies.columns] = dummies\n",
"\n",
"dummies = pd.get_dummies(df.industry, prefix=\"industry=\")\n",
"df[dummies.columns] = dummies\n",
"\n",
"features = [\"is_manager\",\n",
" \"days_since_signup\",\n",
" \"completed_form\",\n",
" \"visited_pricing\",\n",
" \"registered_for_webinar\",\n",
" \"attended_webinar\",\n",
" \"acquisition_channel=_Cold Call\",\n",
" \"acquisition_channel=_Cold Email\",\n",
" \"acquisition_channel=_Organic Search\",\n",
" \"acquisition_channel=_Paid Leads\",\n",
" # \"acquisition_channel=_Paid Search\",\n",
" # \"company_size=_1-10\",\n",
" \"company_size=_1000-10000\",\n",
" \"company_size=_10001+\",\n",
" \"company_size=_101-250\",\n",
" \"company_size=_11-50\",\n",
" \"company_size=_251-1000\",\n",
" \"company_size=_51-100\",\n",
" \"industry=_Financial Services\",\n",
" \"industry=_Furniture\",\n",
" \"industry=_Heavy Manufacturing\",\n",
" \"industry=_Scandanavion Design\",\n",
" # \"industry=_Transportation\",\n",
" \"industry=_Web & Internet\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" index grade\n",
"1 A 10938\n",
"4 B 3219\n",
"3 C 6337\n",
"2 D 9727\n",
"0 F 69779\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/tduser/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:12: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n"
]
}
],
"source": [
"#scikit-learn\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"rf = RandomForestClassifier(n_estimators = 50)\n",
"rf.fit(df[features], df.converted)\n",
"\n",
"probs = pd.Series(rf.predict_proba(df[features])[:,1])\n",
"\n",
"df['grade'] = pd.cut(probs, 5, labels=[\"F\",\"D\",\"C\",\"B\",\"A\"])\n",
"\n",
"lead_quality = df['grade'].value_counts()\n",
"lead_quality = lead_quality.reset_index().sort(\"index\", ascending=False)\n",
"print(lead_quality)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"query7 = '''\n",
"SELECT\n",
" count(1) / 100000\n",
"FROM\n",
" grade\n",
"WHERE\n",
" actual = predicted\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>_c0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.86873</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" _c0\n",
"0 0.86873"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"td.read_td(query7, hive)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.89812000000000003"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pred = rf.predict(df[features])\n",
"\n",
"sum((pred == df.converted)) / pred.size"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"query8 = '''\n",
"SELECT\n",
" train_randomforest_classifier(features,label,'-attrs C,C,C,C,Q,C,C,C,C -rule GINI -trees 50 ')\n",
" as (model_id,model_type,pred_model,var_importance,oob_errors,oob_tests)\n",
"FROM\n",
" featured\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"td_model20 = td.read_td(query8, hive)"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"td.to_td(td_model20,'awa_lead_scoring.model20',con,if_exists='replace',index=False)"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"query9 = '''\n",
"WITH t1 AS(\n",
" SELECT\n",
" t.rowid,\n",
" tree_predict(p.model_id,p.model_type,p.pred_model,t.features,true) as predicted\n",
" FROM\n",
" model20 as p\n",
" LEFT OUTER JOIN featured as t\n",
"),\n",
"t2 AS(\n",
" SELECT\n",
" rowid,\n",
" rf_ensemble(predicted) as predicted\n",
" FROM\n",
" t1\n",
" GROUP BY\n",
" rowid\n",
")\n",
"SELECT\n",
" rowid,\n",
" predicted.label, predicted.probability, predicted.probabilities\n",
"FROM\n",
" t2\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"td_predicted20 = td.read_td(query9, hive)"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"td.to_td(td_predicted20,'awa_lead_scoring.predicted20',con,if_exists='replace',index=False)"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"query10 = '''\n",
"WITH t1 AS(\n",
" SELECT\n",
" rowid,\n",
" label,\n",
" probabilities[2] as cv_probability\n",
" FROM\n",
" predicted20\n",
")\n",
"SELECT\n",
" t.rowid,\n",
" f.label as actual,\n",
" t.label as predicted,\n",
" t.cv_probability,\n",
" case\n",
" when (t.cv_probability < 0.2) then 'F'\n",
" when (t.cv_probability < 0.4 AND t.cv_probability >= 0.2) then 'D'\n",
" when (t.cv_probability < 0.6 AND t.cv_probability >= 0.4) then 'C'\n",
" when (t.cv_probability < 0.8 AND t.cv_probability >= 0.6) then 'B'\n",
" else 'A'\n",
" end as grade\n",
"FROM\n",
" t1 as t\n",
" LEFT OUTER JOIN featured as f ON (t.rowid = f.rowid)\n",
"\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"td_grade20 = td.read_td(query10, presto)"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"td.to_td(td_grade20,'awa_lead_scoring.grade20',con,if_exists='replace',index=False)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"query11 = '''\n",
"SELECT\n",
" grade,\n",
" count(*)\n",
"FROM\n",
" grade20\n",
"GROUP BY\n",
" grade\n",
"ORDER BY\n",
" grade\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"query12 = '''\n",
"SELECT\n",
" count(1) / 100000\n",
"FROM\n",
" grade20\n",
"WHERE\n",
" actual = predicted\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>grade</th>\n",
" <th>_col1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A</td>\n",
" <td>12035</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>B</td>\n",
" <td>3992</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>C</td>\n",
" <td>3747</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>D</td>\n",
" <td>5744</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>F</td>\n",
" <td>74482</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" grade _col1\n",
"0 A 12035\n",
"1 B 3992\n",
"2 C 3747\n",
"3 D 5744\n",
"4 F 74482"
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"td.read_td(query11, presto)"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>_c0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.85616</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" _c0\n",
"0 0.85616"
]
},
"execution_count": 120,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"td.read_td(query12, hive)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda root]",
"language": "python",
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment