# HW with Yelp data
# @Sanjogsharma, created July 6, 2015
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
import statsmodels.formula.api as smf
from mpl_toolkits.mplot3d import axes3d
import pandas as pd
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
# yelp.json has one JSON object per line; wrap the lines in brackets and
# join with commas to form a single valid JSON array
with open('yelp.json', 'r') as yelpdata:
    yelpdata2 = '[' + ','.join(yelpdata.readlines()) + ']'
yelp3 = pd.read_json(yelpdata2)
yelp3.head()
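# A hedged alternative (my addition; assumes a newer pandas than this 2015
# script likely used): read_json can parse newline-delimited JSON directly,
# replacing the manual join above.
yelp3 = pd.read_json('yelp.json', lines=True)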
yelp = pd.read_csv('yelp.csv')  # the CSV version of the same data
# explore the relationships between the vote counts and stars, using seaborn
sns.pairplot(yelp, x_vars=['cool', 'useful', 'funny'],
             y_vars='stars', kind='reg')
yelp.corr()  # correlation matrix (pandas, not statsmodels)
# 'cool' has a positive relationship with the number of stars;
# the other two have negative relationships
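# A small sketch (my addition): isolate just the correlations with the
# response to read the matrix more easily.
print yelp.corr()['stars']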
'''Fit a linear regression model and interpret the coefficients.
Do the coefficients make intuitive sense?
'''
feature_cols = ['cool', 'useful', 'funny']
X = yelp[feature_cols]
y = yelp.stars
linreg = LinearRegression()
linreg.fit(X, y)
print linreg.intercept_
print linreg.coef_
# cool 0.274, useful -0.147, funny -0.136
# yes, they make intuitive sense: a funnier review is more likely to be
# a negative one, I'd think
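# A hedged illustration (my addition): use the fitted model to see the
# marginal effect of a single vote of each type, holding the others at zero.
print linreg.predict(np.array([[1, 0, 0]]))  # roughly intercept + 0.274
print linreg.predict(np.array([[0, 1, 0]]))  # roughly intercept - 0.147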
zip(feature_cols, linreg.coef_)  # pair each feature name with its coefficient
# train-test split and evaluate: compute RMSE
def train_test_rmse(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred = linreg.predict(X_test)
    return np.sqrt(metrics.mean_squared_error(y_test, y_pred))
train_test_rmse(X, y) #1.184
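# A hedged alternative (my addition): a cross-validated RMSE instead of a
# single split, using the same cross_validation module already imported above;
# this scorer name is from the sklearn versions of that era, which return
# negative MSE by convention.
from sklearn.cross_validation import cross_val_score
mse_scores = cross_val_score(LinearRegression(), X, y, cv=10,
                             scoring='mean_squared_error')
print np.sqrt(-mse_scores).mean()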
# try models with one feature removed
# remove cool
feature_cols = ['useful', 'funny']
X = yelp[feature_cols]
train_test_rmse(X, y) #1.210
#remove funny
feature_cols = ['useful', 'cool']
X = yelp[feature_cols]
train_test_rmse(X, y) #1.196
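# A hedged sketch (my addition): loop over every feature subset to compare
# the RMSEs systematically with the same train_test_rmse helper.
from itertools import combinations
all_cols = ['cool', 'useful', 'funny']
for n in range(1, len(all_cols) + 1):
    for subset in combinations(all_cols, n):
        print subset, train_test_rmse(yelp[list(subset)], y)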
# come up with new features
# 6403 unique users; the slope of review count vs. mean stars is negative,
# which might be useful
z = yelp.groupby('user_id').stars.agg(['count', 'mean'])
z.head()
z.sort('count', ascending=False, inplace=True)
sns.regplot(x = 'count', y = 'mean', data=z)
# 4174 unique businesses; here the slope is positive
z = yelp.groupby('business_id').stars.agg(['count', 'mean'])
z.head()
z.sort('count', ascending=False, inplace=True)
sns.regplot(x = 'count', y = 'mean', data=z)
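# A hedged sketch (my addition): these per-user and per-business averages
# could be merged back in as model features via groupby().transform(); note
# that computing them on the full data leaks the target, so in practice they
# should be computed from the training split only.
yelp['user_avg'] = yelp.groupby('user_id').stars.transform('mean')
yelp['biz_avg'] = yelp.groupby('business_id').stars.transform('mean')
train_test_rmse(yelp[['user_avg', 'biz_avg']], y)  # optimistic due to leakage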
# new feature: length of the review text (character count)
yelp['rev_length'] = yelp.text.apply(len)
sns.regplot(x='rev_length', y='stars', data=yelp)
# inverse relationship with stars
feature_cols = ['rev_length']
X = yelp[feature_cols]
linreg = LinearRegression()
linreg.fit(X, y)
train_test_rmse(X, y)  # 1.20
feature_cols = ['rev_length', 'cool', 'useful', 'funny']
X = yelp[feature_cols]
linreg = LinearRegression()
linreg.fit(X, y)
train_test_rmse(X, y)  # 1.178
# treating stars as a classification problem
# visualizing with seaborn
sns.stripplot(x="funny", y="stars", data=yelp)
# 8: create a feature that is just the mean star rating (a null model)
yelp['average'] = yelp.stars.mean()
feature_cols = ['average']
X = yelp[feature_cols]
linreg = LinearRegression()
linreg.fit(X, y)
train_test_rmse(X, y)  # 1.21 - higher than any of the models with real features
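# Sanity check (my addition): the null-model RMSE can be computed directly,
# since always predicting the mean gives an RMSE equal to the standard
# deviation of y; this should roughly match the ~1.21 above.
print np.sqrt(((y - y.mean()) ** 2).mean())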
feature_cols = ['cool', 'useful', 'funny']
X = yelp[feature_cols]
y = yelp.stars
from sklearn.neighbors import KNeighborsClassifier
# metrics and train_test_split were already imported above
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print metrics.accuracy_score(y_test, y_pred) # 27.7%
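# A hedged follow-up (my addition): compare against the null accuracy of
# always predicting the most common rating, and try a few values of K; the
# exact numbers depend on the random split above.
print y.value_counts(normalize=True).head(1)  # null accuracy baseline
for k in range(1, 21, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print k, metrics.accuracy_score(y_test, knn.predict(X_test))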