Created
July 6, 2015 22:08
-
-
Save Sanjogsharma/da834e5635789a10acb1 to your computer and use it in GitHub Desktop.
HW with Yelp data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Core numerics and data handling.
import numpy as np
import pandas as pd

# Modeling.
from sklearn.linear_model import LinearRegression
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels.formula.api as smf

# Visualization.
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import axes3d
# Load the raw JSON (one JSON object per line) and wrap the lines in
# brackets so pandas can parse the whole thing as a single JSON array.
with open('yelp.json', 'r') as yelpdata:
    yelpdata2 = '[' + ','.join(yelpdata.readlines()) + ']'
yelp3 = pd.read_json(yelpdata2)
yelp3.head()

# The CSV version of the same data is what the rest of the script uses.
yelp = pd.read_table('yelp.csv', sep=',')

# Explore relationship between vote counts and star rating (seaborn).
sns.pairplot(yelp, x_vars=['cool', 'useful', 'funny'],
             y_vars='stars', kind='reg')
yelp.corr()
# 'cool' has a positive relationship with the number of stars and the
# other two have a negative one.

'''Fit a linear regression model /interpret the coefficients.
Do the coefficients make intuitive sense?
'''
feature_cols = ['cool', 'useful', 'funny']
X = yelp[feature_cols]
y = yelp.stars

linreg = LinearRegression()
linreg.fit(X, y)
# Python 3 print() calls (the original used Python 2 print statements,
# which are a SyntaxError under Python 3).
print(linreg.intercept_)
print(linreg.coef_)
# cool .274, useful -.147, funny -.136
# Yes, they make intuitive sense: a funnier review is more likely to be
# a negative one, I'd think.

# Test-train split and evaluate, compute RMSE.
# zip() is lazy in Python 3, so materialize it to actually see the pairs.
list(zip(feature_cols, linreg.coef_))
def train_test_rmse(X, y, random_state=1):
    """Fit a LinearRegression on a train split and return test-set RMSE.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Target values.
    random_state : int, default 1
        Seed for the train/test split. Exposed as a parameter (the
        original hard-coded 1) so callers can vary the split; the
        default preserves the original behavior exactly.

    Returns
    -------
    float
        Root mean squared error on the held-out test set.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # RMSE = sqrt(MSE); lower is better.
    return np.sqrt(metrics.mean_squared_error(y_test, y_pred))
train_test_rmse(X, y)  # 1.184

# --- Feature-removal models ---
# Remove 'cool'.
feature_cols = ['useful', 'funny']
X = yelp[feature_cols]
train_test_rmse(X, y)  # 1.210

# Remove 'funny'.
feature_cols = ['useful', 'cool']
X = yelp[feature_cols]
train_test_rmse(X, y)  # 1.196

# --- Come up with new features ---
# 6403 total users - slope is negative; this might be useful.
z = yelp.groupby('user_id').stars.agg(['count', 'mean'])
z.head()
# NOTE: DataFrame.sort() was removed in pandas 0.20; sort_values() is the
# drop-in replacement for this call.
z.sort_values('count', ascending=False, inplace=True)
sns.regplot(x='count', y='mean', data=z)

# 4174 total businesses - slope is positive.
z = yelp.groupby('business_id').stars.agg(['count', 'mean'])
z.head()
z.sort_values('count', ascending=False, inplace=True)
sns.regplot(x='count', y='mean', data=z)

# New feature: character length of the review text. (The original comment
# said "word count", but len() of a string is its character count.)
# Vectorized str.len() replaces the original index-based list
# comprehension; same values assuming no missing review text.
yelp['rev_length'] = yelp.text.str.len()
sns.regplot(x='rev_length', y='stars', data=yelp)
# Inverse relationship with stars.

# The original also fit a throwaway module-level LinearRegression before
# each call below; dropped, since train_test_rmse fits its own model.
feature_cols = ['rev_length']
X = yelp[feature_cols]
train_test_rmse(X, y)  # 1.20

feature_cols = ['rev_length', 'cool', 'useful', 'funny']
X = yelp[feature_cols]
train_test_rmse(X, y)  # 1.178
# --- Treating stars as a classification problem ---

# Visualizing using seaborn.
sns.stripplot(x="funny", y="stars", data=yelp)

# 8: create a baseline feature - a constant column holding the mean star
# rating. A throwaway module-level fit before train_test_rmse was dropped
# (train_test_rmse fits its own model).
yelp['average'] = yelp.stars.mean()
feature_cols = ['average']
X = yelp[feature_cols]
train_test_rmse(X, y)  # 1.21 - the RMSE is higher

# KNN classification on the vote-count features.
feature_cols = ['cool', 'useful', 'funny']
X = yelp[feature_cols]
y = yelp.stars

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
# NOTE: re-import modernized - sklearn.cross_validation was removed in
# scikit-learn 0.20 in favor of sklearn.model_selection.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# Python 3 print() call (original was a Python 2 print statement).
print(metrics.accuracy_score(y_test, y_pred))  # 27.7%
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment