Created
July 6, 2015 22:08
-
-
Save Sanjogsharma/da834e5635789a10acb1 to your computer and use it in GitHub Desktop.
HW with Yelp data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Core numerics and data handling.
import numpy as np
import pandas as pd

# Modeling.
from sklearn.linear_model import LinearRegression
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels.formula.api as smf

# Visualization.
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import axes3d
# Load the raw JSON (one JSON object per line) and wrap the lines in
# brackets so pandas can parse the whole thing as a single JSON array.
with open('yelp.json', 'r') as yelpdata:
    yelpdata2 = '[' + ','.join(yelpdata.readlines()) + ']'
yelp3 = pd.read_json(yelpdata2)
yelp3.head()

# The CSV version of the same data is what the rest of the script uses.
yelp = pd.read_table('yelp.csv', sep=',')

# Explore relationship between vote counts and star rating (seaborn).
sns.pairplot(yelp, x_vars=['cool', 'useful', 'funny'],
             y_vars='stars', kind='reg')
yelp.corr()
# 'cool' has a positive relationship with the number of stars and the
# other two have a negative one.

'''Fit a linear regression model /interpret the coefficients.
Do the coefficients make intuitive sense?
'''
feature_cols = ['cool', 'useful', 'funny']
X = yelp[feature_cols]
y = yelp.stars

linreg = LinearRegression()
linreg.fit(X, y)
# Python 3 print() calls (the original used Python 2 print statements,
# which are a SyntaxError under Python 3).
print(linreg.intercept_)
print(linreg.coef_)
# cool .274, useful -.147, funny -.136
# Yes, they make intuitive sense: a funnier review is more likely to be
# a negative one, I'd think.

# Test-train split and evaluate, compute RMSE.
# zip() is lazy in Python 3, so materialize it to actually see the pairs.
list(zip(feature_cols, linreg.coef_))
def train_test_rmse(X, y, random_state=1):
    """Fit a LinearRegression on a train split and return test-set RMSE.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Target values.
    random_state : int, default 1
        Seed for the train/test split. Exposed as a parameter (the
        original hard-coded 1) so callers can vary the split; the
        default preserves the original behavior exactly.

    Returns
    -------
    float
        Root mean squared error on the held-out test set.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # RMSE = sqrt(MSE); lower is better.
    return np.sqrt(metrics.mean_squared_error(y_test, y_pred))
train_test_rmse(X, y)  # 1.184

# --- Feature-removal models ---
# Remove 'cool'.
feature_cols = ['useful', 'funny']
X = yelp[feature_cols]
train_test_rmse(X, y)  # 1.210

# Remove 'funny'.
feature_cols = ['useful', 'cool']
X = yelp[feature_cols]
train_test_rmse(X, y)  # 1.196

# --- Come up with new features ---
# 6403 total users - slope is negative; this might be useful.
z = yelp.groupby('user_id').stars.agg(['count', 'mean'])
z.head()
# NOTE: DataFrame.sort() was removed in pandas 0.20; sort_values() is the
# drop-in replacement for this call.
z.sort_values('count', ascending=False, inplace=True)
sns.regplot(x='count', y='mean', data=z)

# 4174 total businesses - slope is positive.
z = yelp.groupby('business_id').stars.agg(['count', 'mean'])
z.head()
z.sort_values('count', ascending=False, inplace=True)
sns.regplot(x='count', y='mean', data=z)

# New feature: character length of the review text. (The original comment
# said "word count", but len() of a string is its character count.)
# Vectorized str.len() replaces the original index-based list
# comprehension; same values assuming no missing review text.
yelp['rev_length'] = yelp.text.str.len()
sns.regplot(x='rev_length', y='stars', data=yelp)
# Inverse relationship with stars.

# The original also fit a throwaway module-level LinearRegression before
# each call below; dropped, since train_test_rmse fits its own model.
feature_cols = ['rev_length']
X = yelp[feature_cols]
train_test_rmse(X, y)  # 1.20

feature_cols = ['rev_length', 'cool', 'useful', 'funny']
X = yelp[feature_cols]
train_test_rmse(X, y)  # 1.178
# --- Treating stars as a classification problem ---

# Visualizing using seaborn.
sns.stripplot(x="funny", y="stars", data=yelp)

# 8: create a baseline feature - a constant column holding the mean star
# rating. A throwaway module-level fit before train_test_rmse was dropped
# (train_test_rmse fits its own model).
yelp['average'] = yelp.stars.mean()
feature_cols = ['average']
X = yelp[feature_cols]
train_test_rmse(X, y)  # 1.21 - the RMSE is higher

# KNN classification on the vote-count features.
feature_cols = ['cool', 'useful', 'funny']
X = yelp[feature_cols]
y = yelp.stars

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
# NOTE: re-import modernized - sklearn.cross_validation was removed in
# scikit-learn 0.20 in favor of sklearn.model_selection.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# Python 3 print() call (original was a Python 2 print statement).
print(metrics.accuracy_score(y_test, y_pred))  # 27.7%
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment