import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegressionCV
import seaborn as sns
%matplotlib inline
from sklearn.metrics import mean_squared_error
from math import sqrt
def cf_rmse(prediction, ground_truth):
    """Return the RMSE between predicted and observed ratings.

    Only the positions where ``ground_truth`` is non-zero take part in the
    score; a zero in ``ground_truth`` is treated as "no rating".

    Args:
        prediction: Array-like of predicted ratings that can be indexed with
            the non-zero positions of ``ground_truth``.
        ground_truth: Array-like of observed ratings, with 0 marking the
            entries to ignore.

    Returns:
        The root-mean-squared error over the selected positions, as a float.

    Raises:
        ValueError: If ``ground_truth`` has no non-zero entry, so there is
            nothing to score.
    """
    # Coerce first so DataFrames and nested lists work, not only ndarrays.
    prediction = np.asarray(prediction, dtype=float)
    ground_truth = np.asarray(ground_truth, dtype=float)

    rated = ground_truth.nonzero()
    if rated[0].size == 0:
        raise ValueError("ground_truth has no non-zero entries to evaluate")

    errors = prediction[rated] - ground_truth[rated]
    return float(np.sqrt(np.mean(errors ** 2)))
# Load the Montreal review, user and business tables (train and test splits)
# from CSV files located in the current working directory.
df_review_train = pd.read_csv("Montreal_review_train.csv")
df_review_test = pd.read_csv("Montreal_review_test.csv")
df_user_train = pd.read_csv("Montreal_user_train.csv")
df_user_test = pd.read_csv("Montreal_user_test.csv")
df_business_train = pd.read_csv("Montreal_business_train.csv")
df_business_test = pd.read_csv("Montreal_business_test.csv")
# Preview the first rows of the training user table.
df_user_train.head()
Unnamed: 0 | Unnamed: 0.1 | average_stars | compliment_cool | compliment_cute | compliment_funny | compliment_hot | compliment_list | compliment_more | compliment_note | ... | elite | fans | friends | funny | name | review_count | useful | user_id | yelping_since | elite_status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 10 | 4.10 | 1302 | 41 | 1302 | 806 | 20 | 46 | 725 | ... | [2016, 2014, 2015, 2017, 2012, 2011, 2013] | 435 | ['xRYvFaMGWsvKcLCFtRIzWQ', 'zvQ7B3KZuFOX7pYLsO... | 4880 | Risa | 1122 | 26395 | Wc5L6iuvSNF5WGBlqIO8nw | 2011-07-30 | Yes |
1 | 1 | 13 | 3.79 | 1139 | 87 | 1139 | 782 | 54 | 103 | 391 | ... | [2012, 2008, 2009, 2010, 2007, 2006, 2013, 2011] | 198 | ['KOwp5RDbm7cDyrdXN8FVQQ', '7MlH7OevWSkenMyKFI... | 10715 | Holly | 698 | 24047 | Dd-TkEszFMkSF-vRih51fQ | 2006-07-03 | Yes |
2 | 2 | 37 | 3.74 | 129 | 2 | 129 | 77 | 6 | 12 | 56 | ... | [2017, 2016, 2012, 2014, 2015, 2011, 2013] | 68 | ['Cq8uhBLRO1T9l-9R9OmddQ', 'x3_b9Rv-GZpjtCDLqg... | 105 | Jeff | 754 | 151 | YTdNcIWAt2nEzZ7NY-fniw | 2011-05-16 | Yes |
3 | 3 | 110 | 4.07 | 60 | 1 | 60 | 51 | 1 | 16 | 19 | ... | [2014, 2012, 2015, 2011, 2013, 2016, 2017] | 33 | ['8s7UH21vFgkRJAJg2L8VzA', 'HWGrt1MEXlzZ71NGx0... | 9 | Cecille | 356 | 36 | bTRFge5pRWMh7IoCLn7lBw | 2007-08-03 | Yes |
4 | 4 | 117 | 3.64 | 23 | 2 | 23 | 31 | 0 | 3 | 13 | ... | [2012, 2013] | 15 | ['G-Hav6XBWPEyzI-0nNpdxw', 'EgqsK7MUgqpbaTVZAv... | 36 | Carolina | 115 | 89 | -w7ww3yW5BHE3TFyj3IHuQ | 2010-06-29 | Yes |
5 rows × 25 columns
# Sanity check: print each training table's shape followed by the number of
# distinct ids it contains.
print(df_review_train.shape)
# Distinct users and businesses that appear in the training reviews.
print(len(df_review_train['user_id'].unique()))
print(len(df_review_train['business_id'].unique()))
print(df_user_train.shape)
print(len(df_user_train['user_id'].unique()))
print(df_business_train.shape)
print(len(df_business_train['business_id'].unique()))
(30999, 11)
3201
2429
(3201, 25)
3201
(2429, 18)
2429
# Display (rows, columns) of the training review table.
df_review_train.shape
(30999, 11)
# Reshape the review tables into user x business matrices of star ratings;
# (user, business) pairs without a review are filled with 0.
pivot_review_train = df_review_train.pivot(index = 'user_id', columns ='business_id', values = 'stars').fillna(0)
pivot_review_test = df_review_test.pivot(index = 'user_id', columns ='business_id', values = 'stars').fillna(0)
# Display the (users, businesses) shape of the training matrix.
pivot_review_train.shape
(3201, 2429)
from scipy.sparse.linalg import svds
def user_svd_predict(df, df_, k=20):
    """Predict a dense rating matrix from a rank-``k`` truncated SVD.

    Every row of ``df`` is centred on its own mean, the centred matrix is
    approximated from ``k`` singular triplets computed by ``svds``, and the
    row means are added back. A second matrix is built by additionally
    adding the matching ``average_stars`` value to every entry of a row.

    Args:
        df: Rating matrix (DataFrame or array-like) with one row per user.
        df_: Table with an ``average_stars`` column holding one value per
            row of ``df``.
        k: Number of singular values/vectors requested from ``svds``.
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        Tuple ``(total, base)`` of arrays shaped like ``df``: ``base`` is the
        SVD reconstruction with the row means restored, and ``total`` is
        ``base`` plus the per-row ``average_stars``.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; np.asarray works on
    # every pandas version and also accepts a plain ndarray.
    R = np.asarray(df, dtype=float)
    row_mean = R.mean(axis=1).reshape(-1, 1)

    U, sigma, Vt = svds(R - row_mean, k=k)
    # U * sigma scales the columns of U, which equals U @ diag(sigma).
    base = np.dot(U * sigma, Vt) + row_mean

    # NOTE(review): the averages are matched to rows purely by position, so
    # df_ must list its rows in the same order as df - confirm at call sites.
    average_rating = np.asarray(df_['average_stars'], dtype=float).reshape(-1, 1)
    total = base + average_rating  # broadcasts one average across each row
    return (total, base)
def business_svd_predict(df, df_, k=20):
    """Predict a dense rating matrix from a rank-``k`` truncated SVD.

    Every row of ``df`` is centred on its own mean, the centred matrix is
    approximated from ``k`` singular triplets computed by ``svds``, and the
    row means are added back. A second matrix is built by additionally
    adding the matching ``stars`` value to every entry of a row.

    Args:
        df: Rating matrix (DataFrame or array-like) with one row per
            business.
        df_: Table with a ``stars`` column holding one value per row of
            ``df``.
        k: Number of singular values/vectors requested from ``svds``.
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        Tuple ``(total, base)`` of arrays shaped like ``df``: ``base`` is the
        SVD reconstruction with the row means restored, and ``total`` is
        ``base`` plus the per-row ``stars``.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; np.asarray works on
    # every pandas version and also accepts a plain ndarray.
    R = np.asarray(df, dtype=float)
    row_mean = R.mean(axis=1).reshape(-1, 1)

    U, sigma, Vt = svds(R - row_mean, k=k)
    # U * sigma scales the columns of U, which equals U @ diag(sigma).
    base = np.dot(U * sigma, Vt) + row_mean

    # NOTE(review): the averages are matched to rows purely by position, so
    # df_ must list its rows in the same order as df - confirm at call sites.
    average_rating = np.asarray(df_['stars'], dtype=float).reshape(-1, 1)
    total = base + average_rating  # broadcasts one average across each row
    return (total, base)
# Element [0] of each result is the SVD reconstruction with the per-row
# average rating (average_stars for users, stars for businesses) added on top.
# NOTE(review): the user/business tables are matched to the pivot rows by
# position only - confirm they are in the same order as the pivot index.
all_user_predicted_ratings_total = user_svd_predict(pivot_review_train, df_user_train)[0]
all_user_predicted_ratings_total_test = user_svd_predict(pivot_review_test, df_user_test)[0]
# The business variants factorise the transposed (business x user) matrix.
all_business_predicted_ratings_total = business_svd_predict(pivot_review_train.T, df_business_train)[0]
all_business_predicted_ratings_total_test = business_svd_predict(pivot_review_test.T, df_business_test)[0]
# Display the (users, businesses) shape of the training matrix.
pivot_review_train.shape
(3201, 2429)
# Display (rows, columns) of the training user table.
df_user_train.shape
(3201, 25)
# Id columns of the user/business tables, in the row order of those tables.
user_id_train = df_user_train['user_id']
user_id_test = df_user_test['user_id']
business_id_train = df_business_train['business_id']
business_id_test = df_business_test['business_id']
# Wrap the training predictions in a labelled user x business frame.
# NOTE(review): the prediction rows come from pivot_review_train, whose index
# order may differ from df_user_train's row order used for the labels here -
# verify that both orders agree, otherwise rows are attributed to wrong users.
preds_df_train = pd.DataFrame(all_user_predicted_ratings_total, columns=pivot_review_train.columns, index=user_id_train)
preds_df_train.head()
business_id | -0uEqc2vw1xXtuI_r1xTNg | -1xuC540Nycht_iWFeJ-dw | -7bRnaHp7OHz8KW-THqP4w | -92cC6-X87HQ1DE1UHOx3w | -AgfhwHOYrsPKt-_xV_Ipg | -BPHhtX6zzI59IX7ZY-AQA | -FDkvLmwaBrtVgYFqEWeWA | -FPc3kwUU9GTDd4LzurvTQ | -GHqz1jGYzAtn27CeHeWeA | -HsqnPAz374YSoyFDyjl3A | ... | zqV3T9HltH1pmlRFJJSFcA | zr2wA55AskfBJxrvUeDZRA | zrnP9HqoF-RI9jqoW8pytA | zsMMlOYtXm8SNy0bl1leBA | zsbsLCO-bw3gdNE9XNgBYw | zv92BYJH09YjFQOtSyYp-A | zwBEMcCVqh8wOXn_sOIfxg | zwgVuZcMgijt9k3Jq-2zQQ | zwkif4XLEDqdEwEgTWLIVQ | zzjKekzQ6i4iR-qpo405Pw |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||||||
Wc5L6iuvSNF5WGBlqIO8nw | 4.111058 | 3.765381 | 4.079379 | 4.128996 | 4.074044 | 4.113905 | 4.090428 | 4.105445 | 4.112865 | 4.236941 | ... | 4.072490 | 4.138484 | 4.098128 | 4.177040 | 4.119881 | 4.092555 | 4.095933 | 4.094899 | 4.219485 | 4.085823 |
Dd-TkEszFMkSF-vRih51fQ | 3.792130 | 3.907054 | 3.789752 | 3.762797 | 3.788838 | 3.810440 | 3.798067 | 3.790774 | 3.802109 | 3.839521 | ... | 3.788810 | 3.806899 | 3.794853 | 3.772085 | 3.803739 | 3.794808 | 3.817778 | 3.795181 | 3.762455 | 3.772413 |
YTdNcIWAt2nEzZ7NY-fniw | 3.750744 | 3.878034 | 3.753158 | 3.769249 | 3.752987 | 3.731422 | 3.747570 | 3.749814 | 3.742257 | 3.729388 | ... | 3.763453 | 3.765689 | 3.747911 | 3.749756 | 3.746374 | 3.748165 | 3.732125 | 3.749004 | 3.779889 | 3.762268 |
bTRFge5pRWMh7IoCLn7lBw | 4.115649 | 4.236376 | 4.113031 | 4.187895 | 4.109432 | 4.201939 | 4.100459 | 4.078761 | 4.103681 | 4.439442 | ... | 4.131304 | 4.116208 | 4.090716 | 4.198155 | 4.069027 | 4.081443 | 4.195563 | 4.100826 | 3.985273 | 4.088402 |
-w7ww3yW5BHE3TFyj3IHuQ | 3.639676 | 3.640164 | 3.643093 | 3.701733 | 3.621803 | 3.658139 | 3.634003 | 3.638492 | 3.636034 | 3.699485 | ... | 3.646045 | 3.635107 | 3.640433 | 3.654267 | 3.620071 | 3.637269 | 3.671494 | 3.637200 | 3.682764 | 3.668489 |
5 rows × 2429 columns
# Wrap the test predictions in a labelled user x business frame.
# NOTE(review): the prediction rows come from pivot_review_test, whose index
# order may differ from df_user_test's row order used for the labels here -
# verify that both orders agree.
preds_df_test = pd.DataFrame(all_user_predicted_ratings_total_test, columns=pivot_review_test.columns, index=user_id_test)
preds_df_test.head()
business_id | -0uEqc2vw1xXtuI_r1xTNg | -1xuC540Nycht_iWFeJ-dw | -7bRnaHp7OHz8KW-THqP4w | -92cC6-X87HQ1DE1UHOx3w | -AgfhwHOYrsPKt-_xV_Ipg | -FDkvLmwaBrtVgYFqEWeWA | -FPc3kwUU9GTDd4LzurvTQ | -HsqnPAz374YSoyFDyjl3A | -MwaICRwxaUi0JBfad2Y3Q | -Mz3M0g6iFZczs6a7ddf5g | ... | zktCQRlDtF6XmOpqKBz1mA | zmQyE-gIUpwBCMmTFFRbJw | zpw5S3QwUse1MH-Eerbnaw | zqV3T9HltH1pmlRFJJSFcA | zr2wA55AskfBJxrvUeDZRA | zrnP9HqoF-RI9jqoW8pytA | zsMMlOYtXm8SNy0bl1leBA | zwBEMcCVqh8wOXn_sOIfxg | zwgVuZcMgijt9k3Jq-2zQQ | zwkif4XLEDqdEwEgTWLIVQ |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||||||
Wc5L6iuvSNF5WGBlqIO8nw | 4.102477 | 4.249559 | 4.102253 | 4.101849 | 4.100737 | 4.102513 | 4.102370 | 4.102075 | 4.101559 | 4.102597 | ... | 4.100987 | 4.100863 | 4.102219 | 4.098799 | 4.105492 | 4.101588 | 4.101210 | 4.098801 | 4.102365 | 4.101979 |
Dd-TkEszFMkSF-vRih51fQ | 3.790618 | 3.788917 | 3.790619 | 3.790514 | 3.790607 | 3.790589 | 3.790622 | 3.790513 | 3.790574 | 3.790484 | ... | 3.790574 | 3.790609 | 3.790513 | 3.790247 | 3.790299 | 3.790607 | 3.790486 | 3.790552 | 3.790628 | 3.790606 |
YTdNcIWAt2nEzZ7NY-fniw | 3.742174 | 3.718317 | 3.742233 | 3.742351 | 3.742031 | 3.742384 | 3.742243 | 3.740605 | 3.741719 | 3.742942 | ... | 3.740200 | 3.742232 | 3.743492 | 3.739589 | 3.745336 | 3.742487 | 3.742482 | 3.741877 | 3.742250 | 3.742319 |
bTRFge5pRWMh7IoCLn7lBw | 4.077589 | 4.333084 | 4.075519 | 4.064777 | 4.078651 | 4.082149 | 4.076959 | 4.093477 | 4.095684 | 4.084624 | ... | 4.072748 | 4.078266 | 4.075081 | 4.121961 | 4.046767 | 4.065211 | 4.068705 | 4.088010 | 4.077159 | 4.076365 |
-w7ww3yW5BHE3TFyj3IHuQ | 3.642578 | 3.688117 | 3.642397 | 3.645565 | 3.641951 | 3.642059 | 3.642840 | 3.642970 | 3.649755 | 3.640657 | ... | 3.644327 | 3.643082 | 3.643405 | 3.664063 | 3.646322 | 3.641457 | 3.644397 | 3.642431 | 3.642516 | 3.642415 |
5 rows × 2070 columns
def simple_recommend(user_id, count, preds=None):
    """Return the ``count`` highest predicted ratings for one user.

    Args:
        user_id: Row label of the user in the prediction frame.
        count: Number of businesses to return.
        preds: Prediction frame with users as rows and businesses as columns.
            Defaults to the module-level ``preds_df_train``.

    Returns:
        A Series named ``user_id``, indexed by business, holding the top
        ``count`` predicted ratings in descending order.

    Raises:
        KeyError: If ``user_id`` is not a row label of the prediction frame.
    """
    if preds is None:
        preds = preds_df_train
    # A row lookup avoids transposing the whole frame on every call.
    return preds.loc[user_id].sort_values(ascending=False).head(count)
# Example: the 10 businesses with the highest predicted rating for one user.
simple_recommend('yML2P1evj7FrLncIgaFzHw', 10)
business_id
kKY726bQREexYHHNLK1H7g 4.632830
IRIlwpomRvnXvpkeaGaM2A 4.390620
mm2wLW24ESxNIEL2bjseaQ 4.201337
um_o0pxQ3DlRI9EfCzw0hw 4.194907
2gUbgbdJ7IFSbicBXlSchw 4.142699
y32M2Hkr7GsUqGG6KwOhZw 4.079658
58APdML-PG_OD4El2ePTvw 4.057295
FhgAHo-8--equM8w5UZ41Q 4.016346
JN8s_dgw9nrSzkHnXxNOtg 3.909097
s2I_Ni76bjJNK9yG60iD-Q 3.902061
Name: yML2P1evj7FrLncIgaFzHw, dtype: float64
from sklearn.metrics.pairwise import pairwise_distances
# Cosine similarity (1 - cosine distance) between the rows of the SVD
# reconstructions (element [1]: without the added average rating).
user_similarity_train = 1 - pairwise_distances(user_svd_predict(pivot_review_train, df_user_train)[1], metric='cosine')
user_similarity_test = 1 - pairwise_distances(user_svd_predict(pivot_review_test, df_user_test)[1], metric='cosine')
business_similarity_train = 1 - pairwise_distances(business_svd_predict(pivot_review_train.T, df_business_train)[1], metric='cosine')
business_similarity_test = 1 - pairwise_distances(business_svd_predict(pivot_review_test.T, df_business_test)[1], metric='cosine')
# The similarity rows follow the row order of pivot_review_train, so label
# them with that index; df_user_train lists the users in a different order,
# which attached every row to the wrong user id.
user_similarity_matrix_train = pd.DataFrame(user_similarity_train, columns=pivot_review_train.index, index=pivot_review_train.index)
user_similarity_matrix_train.head()
user_id | Wc5L6iuvSNF5WGBlqIO8nw | Dd-TkEszFMkSF-vRih51fQ | YTdNcIWAt2nEzZ7NY-fniw | bTRFge5pRWMh7IoCLn7lBw | -w7ww3yW5BHE3TFyj3IHuQ | 4hAauH0dy57uK9o8bCvGUw | VMfwMYh8iJapW807Pu1Diw | lKRbcLWDQmOmhcMa3vMCMA | 2vJ2e51kdbdAmAo_HTr4KQ | 9KpMzih4E_gEioFtNeuIIw | ... | v7q2D8s1vsglwQaQcyb8_A | hOYNnE3qzb8TDKd3jqvq7Q | LqywrHdM-H8gSdKtGrhBuw | iIIbkFd_kgK3n2ewvLstXA | KJIS0INMJKhBmGqFkHMc-A | Ih3dwaCS1snsbhS8vRdxHA | LY-KaOJyXzbwZyqjQfl7xA | e3XuTKzX3w8LP-mEqQgJ9g | awdAcl2dA_WvUPWKOCS1OA | 0wXvG8Jiu8zdZhvezBgOwA |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||||||
Wc5L6iuvSNF5WGBlqIO8nw | 1.000000 | 0.123739 | -0.033120 | 0.288995 | 0.581747 | 0.137748 | 0.093395 | 0.367794 | 0.107102 | -0.062889 | ... | 0.317822 | 0.267291 | 0.089310 | 0.074979 | 0.187363 | -0.049799 | 0.018654 | 0.126737 | 0.120642 | 0.096226 |
Dd-TkEszFMkSF-vRih51fQ | 0.123739 | 1.000000 | 0.082072 | 0.350151 | 0.107605 | 0.043687 | 0.261767 | -0.083328 | -0.021795 | 0.186220 | ... | 0.108294 | 0.242365 | 0.253467 | 0.053828 | 0.244121 | -0.036538 | 0.265373 | 0.397636 | 0.114079 | -0.098947 |
YTdNcIWAt2nEzZ7NY-fniw | -0.033120 | 0.082072 | 1.000000 | 0.057411 | 0.078848 | 0.348380 | 0.474439 | 0.275562 | 0.461942 | 0.127301 | ... | -0.010050 | 0.215982 | -0.109354 | 0.237667 | 0.423541 | 0.186048 | 0.425421 | 0.331947 | 0.043822 | 0.423622 |
bTRFge5pRWMh7IoCLn7lBw | 0.288995 | 0.350151 | 0.057411 | 1.000000 | 0.378264 | 0.021736 | 0.323239 | 0.408859 | 0.279707 | 0.094915 | ... | 0.140430 | -0.173267 | 0.514292 | 0.148501 | 0.373318 | 0.159174 | 0.036335 | 0.105005 | 0.271699 | 0.127442 |
-w7ww3yW5BHE3TFyj3IHuQ | 0.581747 | 0.107605 | 0.078848 | 0.378264 | 1.000000 | -0.207281 | 0.115763 | 0.133768 | 0.536527 | 0.002525 | ... | 0.090438 | -0.006708 | 0.176181 | -0.010482 | 0.351390 | -0.071426 | 0.141043 | 0.097905 | 0.579939 | -0.221361 |
5 rows × 3201 columns
# user_similarity_test is computed from pivot_review_test, so its rows follow
# that pivot's index order; label with it instead of df_user_test's row
# order, which differs and mislabelled the users.
user_similarity_matrix_test = pd.DataFrame(user_similarity_test, columns=pivot_review_test.index, index=pivot_review_test.index)
user_similarity_matrix_test.head()
user_id | Wc5L6iuvSNF5WGBlqIO8nw | Dd-TkEszFMkSF-vRih51fQ | YTdNcIWAt2nEzZ7NY-fniw | bTRFge5pRWMh7IoCLn7lBw | -w7ww3yW5BHE3TFyj3IHuQ | 4hAauH0dy57uK9o8bCvGUw | VMfwMYh8iJapW807Pu1Diw | lKRbcLWDQmOmhcMa3vMCMA | 2vJ2e51kdbdAmAo_HTr4KQ | 9KpMzih4E_gEioFtNeuIIw | ... | v7q2D8s1vsglwQaQcyb8_A | hOYNnE3qzb8TDKd3jqvq7Q | LqywrHdM-H8gSdKtGrhBuw | iIIbkFd_kgK3n2ewvLstXA | KJIS0INMJKhBmGqFkHMc-A | Ih3dwaCS1snsbhS8vRdxHA | LY-KaOJyXzbwZyqjQfl7xA | e3XuTKzX3w8LP-mEqQgJ9g | awdAcl2dA_WvUPWKOCS1OA | 0wXvG8Jiu8zdZhvezBgOwA |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||||||
Wc5L6iuvSNF5WGBlqIO8nw | 1.000000 | 0.273028 | 0.029052 | 0.184856 | 0.024051 | 0.461267 | 0.180832 | -0.215365 | -0.025200 | 0.160955 | ... | 0.234125 | -0.125812 | 0.262517 | 0.101626 | 0.191844 | 0.252877 | -0.068769 | 0.252614 | -0.127824 | 0.112969 |
Dd-TkEszFMkSF-vRih51fQ | 0.273028 | 1.000000 | 0.756588 | 0.152833 | 0.264246 | 0.858565 | 0.570801 | 0.283083 | -0.025041 | 0.529752 | ... | 0.934899 | 0.572209 | 0.580154 | 0.787125 | 0.327144 | -0.019333 | 0.120580 | 0.962273 | 0.103892 | 0.289312 |
YTdNcIWAt2nEzZ7NY-fniw | 0.029052 | 0.756588 | 1.000000 | 0.144151 | 0.128624 | 0.711422 | 0.576731 | 0.552788 | -0.020392 | 0.799825 | ... | 0.787761 | 0.504840 | 0.515459 | 0.643395 | 0.443141 | 0.199837 | 0.492037 | 0.668754 | 0.193777 | 0.227147 |
bTRFge5pRWMh7IoCLn7lBw | 0.184856 | 0.152833 | 0.144151 | 1.000000 | 0.342405 | 0.138795 | 0.421830 | 0.194041 | 0.101054 | 0.136212 | ... | 0.246640 | 0.014915 | 0.265498 | 0.147580 | 0.369932 | 0.225324 | -0.008432 | 0.081523 | -0.241318 | -0.066618 |
-w7ww3yW5BHE3TFyj3IHuQ | 0.024051 | 0.264246 | 0.128624 | 0.342405 | 1.000000 | 0.218998 | 0.527180 | -0.051809 | -0.016700 | 0.247150 | ... | 0.235919 | 0.186573 | 0.156075 | 0.305037 | 0.159912 | -0.208084 | -0.007545 | 0.276780 | 0.130077 | 0.028004 |
5 rows × 2800 columns
# business_similarity_train is computed from pivot_review_train.T, so its
# rows follow the pivot's column (business_id) order; label with it instead
# of df_business_train's row order, which differs and mislabelled businesses.
business_similarity_matrix_train = pd.DataFrame(business_similarity_train, columns=pivot_review_train.columns, index=pivot_review_train.columns)
business_similarity_matrix_train.head()
business_id | 58APdML-PG_OD4El2ePTvw | 8Rdz0VPY8CuT2GQZ7ho2sw | DAMTCTsSeACXbkSABkhZqQ | 6I6uDGwCDggrWXi2T4lfaA | qUdGBSFkiPhEL6I718y-Gg | ujcbqs6jZfaESgSLvbjWuQ | XjbPr3o-YTsticeavLjTEg | Y22IfhXChXoRp3vKi6QwaQ | MhINNBBwzGn4-n_YI67wog | OLg1IeS-QxZgNprQ4Hg9gg | ... | LLBmqBunk40IHdHH_QfjkA | -ZHeHh4bwLlecbcAD7fTqw | SnD7fcwR4NR7Cgtx7Qm4ZQ | ml7HQlaAcszdBZZHljvYgg | Y5I-z2S3Eeno6cDyn0e6Cg | ODZLMTbjCnpDNkW1JbMjlQ | kWDAdT4m3vbnmE0CgLs4gA | rofWaZTIuaedAxT_UKleSw | bYfEp3NMskYfEzWL8tVb4w | HzUxQ1WpeNmeecXN-HPlPw |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
business_id | |||||||||||||||||||||
58APdML-PG_OD4El2ePTvw | 1.000000 | 0.271844 | 0.079825 | 0.428932 | 0.314548 | 0.304685 | 0.176533 | 0.329332 | 0.320960 | 0.537847 | ... | 0.283587 | 0.349941 | 0.018316 | 0.433927 | 0.185373 | 0.038287 | -0.286370 | -0.044779 | 0.350037 | 0.162044 |
8Rdz0VPY8CuT2GQZ7ho2sw | 0.271844 | 1.000000 | -0.060567 | 0.191821 | 0.296899 | 0.099059 | -0.033960 | 0.052488 | 0.032706 | 0.248680 | ... | 0.210382 | 0.143372 | -0.075271 | 0.367880 | 0.214407 | 0.050381 | -0.001927 | 0.117788 | 0.357570 | 0.531283 |
DAMTCTsSeACXbkSABkhZqQ | 0.079825 | -0.060567 | 1.000000 | 0.344558 | 0.041525 | 0.080128 | -0.008973 | -0.141186 | -0.304736 | 0.151596 | ... | 0.302561 | 0.119427 | 0.007250 | -0.082208 | -0.231451 | -0.235235 | 0.129771 | 0.020775 | -0.147551 | 0.295134 |
6I6uDGwCDggrWXi2T4lfaA | 0.428932 | 0.191821 | 0.344558 | 1.000000 | 0.566044 | 0.147380 | 0.233676 | 0.181992 | 0.093687 | 0.269127 | ... | 0.281470 | 0.342534 | 0.178220 | 0.417244 | -0.037037 | -0.043609 | 0.100768 | -0.097680 | 0.602414 | 0.637140 |
qUdGBSFkiPhEL6I718y-Gg | 0.314548 | 0.296899 | 0.041525 | 0.566044 | 1.000000 | -0.030378 | 0.244858 | 0.121635 | 0.357527 | 0.103237 | ... | 0.363627 | 0.323686 | 0.165245 | 0.420615 | -0.010195 | -0.153927 | -0.098615 | 0.079447 | 0.414936 | 0.419869 |
5 rows × 2429 columns
# business_similarity_test is computed from pivot_review_test.T, so its rows
# follow the pivot's column (business_id) order; label with it instead of
# df_business_test's row order, which differs and mislabelled businesses.
business_similarity_matrix_test = pd.DataFrame(business_similarity_test, columns=pivot_review_test.columns, index=pivot_review_test.columns)
business_similarity_matrix_test.head()
business_id | 58APdML-PG_OD4El2ePTvw | DAMTCTsSeACXbkSABkhZqQ | 6I6uDGwCDggrWXi2T4lfaA | qUdGBSFkiPhEL6I718y-Gg | ujcbqs6jZfaESgSLvbjWuQ | Y22IfhXChXoRp3vKi6QwaQ | MhINNBBwzGn4-n_YI67wog | OLg1IeS-QxZgNprQ4Hg9gg | DwJlGxAJvohbDR_5jV-ERA | i5j3FrxdR224KIjfv8x2CQ | ... | 3uu5jvP5JKdSUW9jk-HO7A | Akhq4AKxKRDPa6BHpiSEVQ | LLBmqBunk40IHdHH_QfjkA | -ZHeHh4bwLlecbcAD7fTqw | SnD7fcwR4NR7Cgtx7Qm4ZQ | ml7HQlaAcszdBZZHljvYgg | Y5I-z2S3Eeno6cDyn0e6Cg | rofWaZTIuaedAxT_UKleSw | bYfEp3NMskYfEzWL8tVb4w | HzUxQ1WpeNmeecXN-HPlPw |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
business_id | |||||||||||||||||||||
58APdML-PG_OD4El2ePTvw | 1.000000 | -0.023429 | 0.234868 | 0.231090 | 0.091581 | 0.490803 | 0.710535 | 0.365032 | 0.206606 | 0.239764 | ... | -0.105276 | 0.373045 | 0.028260 | 0.139765 | 0.027663 | 0.213343 | 0.297128 | 0.192612 | 0.847896 | 0.792960 |
DAMTCTsSeACXbkSABkhZqQ | -0.023429 | 1.000000 | 0.013408 | -0.056256 | 0.099845 | -0.113187 | -0.060088 | 0.582035 | 0.032511 | -0.043455 | ... | -0.017552 | -0.115553 | 0.365847 | -0.009046 | 0.496379 | 0.022771 | -0.121338 | -0.001104 | -0.041568 | -0.024040 |
6I6uDGwCDggrWXi2T4lfaA | 0.234868 | 0.013408 | 1.000000 | 0.023901 | -0.007454 | 0.240044 | 0.315998 | 0.516830 | -0.185376 | 0.057671 | ... | -0.017692 | 0.178879 | 0.080482 | -0.038698 | 0.006968 | 0.828757 | 0.000083 | -0.022214 | 0.342843 | 0.480901 |
qUdGBSFkiPhEL6I718y-Gg | 0.231090 | -0.056256 | 0.023901 | 1.000000 | 0.036277 | 0.230125 | 0.330451 | 0.077361 | 0.199359 | 0.448048 | ... | -0.061041 | 0.359701 | 0.173053 | 0.042803 | 0.300711 | 0.026037 | 0.238533 | -0.153242 | 0.378002 | 0.386268 |
ujcbqs6jZfaESgSLvbjWuQ | 0.091581 | 0.099845 | -0.007454 | 0.036277 | 1.000000 | 0.085431 | 0.067322 | 0.350011 | -0.409916 | 0.101035 | ... | 0.103053 | 0.330126 | 0.238725 | 0.149777 | -0.080113 | -0.096041 | 0.187941 | -0.123287 | 0.186262 | 0.152023 |
5 rows × 2070 columns
# --- Training split -------------------------------------------------------
# Attach user attributes and then business attributes to every review.
df1 = df_user_train.merge(df_review_train, on='user_id')
df_train_total = df1.merge(df_business_train, on='business_id')

# --- Test split -----------------------------------------------------------
df2 = df_user_test.merge(df_review_test, on='user_id')
df_test_total = df2.merge(df_business_test, on='business_id')

# User x business grids of the user's average_stars; pairs without a review
# become 0.
pivot_user_train = df_train_total.pivot(
    index='user_id', columns='business_id', values='average_stars'
).fillna(0)
pivot_user_test = df_test_total.pivot(
    index='user_id', columns='business_id', values='average_stars'
).fillna(0)

# Same grids for 'stars_y', the suffixed 'stars' column produced by the
# second merge.
pivot_business_train = df_train_total.pivot(
    index='user_id', columns='business_id', values='stars_y'
).fillna(0)
pivot_business_test = df_test_total.pivot(
    index='user_id', columns='business_id', values='stars_y'
).fillna(0)

# Mean star rating over all training reviews; every non-zero cell of the
# user grid is overwritten with it.
items_train = len(df_review_train)
total_train = df_review_train['stars'].sum()
global_mean_train = total_train / items_train
print(global_mean_train)
pivot_user_train[pivot_user_train != 0] = global_mean_train

# Same for the test reviews.
items_test = len(df_review_test)
total_test = df_review_test['stars'].sum()
global_mean_test = total_test / items_test
print(global_mean_test)
pivot_user_test[pivot_user_test != 0] = global_mean_test
3.820671634568857
3.8194511314395765
def predict(ratings, similarity, type='user'):
    """Predict ratings as a similarity-weighted combination of known ratings.

    Args:
        ratings: Array-like rating matrix with users as rows and items as
            columns.
        similarity: Square array-like of similarity weights: user x user for
            ``type='user'``, item x item for ``type='item'``.
        type: ``'user'`` combines the other users' deviations from their own
            row mean and adds the target user's mean back; ``'item'``
            combines the ratings across items.

    Returns:
        ndarray of predicted ratings with the same shape as ``ratings``.

    Raises:
        ValueError: If ``type`` is neither ``'user'`` nor ``'item'``.
    """
    # Work on ndarrays so the [:, np.newaxis] indexing below also works when
    # pandas objects are passed in.
    ratings = np.asarray(ratings, dtype=float)
    similarity = np.asarray(similarity, dtype=float)

    if type == 'user':
        # Column vector so it broadcasts against every column of ratings.
        mean_user_rating = ratings.mean(axis=1)[:, np.newaxis]
        ratings_diff = ratings - mean_user_rating
        # Normalise each user's row by the total absolute similarity weight.
        weight_sum = np.abs(similarity).sum(axis=1)[:, np.newaxis]
        return mean_user_rating + similarity.dot(ratings_diff) / weight_sum

    if type == 'item':
        # Row vector: one normaliser per item column.
        weight_sum = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        return ratings.dot(similarity) / weight_sum

    # Previously an unknown type fell through to an UnboundLocalError.
    raise ValueError("type must be 'user' or 'item', got {!r}".format(type))
# SVD reconstruction (element [1]: without the added per-user average) of the
# training ratings.
train_data_matrix = user_svd_predict(pivot_review_train, df_user_train)[1]
# pairwise_distances returns cosine *distance*; predict() expects similarity
# weights, so convert with 1 - distance (as the earlier similarity cells do).
item_similarity = 1 - pairwise_distances(train_data_matrix.T, metric='cosine')
user_similarity = 1 - pairwise_distances(train_data_matrix, metric='cosine')
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

# Same pipeline for the test split.
test_data_matrix = user_svd_predict(pivot_review_test, df_user_test)[1]
item_similarity_test = 1 - pairwise_distances(test_data_matrix.T, metric='cosine')
user_similarity_test = 1 - pairwise_distances(test_data_matrix, metric='cosine')
item_prediction_test = predict(test_data_matrix, item_similarity_test, type='item')
user_prediction_test = predict(test_data_matrix, user_similarity_test, type='user')

# Plain ndarrays of the pivot tables. DataFrame.as_matrix() was removed in
# pandas 1.0; .values is available on every pandas version.
pivot_train = pivot_review_train.values
pivot_test = pivot_review_test.values
pivot_user_train_ = pivot_user_train.values
pivot_user_test_ = pivot_user_test.values
pivot_business_train_ = pivot_business_train.values
pivot_business_test_ = pivot_business_test.values

# Combined prediction: item-based + global-mean grid + user-based.
pivot_pred_train = np.add(item_prediction, pivot_user_train_)
pivot_pred_train = np.add(pivot_pred_train, user_prediction)
pivot_pred_test = np.add(item_prediction_test, pivot_user_test_)
pivot_pred_test = np.add(pivot_pred_test, user_prediction_test)

# cf_rmse(prediction, ground_truth) masks on the non-zero ground truth, so
# the observed ratings must be the second argument (they were swapped).
# NOTE: scores recorded from earlier runs predate these fixes - re-run to
# refresh them.
print("CF Train RMSE score is {}".format(cf_rmse(pivot_pred_train, pivot_train)))
print("CF Test RMSE score is {}".format(cf_rmse(pivot_pred_test, pivot_test)))
CF Train RMSE score is 0.08932886240815875
CF Test RMSE score is 0.05339812124201512