Collaborative Filtering

Contents

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegressionCV
import seaborn as sns
%matplotlib inline
from sklearn.metrics import mean_squared_error
from math import sqrt
def cf_rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))
df_review_train = pd.read_csv("Montreal_review_train.csv")
df_review_test = pd.read_csv("Montreal_review_test.csv")
df_user_train = pd.read_csv("Montreal_user_train.csv")
df_user_test = pd.read_csv("Montreal_user_test.csv")
df_business_train = pd.read_csv("Montreal_business_train.csv")
df_business_test = pd.read_csv("Montreal_business_test.csv")
df_user_train.head()
Unnamed: 0 Unnamed: 0.1 average_stars compliment_cool compliment_cute compliment_funny compliment_hot compliment_list compliment_more compliment_note ... elite fans friends funny name review_count useful user_id yelping_since elite_status
0 0 10 4.10 1302 41 1302 806 20 46 725 ... [2016, 2014, 2015, 2017, 2012, 2011, 2013] 435 ['xRYvFaMGWsvKcLCFtRIzWQ', 'zvQ7B3KZuFOX7pYLsO... 4880 Risa 1122 26395 Wc5L6iuvSNF5WGBlqIO8nw 2011-07-30 Yes
1 1 13 3.79 1139 87 1139 782 54 103 391 ... [2012, 2008, 2009, 2010, 2007, 2006, 2013, 2011] 198 ['KOwp5RDbm7cDyrdXN8FVQQ', '7MlH7OevWSkenMyKFI... 10715 Holly 698 24047 Dd-TkEszFMkSF-vRih51fQ 2006-07-03 Yes
2 2 37 3.74 129 2 129 77 6 12 56 ... [2017, 2016, 2012, 2014, 2015, 2011, 2013] 68 ['Cq8uhBLRO1T9l-9R9OmddQ', 'x3_b9Rv-GZpjtCDLqg... 105 Jeff 754 151 YTdNcIWAt2nEzZ7NY-fniw 2011-05-16 Yes
3 3 110 4.07 60 1 60 51 1 16 19 ... [2014, 2012, 2015, 2011, 2013, 2016, 2017] 33 ['8s7UH21vFgkRJAJg2L8VzA', 'HWGrt1MEXlzZ71NGx0... 9 Cecille 356 36 bTRFge5pRWMh7IoCLn7lBw 2007-08-03 Yes
4 4 117 3.64 23 2 23 31 0 3 13 ... [2012, 2013] 15 ['G-Hav6XBWPEyzI-0nNpdxw', 'EgqsK7MUgqpbaTVZAv... 36 Carolina 115 89 -w7ww3yW5BHE3TFyj3IHuQ 2010-06-29 Yes

5 rows × 25 columns

print(df_review_train.shape)
print(len(df_review_train['user_id'].unique()))
print(len(df_review_train['business_id'].unique()))

print(df_user_train.shape)
print(len(df_user_train['user_id'].unique()))

print(df_business_train.shape)
print(len(df_business_train['business_id'].unique()))
(30999, 11)
3201
2429
(3201, 25)
3201
(2429, 18)
2429
df_review_train.shape
(30999, 11)
pivot_review_train = df_review_train.pivot(index = 'user_id', columns ='business_id', values = 'stars').fillna(0)
pivot_review_test = df_review_test.pivot(index = 'user_id', columns ='business_id', values = 'stars').fillna(0)
pivot_review_train.shape
(3201, 2429)
from scipy.sparse.linalg import svds
def user_svd_predict(df, df_):
    R = df.as_matrix()
    user_ratings_mean = np.mean(R, axis = 1)
    R_demeaned = R - user_ratings_mean.reshape(-1, 1)
    
    U = []
    sigma = []
    Vt = []
    U, sigma, Vt = svds(R_demeaned, k=20)
    sigma = np.diag(sigma)
    all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
    average_rating = list(df_['average_stars'])
    average_rating = np.array(average_rating).reshape(-1, 1)
    average_rating = np.repeat(average_rating, all_user_predicted_ratings.shape[1], axis=1)
    all_user_predicted_ratings_total = all_user_predicted_ratings + average_rating
    return (all_user_predicted_ratings_total, all_user_predicted_ratings)
def business_svd_predict(df, df_):
    R = df.as_matrix()
    user_ratings_mean = np.mean(R, axis = 1)
    R_demeaned = R - user_ratings_mean.reshape(-1, 1)
    
    U = []
    sigma = []
    Vt = []
    U, sigma, Vt = svds(R_demeaned, k=20)
    sigma = np.diag(sigma)
    all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
    average_rating = list(df_['stars'])
    average_rating = np.array(average_rating).reshape(-1, 1)
    average_rating = np.repeat(average_rating, all_user_predicted_ratings.shape[1], axis=1)
    all_user_predicted_ratings_total = all_user_predicted_ratings + average_rating
    return (all_user_predicted_ratings_total, all_user_predicted_ratings)
all_user_predicted_ratings_total = user_svd_predict(pivot_review_train, df_user_train)[0]
all_user_predicted_ratings_total_test = user_svd_predict(pivot_review_test, df_user_test)[0]
all_business_predicted_ratings_total = business_svd_predict(pivot_review_train.T, df_business_train)[0]
all_business_predicted_ratings_total_test = business_svd_predict(pivot_review_test.T, df_business_test)[0]
pivot_review_train.shape
(3201, 2429)
df_user_train.shape
(3201, 25)
user_id_train = df_user_train['user_id']
user_id_test = df_user_test['user_id']

business_id_train = df_business_train['business_id']
business_id_test = df_business_test['business_id']
preds_df_train = pd.DataFrame(all_user_predicted_ratings_total, columns=pivot_review_train.columns, index=user_id_train)
preds_df_train.head()
business_id -0uEqc2vw1xXtuI_r1xTNg -1xuC540Nycht_iWFeJ-dw -7bRnaHp7OHz8KW-THqP4w -92cC6-X87HQ1DE1UHOx3w -AgfhwHOYrsPKt-_xV_Ipg -BPHhtX6zzI59IX7ZY-AQA -FDkvLmwaBrtVgYFqEWeWA -FPc3kwUU9GTDd4LzurvTQ -GHqz1jGYzAtn27CeHeWeA -HsqnPAz374YSoyFDyjl3A ... zqV3T9HltH1pmlRFJJSFcA zr2wA55AskfBJxrvUeDZRA zrnP9HqoF-RI9jqoW8pytA zsMMlOYtXm8SNy0bl1leBA zsbsLCO-bw3gdNE9XNgBYw zv92BYJH09YjFQOtSyYp-A zwBEMcCVqh8wOXn_sOIfxg zwgVuZcMgijt9k3Jq-2zQQ zwkif4XLEDqdEwEgTWLIVQ zzjKekzQ6i4iR-qpo405Pw
user_id
Wc5L6iuvSNF5WGBlqIO8nw 4.111058 3.765381 4.079379 4.128996 4.074044 4.113905 4.090428 4.105445 4.112865 4.236941 ... 4.072490 4.138484 4.098128 4.177040 4.119881 4.092555 4.095933 4.094899 4.219485 4.085823
Dd-TkEszFMkSF-vRih51fQ 3.792130 3.907054 3.789752 3.762797 3.788838 3.810440 3.798067 3.790774 3.802109 3.839521 ... 3.788810 3.806899 3.794853 3.772085 3.803739 3.794808 3.817778 3.795181 3.762455 3.772413
YTdNcIWAt2nEzZ7NY-fniw 3.750744 3.878034 3.753158 3.769249 3.752987 3.731422 3.747570 3.749814 3.742257 3.729388 ... 3.763453 3.765689 3.747911 3.749756 3.746374 3.748165 3.732125 3.749004 3.779889 3.762268
bTRFge5pRWMh7IoCLn7lBw 4.115649 4.236376 4.113031 4.187895 4.109432 4.201939 4.100459 4.078761 4.103681 4.439442 ... 4.131304 4.116208 4.090716 4.198155 4.069027 4.081443 4.195563 4.100826 3.985273 4.088402
-w7ww3yW5BHE3TFyj3IHuQ 3.639676 3.640164 3.643093 3.701733 3.621803 3.658139 3.634003 3.638492 3.636034 3.699485 ... 3.646045 3.635107 3.640433 3.654267 3.620071 3.637269 3.671494 3.637200 3.682764 3.668489

5 rows × 2429 columns

preds_df_test = pd.DataFrame(all_user_predicted_ratings_total_test, columns=pivot_review_test.columns, index=user_id_test)
preds_df_test.head()
business_id -0uEqc2vw1xXtuI_r1xTNg -1xuC540Nycht_iWFeJ-dw -7bRnaHp7OHz8KW-THqP4w -92cC6-X87HQ1DE1UHOx3w -AgfhwHOYrsPKt-_xV_Ipg -FDkvLmwaBrtVgYFqEWeWA -FPc3kwUU9GTDd4LzurvTQ -HsqnPAz374YSoyFDyjl3A -MwaICRwxaUi0JBfad2Y3Q -Mz3M0g6iFZczs6a7ddf5g ... zktCQRlDtF6XmOpqKBz1mA zmQyE-gIUpwBCMmTFFRbJw zpw5S3QwUse1MH-Eerbnaw zqV3T9HltH1pmlRFJJSFcA zr2wA55AskfBJxrvUeDZRA zrnP9HqoF-RI9jqoW8pytA zsMMlOYtXm8SNy0bl1leBA zwBEMcCVqh8wOXn_sOIfxg zwgVuZcMgijt9k3Jq-2zQQ zwkif4XLEDqdEwEgTWLIVQ
user_id
Wc5L6iuvSNF5WGBlqIO8nw 4.102477 4.249559 4.102253 4.101849 4.100737 4.102513 4.102370 4.102075 4.101559 4.102597 ... 4.100987 4.100863 4.102219 4.098799 4.105492 4.101588 4.101210 4.098801 4.102365 4.101979
Dd-TkEszFMkSF-vRih51fQ 3.790618 3.788917 3.790619 3.790514 3.790607 3.790589 3.790622 3.790513 3.790574 3.790484 ... 3.790574 3.790609 3.790513 3.790247 3.790299 3.790607 3.790486 3.790552 3.790628 3.790606
YTdNcIWAt2nEzZ7NY-fniw 3.742174 3.718317 3.742233 3.742351 3.742031 3.742384 3.742243 3.740605 3.741719 3.742942 ... 3.740200 3.742232 3.743492 3.739589 3.745336 3.742487 3.742482 3.741877 3.742250 3.742319
bTRFge5pRWMh7IoCLn7lBw 4.077589 4.333084 4.075519 4.064777 4.078651 4.082149 4.076959 4.093477 4.095684 4.084624 ... 4.072748 4.078266 4.075081 4.121961 4.046767 4.065211 4.068705 4.088010 4.077159 4.076365
-w7ww3yW5BHE3TFyj3IHuQ 3.642578 3.688117 3.642397 3.645565 3.641951 3.642059 3.642840 3.642970 3.649755 3.640657 ... 3.644327 3.643082 3.643405 3.664063 3.646322 3.641457 3.644397 3.642431 3.642516 3.642415

5 rows × 2070 columns

def simple_recommend(user_id, count):
    df = preds_df_train.T[user_id].sort_values(ascending=False)
    return df.head(count)
simple_recommend('yML2P1evj7FrLncIgaFzHw', 10)
business_id
kKY726bQREexYHHNLK1H7g    4.632830
IRIlwpomRvnXvpkeaGaM2A    4.390620
mm2wLW24ESxNIEL2bjseaQ    4.201337
um_o0pxQ3DlRI9EfCzw0hw    4.194907
2gUbgbdJ7IFSbicBXlSchw    4.142699
y32M2Hkr7GsUqGG6KwOhZw    4.079658
58APdML-PG_OD4El2ePTvw    4.057295
FhgAHo-8--equM8w5UZ41Q    4.016346
JN8s_dgw9nrSzkHnXxNOtg    3.909097
s2I_Ni76bjJNK9yG60iD-Q    3.902061
Name: yML2P1evj7FrLncIgaFzHw, dtype: float64
from sklearn.metrics.pairwise import pairwise_distances
user_similarity_train = 1 - pairwise_distances(user_svd_predict(pivot_review_train, df_user_train)[1], metric='cosine')
user_similarity_test = 1 - pairwise_distances(user_svd_predict(pivot_review_test, df_user_test)[1], metric='cosine')
business_similarity_train = 1 - pairwise_distances(business_svd_predict(pivot_review_train.T, df_business_train)[1], metric='cosine')
business_similarity_test = 1 - pairwise_distances(business_svd_predict(pivot_review_test.T, df_business_test)[1], metric='cosine')
user_similarity_matrix_train = pd.DataFrame(user_similarity_train, columns=user_id_train, index=user_id_train)
user_similarity_matrix_train.head()
user_id Wc5L6iuvSNF5WGBlqIO8nw Dd-TkEszFMkSF-vRih51fQ YTdNcIWAt2nEzZ7NY-fniw bTRFge5pRWMh7IoCLn7lBw -w7ww3yW5BHE3TFyj3IHuQ 4hAauH0dy57uK9o8bCvGUw VMfwMYh8iJapW807Pu1Diw lKRbcLWDQmOmhcMa3vMCMA 2vJ2e51kdbdAmAo_HTr4KQ 9KpMzih4E_gEioFtNeuIIw ... v7q2D8s1vsglwQaQcyb8_A hOYNnE3qzb8TDKd3jqvq7Q LqywrHdM-H8gSdKtGrhBuw iIIbkFd_kgK3n2ewvLstXA KJIS0INMJKhBmGqFkHMc-A Ih3dwaCS1snsbhS8vRdxHA LY-KaOJyXzbwZyqjQfl7xA e3XuTKzX3w8LP-mEqQgJ9g awdAcl2dA_WvUPWKOCS1OA 0wXvG8Jiu8zdZhvezBgOwA
user_id
Wc5L6iuvSNF5WGBlqIO8nw 1.000000 0.123739 -0.033120 0.288995 0.581747 0.137748 0.093395 0.367794 0.107102 -0.062889 ... 0.317822 0.267291 0.089310 0.074979 0.187363 -0.049799 0.018654 0.126737 0.120642 0.096226
Dd-TkEszFMkSF-vRih51fQ 0.123739 1.000000 0.082072 0.350151 0.107605 0.043687 0.261767 -0.083328 -0.021795 0.186220 ... 0.108294 0.242365 0.253467 0.053828 0.244121 -0.036538 0.265373 0.397636 0.114079 -0.098947
YTdNcIWAt2nEzZ7NY-fniw -0.033120 0.082072 1.000000 0.057411 0.078848 0.348380 0.474439 0.275562 0.461942 0.127301 ... -0.010050 0.215982 -0.109354 0.237667 0.423541 0.186048 0.425421 0.331947 0.043822 0.423622
bTRFge5pRWMh7IoCLn7lBw 0.288995 0.350151 0.057411 1.000000 0.378264 0.021736 0.323239 0.408859 0.279707 0.094915 ... 0.140430 -0.173267 0.514292 0.148501 0.373318 0.159174 0.036335 0.105005 0.271699 0.127442
-w7ww3yW5BHE3TFyj3IHuQ 0.581747 0.107605 0.078848 0.378264 1.000000 -0.207281 0.115763 0.133768 0.536527 0.002525 ... 0.090438 -0.006708 0.176181 -0.010482 0.351390 -0.071426 0.141043 0.097905 0.579939 -0.221361

5 rows × 3201 columns

user_similarity_matrix_test = pd.DataFrame(user_similarity_test, columns=user_id_test, index=user_id_test)
user_similarity_matrix_test.head()
user_id Wc5L6iuvSNF5WGBlqIO8nw Dd-TkEszFMkSF-vRih51fQ YTdNcIWAt2nEzZ7NY-fniw bTRFge5pRWMh7IoCLn7lBw -w7ww3yW5BHE3TFyj3IHuQ 4hAauH0dy57uK9o8bCvGUw VMfwMYh8iJapW807Pu1Diw lKRbcLWDQmOmhcMa3vMCMA 2vJ2e51kdbdAmAo_HTr4KQ 9KpMzih4E_gEioFtNeuIIw ... v7q2D8s1vsglwQaQcyb8_A hOYNnE3qzb8TDKd3jqvq7Q LqywrHdM-H8gSdKtGrhBuw iIIbkFd_kgK3n2ewvLstXA KJIS0INMJKhBmGqFkHMc-A Ih3dwaCS1snsbhS8vRdxHA LY-KaOJyXzbwZyqjQfl7xA e3XuTKzX3w8LP-mEqQgJ9g awdAcl2dA_WvUPWKOCS1OA 0wXvG8Jiu8zdZhvezBgOwA
user_id
Wc5L6iuvSNF5WGBlqIO8nw 1.000000 0.273028 0.029052 0.184856 0.024051 0.461267 0.180832 -0.215365 -0.025200 0.160955 ... 0.234125 -0.125812 0.262517 0.101626 0.191844 0.252877 -0.068769 0.252614 -0.127824 0.112969
Dd-TkEszFMkSF-vRih51fQ 0.273028 1.000000 0.756588 0.152833 0.264246 0.858565 0.570801 0.283083 -0.025041 0.529752 ... 0.934899 0.572209 0.580154 0.787125 0.327144 -0.019333 0.120580 0.962273 0.103892 0.289312
YTdNcIWAt2nEzZ7NY-fniw 0.029052 0.756588 1.000000 0.144151 0.128624 0.711422 0.576731 0.552788 -0.020392 0.799825 ... 0.787761 0.504840 0.515459 0.643395 0.443141 0.199837 0.492037 0.668754 0.193777 0.227147
bTRFge5pRWMh7IoCLn7lBw 0.184856 0.152833 0.144151 1.000000 0.342405 0.138795 0.421830 0.194041 0.101054 0.136212 ... 0.246640 0.014915 0.265498 0.147580 0.369932 0.225324 -0.008432 0.081523 -0.241318 -0.066618
-w7ww3yW5BHE3TFyj3IHuQ 0.024051 0.264246 0.128624 0.342405 1.000000 0.218998 0.527180 -0.051809 -0.016700 0.247150 ... 0.235919 0.186573 0.156075 0.305037 0.159912 -0.208084 -0.007545 0.276780 0.130077 0.028004

5 rows × 2800 columns

business_similarity_matrix_train = pd.DataFrame(business_similarity_train, columns=business_id_train, index=business_id_train)
business_similarity_matrix_train.head()
business_id 58APdML-PG_OD4El2ePTvw 8Rdz0VPY8CuT2GQZ7ho2sw DAMTCTsSeACXbkSABkhZqQ 6I6uDGwCDggrWXi2T4lfaA qUdGBSFkiPhEL6I718y-Gg ujcbqs6jZfaESgSLvbjWuQ XjbPr3o-YTsticeavLjTEg Y22IfhXChXoRp3vKi6QwaQ MhINNBBwzGn4-n_YI67wog OLg1IeS-QxZgNprQ4Hg9gg ... LLBmqBunk40IHdHH_QfjkA -ZHeHh4bwLlecbcAD7fTqw SnD7fcwR4NR7Cgtx7Qm4ZQ ml7HQlaAcszdBZZHljvYgg Y5I-z2S3Eeno6cDyn0e6Cg ODZLMTbjCnpDNkW1JbMjlQ kWDAdT4m3vbnmE0CgLs4gA rofWaZTIuaedAxT_UKleSw bYfEp3NMskYfEzWL8tVb4w HzUxQ1WpeNmeecXN-HPlPw
business_id
58APdML-PG_OD4El2ePTvw 1.000000 0.271844 0.079825 0.428932 0.314548 0.304685 0.176533 0.329332 0.320960 0.537847 ... 0.283587 0.349941 0.018316 0.433927 0.185373 0.038287 -0.286370 -0.044779 0.350037 0.162044
8Rdz0VPY8CuT2GQZ7ho2sw 0.271844 1.000000 -0.060567 0.191821 0.296899 0.099059 -0.033960 0.052488 0.032706 0.248680 ... 0.210382 0.143372 -0.075271 0.367880 0.214407 0.050381 -0.001927 0.117788 0.357570 0.531283
DAMTCTsSeACXbkSABkhZqQ 0.079825 -0.060567 1.000000 0.344558 0.041525 0.080128 -0.008973 -0.141186 -0.304736 0.151596 ... 0.302561 0.119427 0.007250 -0.082208 -0.231451 -0.235235 0.129771 0.020775 -0.147551 0.295134
6I6uDGwCDggrWXi2T4lfaA 0.428932 0.191821 0.344558 1.000000 0.566044 0.147380 0.233676 0.181992 0.093687 0.269127 ... 0.281470 0.342534 0.178220 0.417244 -0.037037 -0.043609 0.100768 -0.097680 0.602414 0.637140
qUdGBSFkiPhEL6I718y-Gg 0.314548 0.296899 0.041525 0.566044 1.000000 -0.030378 0.244858 0.121635 0.357527 0.103237 ... 0.363627 0.323686 0.165245 0.420615 -0.010195 -0.153927 -0.098615 0.079447 0.414936 0.419869

5 rows × 2429 columns

business_similarity_matrix_test = pd.DataFrame(business_similarity_test, columns=business_id_test, index=business_id_test)
business_similarity_matrix_test.head()
business_id 58APdML-PG_OD4El2ePTvw DAMTCTsSeACXbkSABkhZqQ 6I6uDGwCDggrWXi2T4lfaA qUdGBSFkiPhEL6I718y-Gg ujcbqs6jZfaESgSLvbjWuQ Y22IfhXChXoRp3vKi6QwaQ MhINNBBwzGn4-n_YI67wog OLg1IeS-QxZgNprQ4Hg9gg DwJlGxAJvohbDR_5jV-ERA i5j3FrxdR224KIjfv8x2CQ ... 3uu5jvP5JKdSUW9jk-HO7A Akhq4AKxKRDPa6BHpiSEVQ LLBmqBunk40IHdHH_QfjkA -ZHeHh4bwLlecbcAD7fTqw SnD7fcwR4NR7Cgtx7Qm4ZQ ml7HQlaAcszdBZZHljvYgg Y5I-z2S3Eeno6cDyn0e6Cg rofWaZTIuaedAxT_UKleSw bYfEp3NMskYfEzWL8tVb4w HzUxQ1WpeNmeecXN-HPlPw
business_id
58APdML-PG_OD4El2ePTvw 1.000000 -0.023429 0.234868 0.231090 0.091581 0.490803 0.710535 0.365032 0.206606 0.239764 ... -0.105276 0.373045 0.028260 0.139765 0.027663 0.213343 0.297128 0.192612 0.847896 0.792960
DAMTCTsSeACXbkSABkhZqQ -0.023429 1.000000 0.013408 -0.056256 0.099845 -0.113187 -0.060088 0.582035 0.032511 -0.043455 ... -0.017552 -0.115553 0.365847 -0.009046 0.496379 0.022771 -0.121338 -0.001104 -0.041568 -0.024040
6I6uDGwCDggrWXi2T4lfaA 0.234868 0.013408 1.000000 0.023901 -0.007454 0.240044 0.315998 0.516830 -0.185376 0.057671 ... -0.017692 0.178879 0.080482 -0.038698 0.006968 0.828757 0.000083 -0.022214 0.342843 0.480901
qUdGBSFkiPhEL6I718y-Gg 0.231090 -0.056256 0.023901 1.000000 0.036277 0.230125 0.330451 0.077361 0.199359 0.448048 ... -0.061041 0.359701 0.173053 0.042803 0.300711 0.026037 0.238533 -0.153242 0.378002 0.386268
ujcbqs6jZfaESgSLvbjWuQ 0.091581 0.099845 -0.007454 0.036277 1.000000 0.085431 0.067322 0.350011 -0.409916 0.101035 ... 0.103053 0.330126 0.238725 0.149777 -0.080113 -0.096041 0.187941 -0.123287 0.186262 0.152023

5 rows × 2070 columns

df1 = pd.merge(df_user_train, df_review_train, on='user_id')
df_train_total = pd.merge(df1, df_business_train, on='business_id')

df2 = pd.merge(df_user_test, df_review_test, on='user_id')
df_test_total = pd.merge(df2, df_business_test, on='business_id')
pivot_user_train = df_train_total.pivot(index = 'user_id', columns ='business_id', values = 'average_stars').fillna(0)
pivot_user_test = df_test_total.pivot(index = 'user_id', columns ='business_id', values = 'average_stars').fillna(0)
pivot_business_train = df_train_total.pivot(index = 'user_id', columns ='business_id', values = 'stars_y').fillna(0)
pivot_business_test = df_test_total.pivot(index = 'user_id', columns ='business_id', values = 'stars_y').fillna(0)
items_train = df_review_train.shape[0]
total_train = np.sum(df_review_train['stars'])
global_mean_train = total_train / items_train
print(global_mean_train)
pivot_user_train[pivot_user_train != 0] = global_mean_train

items_test = df_review_test.shape[0]
total_test = np.sum(df_review_test['stars'])
global_mean_test = total_test / items_test
print(global_mean_test)
pivot_user_test[pivot_user_test != 0] = global_mean_test
3.820671634568857
3.8194511314395765
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred
train_data_matrix = user_svd_predict(pivot_review_train, df_user_train)[1]
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

test_data_matrix = user_svd_predict(pivot_review_test, df_user_test)[1]
item_similarity_test = pairwise_distances(test_data_matrix.T, metric='cosine')
user_similarity_test = pairwise_distances(test_data_matrix, metric='cosine')
item_prediction_test = predict(test_data_matrix, item_similarity_test, type='item')
user_prediction_test = predict(test_data_matrix, user_similarity_test, type='user')
pivot_train = pivot_review_train.as_matrix()
pivot_test = pivot_review_test.as_matrix()
pivot_user_train_ = pivot_user_train.as_matrix()
pivot_user_test_ = pivot_user_test.as_matrix()
pivot_business_train_ = pivot_business_train.as_matrix()
pivot_business_test_ = pivot_business_test.as_matrix()
pivot_pred_train = np.add(item_prediction, pivot_user_train_)
pivot_pred_train = np.add(pivot_pred_train, user_prediction)

pivot_pred_test = np.add(item_prediction_test, pivot_user_test_)
pivot_pred_test = np.add(pivot_pred_test, user_prediction_test)
print("CF Train RMSE score is {}".format(cf_rmse(pivot_train, pivot_pred_train)))
print("CF Test RMSE score is {}".format(cf_rmse(pivot_test, pivot_pred_test)))
CF Train RMSE score is 0.08932886240815875
CF Test RMSE score is 0.05339812124201512