#Set up workspace
import numpy as np
import pandas as pd
from collections import Counter
from datetime import datetime
import os
import json
#Reading in dataframes:
users = pd.read_json("user.json",lines=True) #users
businesses = pd.read_json("business.json",lines=True) #businesses
with open('review.json') as json_file: #reviews
reviews = json_file.readlines()
reviews = list(map(json.loads, reviews))
pd.DataFrame(reviews)
reviews_df=pd.DataFrame(reviews,columns=['business_id','cool','date','funny','review_id','stars','text','useful','user_id'])
#Formatting users dataset
users['elite'] = [x if len(x)>0 else 'NaN' for x in users['elite']]
elite_status = [] #Creating a new elite status column
for i in users['elite']:
if i=='NaN':
elite_status.append('No')
else:
elite_status.append('Yes')
users = users.assign(elite_status = elite_status)
#Formatting datetime
users['yelping_since'] = [datetime.strptime(i, '%Y-%m-%d') for i in users['yelping_since']]
#Filtering businesses dataset for just restaurants
new_column = []
for j in businesses['categories']:
if j.count('Restaurants')>0:
new_column.append('Yes')
else:
new_column.append('No')
businesses['Restaurant_Status']=new_column
businesses = businesses[businesses['Restaurant_Status']=='Yes']
#Formula to define dataframes filtered by city
def choose_city(city,review_count=5):
city_businesses = businesses[businesses['city']==city]
city_businesses = city_businesses[city_businesses['review_count']>(review_count - 1)]
#Subsetting reviews dataset
business_list = list(set(city_businesses['business_id']))
city_reviews = reviews_df[reviews_df['business_id'].isin(business_list)]
keep_bids = []
for bid, df in city_reviews.groupby('business_id'):
if df.shape[0]>=review_count:
keep_bids.append(bid)
city_reviews = city_reviews[city_reviews['business_id'].isin(keep_bids)]
#users
city_user_list = list(set(city_reviews['user_id']))
city_users = users[users['user_id'].isin(city_user_list)]
city_user_list = list(set(city_users['user_id']))
city_reviews = city_reviews[city_reviews['user_id'].isin(city_user_list)]
#Saving each file to .csv
city_reviews.to_json(str(city)+'_reviews.json')
city_users.to_json(str(city)+'_users.json')
city_businesses.to_json(str(city)+'_businesses.json')
return {'businesses':city_businesses, 'users':city_users ,'reviews':city_reviews }
Next, create an ordered list of cities by number of reviews. From this ordered list we’ll select cities with a moderate but not excessive number of reviews to be our sample set. Then we’ll save the dataframes for each of these cities to .csv’s so that we can just read these in in future, without having to do the entire set of steps above.
#count cities
cities = list(set(businesses['city']))
cities = list(filter(None, cities))
business_counts = []
for i in cities:
businesses_list = list(businesses['city'])
business_counts.append(businesses_list.count(i))
cities_df = pd.DataFrame({
'Cities': cities,
'Businesses': business_counts})
cities_df = cities_df.sort_values(by=['Businesses'],ascending=False)
medium_cities = list(cities_df['Cities'][25:50])
medium_cities
#looping through to create many different cities dfs:
cities_dfs = []
for i in ['Montreal']:
cities_dfs.append(choose_city(i))
dfs_dict = dict(zip(medium_cities,cities_dfs))