The objective is to identify and recommend to the client which neighbourhood in New York City would be the best choice for opening a restaurant.
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
#!conda install -c conda-forge folium
import os
import folium # map rendering library
# Matplotlib and associated plotting modules
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib as mp
import re
import csv
%matplotlib inline
print('Libraries imported.')
def geo_location(address):
    """Return the ``(latitude, longitude)`` of *address*.

    The address is resolved with geopy's ``Nominatim`` geocoder using the
    ``ny_explorer`` user agent.

    Args:
        address: Address or place name to look up.

    Returns:
        A ``(latitude, longitude)`` tuple read from the geocoder result.

    Raises:
        ValueError: If the geocoder finds no match for *address*.
    """
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    # geocode() yields None when nothing matches; report that explicitly
    # instead of failing on the attribute access below.
    if location is None:
        raise ValueError('Could not geocode address: {!r}'.format(address))
    return location.latitude, location.longitude
Define a function to interact with the Foursquare API and get the top 100 venues within a radius of 1000 metres of a given latitude and longitude.
def get_venues(lat, lng, radius=1000, limit=100):
    """Fetch venues around a coordinate from the Foursquare explore endpoint.

    Args:
        lat: Latitude of the search centre.
        lng: Longitude of the search centre.
        radius: Search radius in metres sent to the API (default 1000).
        limit: Maximum number of venues requested (default 100).

    Returns:
        A DataFrame with the columns ``ID``, ``Name`` and ``Category``, one
        row per returned venue that carries all three fields. Venues with a
        missing field or an empty category list are skipped.

    Raises:
        KeyError, IndexError: Propagated unchanged when the JSON reply does
            not contain ``response.groups[0].items``.
    """
    # NOTE(review): API credentials are embedded in source control. They can
    # be overridden through the environment; the literals are kept only as a
    # backward-compatible fallback and should be rotated and removed.
    # Foursquare enforces a daily call quota per credential pair.
    client_id = os.environ.get(
        'FOURSQUARE_CLIENT_ID',
        'SJVNROHFKAQ0CZ5ULIM0V5K5IXUPI2UV1FARMVENUHS3HB4F')
    client_secret = os.environ.get(
        'FOURSQUARE_CLIENT_SECRET',
        'W1H0JXEDQWHBWX1ZZYT2IXCRTWLGPSNQ2NOGQ5IA4ZKCQKEU')
    version = '20180605'  # Foursquare API version

    # Let requests build and escape the query string.
    params = {
        'client_id': client_id,
        'client_secret': client_secret,
        'v': version,
        'll': '{},{}'.format(lat, lng),
        'radius': radius,
        'limit': limit,
    }
    # A timeout keeps one stalled connection from blocking the whole run.
    results = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        params=params,
        timeout=30,
    ).json()
    venue_data = results['response']['groups'][0]['items']

    venue_details = []
    for row in venue_data:
        try:
            venue = row['venue']
            venue_details.append(
                [venue['id'], venue['name'], venue['categories'][0]['name']])
        except (KeyError, IndexError):
            # KeyError: a field is absent; IndexError: the category list is
            # empty. Either way the venue cannot be classified, so skip it.
            continue

    column_names = ['ID', 'Name', 'Category']
    return pd.DataFrame(venue_details, columns=column_names)
Define a function to get venue details such as like count, rating and tip count for a given venue id, to be used for ranking.
def get_venue_details(venue_id):
    """Fetch like count, rating and tip count for one Foursquare venue.

    Args:
        venue_id: Foursquare venue identifier placed in the request path.

    Returns:
        A DataFrame with the columns ``ID``, ``Name``, ``Likes``, ``Rating``
        and ``Tips``. It holds one row when the reply carries every field
        and no rows when any of them is missing.

    Raises:
        KeyError: When the JSON reply has no ``response.venue`` entry.
    """
    # NOTE(review): API credentials are embedded in source control. They can
    # be overridden through the environment; the literals are kept only as a
    # backward-compatible fallback and should be rotated and removed.
    # Foursquare enforces a daily call quota per credential pair.
    client_id = os.environ.get(
        'FOURSQUARE_CLIENT_ID',
        'FM32E0UU4KR1WU4VZIG1F5G1J2XFMKEGNF52UGPTSQ1J1CF1')
    client_secret = os.environ.get(
        'FOURSQUARE_CLIENT_SECRET',
        'R5SUZ4FBHNDVUMHP15ZKFIKI5VCQBLGPYSFGZY2LYQXMILX2')
    version = '20180605'  # Foursquare API version

    url = 'https://api.foursquare.com/v2/venues/{}'.format(venue_id)
    params = {
        'client_id': client_id,
        'client_secret': client_secret,
        'v': version,
    }
    # A timeout keeps one stalled connection from blocking the whole run.
    results = requests.get(url, params=params, timeout=30).json()
    venue_data = results['response']['venue']

    venue_details = []
    try:
        venue_details.append([
            venue_data['id'],
            venue_data['name'],
            venue_data['likes']['count'],
            venue_data['rating'],
            venue_data['tips']['count'],
        ])
    except KeyError:
        # Best effort: a venue lacking any of these fields yields an empty
        # frame, which the caller treats as "no data available".
        pass

    column_names = ['ID', 'Name', 'Likes', 'Rating', 'Tips']
    return pd.DataFrame(venue_details, columns=column_names)
Define a function to get the New York City data: boroughs and neighborhoods along with their latitude and longitude.
def get_new_york_data():
    """Download the New York neighbourhood dataset as a DataFrame.

    The JSON document is expected to hold a ``features`` list whose entries
    provide ``properties.borough``, ``properties.name`` and
    ``geometry.coordinates``.

    Returns:
        A DataFrame with the columns ``Borough``, ``Neighborhood``,
        ``Latitude`` and ``Longitude``, one row per feature.
    """
    url = 'https://cocl.us/new_york_dataset'
    # A timeout keeps a stalled download from hanging the notebook.
    resp = requests.get(url, timeout=30).json()
    # all data is present in features label
    features = resp['features']

    column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']
    # Collect plain rows and build the frame once: DataFrame.append copied
    # the whole frame per row and no longer exists in pandas 2.x.
    rows = []
    for data in features:
        properties = data['properties']
        coordinates = data['geometry']['coordinates']
        rows.append({
            'Borough': properties['borough'],
            'Neighborhood': properties['name'],
            # coordinates are stored as [longitude, latitude]
            'Latitude': coordinates[1],
            'Longitude': coordinates[0],
        })
    return pd.DataFrame(rows, columns=column_names)
# get new york data
# Download the borough/neighbourhood table once; later cells reuse it.
new_york_data=get_new_york_data()
# Preview the first rows, then check the (rows, columns) dimensions.
new_york_data.head()
new_york_data.shape
Based on the dataset, there are a total of 306 different Neighborhoods in New York to select from
# Bar chart: how many neighbourhoods each borough has in new_york_data.
import matplotlib.pyplot as plt

plt.style.use('ggplot')
plt.figure(figsize=(9, 5), dpi=80)

# Title and axis captions are set on the current axes before drawing.
plt.title('Number of Neighbourhood in NYC by Borough')
plt.xlabel('Borough', fontsize=15)
plt.ylabel('No.of Neighborhood', fontsize=15)

# Count neighbourhood names per borough and draw them as bars.
new_york_data.groupby('Borough')['Neighborhood'].count().plot.bar()

plt.legend()
plt.show()
Based on the data, Queens has the most neighborhoods of any borough in New York City, with 80. (This counts neighborhoods; it says nothing about population density.)
# Build the table of Indian restaurants found around every neighbourhood.
# One get_venues() call is made per neighbourhood, so this loop is bound by
# the API's call quota.
column_names = ['Borough', 'Neighborhood', 'ID', 'Name']
indian_rest_rows = []
total_neighborhoods = len(new_york_data)
for count, row in enumerate(new_york_data.values.tolist(), start=1):
    Borough, Neighborhood, Latitude, Longitude = row
    venues = get_venues(Latitude, Longitude)
    indian_resturants = venues[venues['Category'] == 'Indian Restaurant']
    print('(', count, '/', total_neighborhoods, ')', 'Indian Resturants in ' + Neighborhood + ', ' + Borough + ':' + str(len(indian_resturants)))
    # Each venue row is [ID, Name, Category]; the category is already known.
    for venue_id, venue_name, _category in indian_resturants.values.tolist():
        indian_rest_rows.append({
            'Borough': Borough,
            'Neighborhood': Neighborhood,
            'ID': venue_id,
            'Name': venue_name,
        })
# Build the frame once: DataFrame.append copied the whole frame per row and
# no longer exists in pandas 2.x.
indian_rest_ny = pd.DataFrame(indian_rest_rows, columns=column_names)
indian_rest_ny.head()
indian_rest_ny.shape
# Bar chart: number of Indian restaurants found per borough.
import matplotlib.pyplot as plt

plt.style.use('ggplot')
plt.figure(figsize=(9, 5), dpi=100)

# Title and axis captions are set on the current axes before drawing.
plt.title('Number of Indian Restaurants in NYC by Borough')
plt.xlabel('Borough', fontsize=15)
plt.ylabel('No.of Indian Restaurant', fontsize=15)

# Count restaurant IDs per borough and draw them as bars.
indian_rest_ny.groupby('Borough')['ID'].count().plot.bar()

plt.legend()
# Render the figure.
plt.show()
It is noted that Queens has the highest number of Indian Restaurants.
# Rows of the restaurant table that belong to the Floral Park neighbourhood.
indian_rest_ny.loc[indian_rest_ny['Neighborhood'] == 'Floral Park']

# Bar chart: the five neighbourhoods with the most Indian restaurants.
import matplotlib.pyplot as plt

plt.style.use('ggplot')
plt.figure(figsize=(9, 5), dpi=100)

# Title and axis captions are set on the current axes before drawing.
plt.title('Number of Indian Restaurants in NYC by Neighbourhood')
plt.xlabel('Neighborhood', fontsize=15)
plt.ylabel('No.of Indian Restaurants', fontsize=15)

# Count restaurant IDs per neighbourhood and keep the five largest counts.
indian_rest_ny.groupby('Neighborhood')['ID'].count().nlargest(5).plot.bar()

plt.legend()
# Render the figure.
plt.show()
Floral Park in Queens has the most Indian restaurants, with a total count of 11.
# Build the per-restaurant statistics table (likes, rating, tips).
# One get_venue_details() call is made per restaurant, so this loop is
# bound by the API's call quota.
column_names = ['Borough', 'Neighborhood', 'ID', 'Name', 'Likes', 'Rating', 'Tips']
stats_rows = []
total_restaurants = len(indian_rest_ny)
for count, row in enumerate(indian_rest_ny.values.tolist(), start=1):
    Borough, Neighborhood, ID, Name = row
    try:
        venue_details = get_venue_details(ID)
        print(venue_details)
        # An empty frame has no first row, which raises IndexError here.
        venue_id, name, likes, rating, tips = venue_details.values.tolist()[0]
    except (IndexError, KeyError):
        print('No data available for id=', ID)
        # we will assign 0 value for these resturants as they may have been
        # recently opened or details does not exist in FourSquare Database
        venue_id, name, likes, rating, tips = [0] * 5
    print('(', count, '/', total_restaurants, ')', 'processed')
    stats_rows.append({
        'Borough': Borough,
        'Neighborhood': Neighborhood,
        'ID': venue_id,
        'Name': name,
        'Likes': likes,
        'Rating': rating,
        'Tips': tips,
    })
# Build the frame once: DataFrame.append copied the whole frame per row and
# no longer exists in pandas 2.x.
indian_rest_stats_ny = pd.DataFrame(stats_rows, columns=column_names)
indian_rest_stats_ny.head()
indian_rest_stats_ny.shape
# For comparison: the statistics table has one row per restaurant found.
indian_rest_ny.shape
# Persist the statistics table, then read the file back to inspect what was
# written.
indian_rest_stats_ny.to_csv('indian_rest_stats_ny.csv', index=False)
indian_rest_stats_ny_csv = pd.read_csv('indian_rest_stats_ny.csv')
indian_rest_stats_ny_csv.shape
indian_rest_stats_ny_csv.head()

# Show column dtypes before and after casting the two count columns to
# float64.
indian_rest_stats_ny.info()
indian_rest_stats_ny = indian_rest_stats_ny.astype(
    {'Likes': 'float64', 'Tips': 'float64'}
)
indian_rest_stats_ny.info()
# idxmax() returns an index *label*, so the row is fetched with the
# label-based .loc rather than the positional .iloc; the two agree only
# while the frame keeps its default 0..n-1 index.

# Restaurant with maximum Likes
indian_rest_stats_ny.loc[indian_rest_stats_ny['Likes'].idxmax()]
# Restaurant with maximum Rating
indian_rest_stats_ny.loc[indian_rest_stats_ny['Rating'].idxmax()]
# Restaurant with maximum Tips
indian_rest_stats_ny.loc[indian_rest_stats_ny['Tips'].idxmax()]
# Average restaurant rating per neighbourhood. 'Rating' is selected before
# aggregating so mean() never touches the text columns (ID, Name, ...),
# which pandas 2.x refuses to average.
ny_neighborhood_stats = indian_rest_stats_ny.groupby('Neighborhood', as_index=False)['Rating'].mean()
ny_neighborhood_stats.columns = ['Neighborhood', 'Average Rating']
ny_neighborhood_stats.sort_values(['Average Rating'], ascending=False).head(10)

# Same aggregation at borough level.
ny_borough_stats = indian_rest_stats_ny.groupby('Borough', as_index=False)['Rating'].mean()
ny_borough_stats.columns = ['Borough', 'Average Rating']
ny_borough_stats.sort_values(['Average Rating'], ascending=False).head()
# Bar chart: average Indian-restaurant rating per borough.
plt.figure(figsize=(9, 5), dpi=100)
# title
plt.title('Average rating of Indian Restaurant in each NYC Borough')
# On x-axis
plt.xlabel('Borough', fontsize=15)
# On y-axis
plt.ylabel('Average Rating', fontsize=15)
# Select 'Rating' before aggregating so mean() never touches the text
# columns, which pandas 2.x refuses to average.
indian_rest_stats_ny.groupby('Borough')['Rating'].mean().plot(kind='bar')
# legend
plt.legend()
# displays the plot
plt.show()
Brooklyn has the highest average rating for Indian Restaurants.
# Keep only neighbourhoods whose average rating is at least 8.0.
ny_neighborhood_stats = ny_neighborhood_stats.loc[
    ny_neighborhood_stats['Average Rating'] >= 8.0
]
ny_neighborhood_stats

# Attach borough and coordinates from the neighbourhood table.
# NOTE(review): the join key is the neighbourhood name only; if the same
# name occurs in more than one borough of new_york_data this produces one
# row per match - confirm against the dataset.
ny_neighborhood_stats = ny_neighborhood_stats.merge(new_york_data, on='Neighborhood')
ny_neighborhood_stats = ny_neighborhood_stats[
    ['Borough', 'Neighborhood', 'Latitude', 'Longitude', 'Average Rating']
]
ny_neighborhood_stats.sort_values(by='Average Rating', ascending=False).head(10)
# Base map using the fixed centre coordinates below.
ny_map = folium.Map(location=(40.693943, -73.985880), zoom_start=12)

# Feature group that collects one circle per selected neighbourhood.
incidents = folium.FeatureGroup()

# Popup text: "<Neighborhood>, <Borough>(<Average Rating>)".
ny_neighborhood_stats['Label'] = (
    ny_neighborhood_stats['Neighborhood']
    + ', '
    + ny_neighborhood_stats['Borough']
    + '('
    + ny_neighborhood_stats['Average Rating'].map(str)
    + ')'
)

# For every neighbourhood: a circle goes into the feature group and a
# labelled marker goes straight onto the map.
for lat, lng, label in ny_neighborhood_stats[['Latitude', 'Longitude', 'Label']].values:
    circle = folium.CircleMarker(
        [lat, lng],
        radius=10,  # circle size
        color='yellow',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
    )
    incidents.add_child(circle)
    folium.Marker([lat, lng], popup=label).add_to(ny_map)

# Finally add the circle layer to the map.
ny_map.add_child(incidents)
Findings
Midtown or Tribeca in Manhattan would be the best choice to start a restaurant given that