%matplotlib inline
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
var = pd.read_csv('/content/Uber project dataset.csv')
var.head(10)
var.info()
print("Prints all the columns that are of numerical type")
var.select_dtypes(include=['number'])
print("Prints all the columns that are of non-numerical type")
var.select_dtypes(include=['object'])
Extracting date components - Day, Month, Year and Weekday - from the Start_date column
var['Start_date'] = pd.to_datetime(var['Start_date'])
var['Date'] = var['Start_date'].dt.day          # day of the month
var['Month'] = var['Start_date'].dt.month
var['Year'] = var['Start_date'].dt.year
var['Weekday'] = var['Start_date'].dt.day_name()
print(var)
var[var.isna().any(axis=1)]  # show rows containing any missing values
var.dropna(axis=0, how="any", inplace=True)  # drop every row with at least one missing value
var.shape
Creating a working copy as a new dataframe
df = var.copy()  # var is already a DataFrame; copy() gives an independent frame to work on
print(df)
df.nunique()
df.describe()
Findings about the parameters: Fare and Distance Travelled
plt.figure(figsize=(10,8))
plt.hist(df['Fare'], bins=50)
plt.xlabel('Fare')

plt.figure(figsize=(10,8))
plt.hist(df['KM'], bins=50)
plt.xlabel('Distance travelled (KM)')
From the above histograms, most fares fall in the 0-2000 range and most trips cover comparatively short distances (0-50 km). The histograms also suggest that the parameters "KM" and "Fare" are not normally distributed. To verify this statistically, we perform the following test:
Shapiro-Wilk test for normality
from scipy.stats import shapiro
norm = df['Fare']
stat, p = shapiro(norm)
print('stat=%.2f, p=%.30f' % (stat, p))
if p > 0.05:
    print("Fare data normally distributed")
else:
    print("Fare data not normally distributed")

norm = df['KM']
stat, p = shapiro(norm)
print('stat=%.2f, p=%.30f' % (stat, p))
if p > 0.05:
    print("KM data normally distributed")
else:
    print("KM data not normally distributed")
So, at a 95% confidence level, it is statistically confirmed that neither parameter is normally distributed.
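A Q-Q plot gives a quick visual confirmation of the same conclusion. A minimal sketch, using the scipy.stats import (as stats) from the top of the notebook:
plt.figure(figsize=(6,6))
stats.probplot(df['Fare'], dist="norm", plot=plt)  # points bending away from the line indicate non-normality
plt.title("Q-Q plot of Fare against a normal distribution")
plt.show()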
Outliers of fare and distance parameters
From the previous section, the parameters "KM" and "Fare" are not normally distributed, so a z-test cannot be used for outlier detection. Instead, we handle the outliers using the five-point data description (min, Q1, median, Q3, max) and the IQR rule.
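For reference, the five-point description itself can be computed directly. A minimal sketch with numpy:
# min, Q1, median, Q3, max for both parameters
for col in ["Fare", "KM"]:
    q = np.percentile(df[col], [0, 25, 50, 75, 100])
    print(col, dict(zip(["min", "Q1", "median", "Q3", "max"], np.round(q, 2))))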
First, we create box plots for both parameters to visualise the outliers.
plt.figure(figsize=(10,6))
sns.boxplot(x=df["Fare"])
plt.figure(figsize=(10,6))
sns.boxplot(x=df["KM"])
In this case, removing all the Fare and KM outliers outright is not advisable, since unusually long or expensive trips do legitimately occur. It is better to derive a new parameter, fare per km, and remove outliers based on it, since extreme fare-per-km values are more likely the result of unexpected surge pricing.
Price/km distribution
df["Fare_per_km"] = df["Fare"] / df["KM"]
plt.hist(df["Fare_per_km"])
sns.boxplot(df["Fare_per_km"])
df["Fare_per_km"].describe()
Finding the IQR boundaries of the price/km parameter
def outer_boundary(data, variable):
    """Return the lower and upper IQR outlier boundaries for a column."""
    Q1 = data[variable].quantile(.25)
    Q3 = data[variable].quantile(.75)
    IQR = Q3 - Q1
    low_bound = Q1 - 1.5 * IQR
    up_bound = Q3 + 1.5 * IQR
    return low_bound, up_bound
low, up = outer_boundary(df,"Fare_per_km")
print("Lower limit = ", low)
print("Upper limit = ", up)
Data falling outside these boundaries are outliers. Here, fare-per-km values greater than 54.535 are treated as outliers.
Removal of outliers with reference to the lower and upper limits of the five-point data description
df_outliers = (df["Fare_per_km"] > up) | (df["Fare_per_km"] < low)  # boolean mask of outlier rows
df.shape
df_new = df.loc[~df_outliers]
df_new.shape
df_new["Fare_per_km"].max()
df_new["Fare_per_km"].min()
df_new.describe()
# The outliers corresponding to fare per km have been removed; the new dataset has its own
# Q1, Q3 and IQR, so any points this plot flags are outliers only relative to the updated
# dataset and are acceptable.
sns.boxplot(y=df_new["Fare_per_km"])
df_new.describe() - df.describe()  # change in summary statistics after outlier removal
With the fare-per-km outliers removed, the five-point data description now reflects the new dataset; the recomputed boundaries flag a few new fare-per-km values, but those are acceptable. The difference in the five-point description before and after outlier removal is also shown above.
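To quantify how much data the filter dropped, the row counts before and after can be compared directly. A minimal sketch:
# share of rows dropped by the fare-per-km outlier filter
removed = len(df) - len(df_new)
print(f"Removed {removed} rows ({removed / len(df):.1%} of the data)")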
df_new.nunique()
Max and Min Fare details:
print("Max Fare details:")
print(" ")
print("Max Fare:",df_new.Fare[df_new.Fare==df_new.Fare.max()])
print("KM for Max Fare:",df_new.KM[df_new.Fare==df_new.Fare.max()])
print("Date:",df_new.Start_date[df_new.Fare==df_new.Fare.max()])
print("Date:",df_new.Weekday[df_new.Fare==df_new.Fare.max()])
print("Min Fare details:")
print(" ")
print("Min Fare:",df_new.Fare[df_new.Fare==df_new.Fare.min()])
print("KM for Min Fare:",df_new.KM[df_new.Fare==df_new.Fare.min()])
print("Date:",df_new.Start_date[df_new.Fare==df_new.Fare.min()])
print("Date:",df_new.Weekday[df_new.Fare==df_new.Fare.min()])
1 a. Correlation between Distance Travelled and Fare
In general, fare is expected to increase with distance travelled. To verify this, we perform the following correlation tests:
Null Hypothesis: There's no correlation between Distance Travelled and Fare.
Alternate Hypothesis: There's a correlation between Distance Travelled and Fare.
Test condition: at 95% confidence, if the p-value is less than 0.05, reject the null hypothesis; otherwise fail to reject it.
sample1 = df_new["KM"]
sample2 = df_new["Fare"]
#Spearman Rank Correlation
from scipy.stats import spearmanr
corr, p = spearmanr(sample1, sample2)
print('corr=%.3f, p=%.3f' % (corr,p))
if p < 0.05:
print("Reject Ho and accept Ha")
else:
print("Reject Ha and accept H0")
#Pearson Correlation
from scipy.stats import pearsonr
corr, p = pearsonr(sample1, sample2)
print('corr=%.3f, p=%.3f' % (corr,p))
if p < 0.05:
print("Reject Ho and accept Ha")
else:
print("Reject Ha and accept H0")
sns.lineplot(data=df_new,x="KM",y="Fare")
Both correlation tests returned a correlation of 1, meaning "Distance Travelled" and "Fare" are in a perfect positive linear relationship. The null hypothesis is rejected, so "Distance Travelled" and "Fare" are correlated.
1 b. Correlation between Distance Travelled and Time duration
In general, time taken is expected to increase with distance travelled. To verify this, we perform the following correlation tests:
Null Hypothesis: There's no correlation between Distance Travelled and Time Duration.
Alternate Hypothesis: There's a correlation between Distance Travelled and Time Duration.
Test condition: at 95% confidence, if the p-value is less than 0.05, reject the null hypothesis; otherwise fail to reject it.
sample_1 = df_new["KM"]
sample_2 = df_new["Duration_min"]
#Spearman Rank Correlation
from scipy.stats import spearmanr
corr, p = spearmanr(sample_1, sample_2)
print('corr=%.3f, p=%.3f' % (corr,p))
if p < 0.05:
print("Reject Ho and accept Ha")
else:
print("Reject Ha and accept H0")
#Pearson Correlation
from scipy.stats import pearsonr
corr, p = pearsonr(sample_1, sample_2)
print('corr=%.3f, p=%.3f' % (corr,p))
if p < 0.05:
print("Reject Ho and accept Ha")
else:
print("Reject Ha and accept H0")
sns.scatterplot(data=df_new, x="KM", y="Duration_min",color="blue")
Both tests report a strong positive correlation with p-values below 0.05, so the null hypothesis is rejected in each case. There is therefore a positive relationship between "Distance Travelled" and "Time taken".
Conclusion:
Both tests, Distance Travelled vs Time Duration as well as Distance Travelled vs Fare, showed a positive linear relationship.
To check whether the method of payment has an effect on revenue, we perform a one-way ANOVA test.
One-way ANOVA Testing:
Null Hypothesis H0: There's no relation between type of payment and the revenue.
Alternate Hypothesis Ha: There's relation between type of payment and the revenue.
Significance value (alpha) = 0.05
data1 = df_new[["Payment_Type","Fare"]]
data1
# replacing character values with numeric codes in the Payment_Type column
# Cash = 1, Card = 2, Digital = 3
data1['Payment_Type'].replace(['Cash', 'Card', 'Digital'], [1, 2, 3], inplace=True)
data1
import statsmodels.api as sm
from statsmodels.formula.api import ols
# C(...) treats the numeric payment codes as a categorical factor, as one-way ANOVA requires
model = ols('Fare ~ C(Payment_Type)', data=data1).fit()
anovaRes = sm.stats.anova_lm(model, typ=2)
anovaRes['PR(>F)']
Here, the F value represents the ratio of between-group to within-group variance of Fare across the three payment groups, and it shows that the between-group variation is low.
Also, the p-value is very high: 0.944 > 0.05.
We therefore fail to reject the null hypothesis: the average revenue made by all modes of payment is the same.
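As a cross-check, the same one-way ANOVA can be run with scipy's f_oneway. A minimal sketch using the data1 frame built above:
from scipy.stats import f_oneway
# one Fare array per payment group (codes 1, 2, 3)
groups = [g["Fare"].values for _, g in data1.groupby("Payment_Type")]
stat, p = f_oneway(*groups)
print("F=%.3f, p=%.3f" % (stat, p))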
Number of rides for modes of payment
x = sns.countplot(x=df_new['Payment_Type'])
for p in x.patches:
    x.annotate(f'\n{p.get_height()}', (p.get_x() + 0.2, p.get_height()),
               ha='left', va='top', color='black', size=12)

sns.set_palette("colorblind")
x = sns.countplot(x=df_new['Payment_Method'])
for p in x.patches:
    x.annotate(f'\n{p.get_height()}', (p.get_x() + 0.2, p.get_height()),
               ha='left', va='top', color='black', size=12)
Revenue based on different types of payment
plt.figure(figsize=(10,7))
sns.set_palette("Paired")
plot = sns.stripplot(y=df_new['Payment_Method'], x=df_new['Fare'])  # df_new for both axes, so rows stay aligned
plot.set(xlabel='Fare',ylabel= 'Type of Payment',title='Fare vs Payment type')
plt.show()
data = df_new.groupby("Payment_Method")["Fare"].sum()
data
# revenue share of each payment method as a pie chart
sns.set_palette("bright")
plt.figure(figsize=(10,8))
data.plot.pie(autopct="%.1f%%")
It is found that Uber earned more revenue from non-cash payments (84.6%) than from cash payments (15.4%). Uber Cash (the Uber wallet) is the top contributor among payment modes, followed by UPI payments.
Conclusion:
From the above analysis, the total revenue from digital payments exceeds that from card and cash payments.
But the number of digital payments is also proportionally higher than the other modes.
This is why the average revenue per ride is similar across payment types even though digital payments collected more revenue in total.
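The totals-versus-averages point can be seen directly in a per-method summary. A minimal sketch:
# count, total and mean fare per payment method
df_new.groupby("Payment_Method")["Fare"].agg(["count", "sum", "mean"])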
In this test, we determine whether the weekday has any effect on the purpose of usage (perhaps weekends see more Meal/Entertainment trips and weekdays more official activities).
Chi-Square Hypothesis Testing:
To compare two categorical variables, we perform a chi-square test of independence.
Null Hypothesis H0: There's no relation between Weekday and Purpose.
Alternate Hypothesis Ha: There's a relation between Weekday and Purpose.
Significance value (alpha): 0.05.
tab = pd.crosstab(df_new.Weekday, df_new.Purpose) #creating a crosstab of 2 columns
tab
from scipy.stats import chi2_contingency
stat, p, dof, expected = chi2_contingency(tab)
print("stat=%.3f, p=%.3f" % (stat, p))
if p < 0.05:
    print("Reject H0 and accept Ha")
else:
    print("Fail to reject H0")
Although the chi-square statistic (a measure of how far the observed counts deviate from the expected counts) looks large, the p-value is above 0.05, so the test finds no relationship between the Weekday and Purpose parameters.
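To see what the statistic measures, the expected counts under independence can be placed next to the observed crosstab. A minimal sketch reusing the expected array returned above:
# expected cell counts if Weekday and Purpose were independent
pd.DataFrame(expected, index=tab.index, columns=tab.columns).round(1)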
Ride Category
sns.set_palette("Paired")
x=sns.countplot(x=df_new['Category'])
for p in x.patches:
x.annotate(f'\n{p.get_height()}', (p.get_x()+0.2, p.get_height()), ha='left', va='top', color='black', size=12)
It is clearly visible that more rides are taken for business purposes than for personal ones.
Purpose of rides
plt.figure(figsize=(15,8))
x = sns.countplot(x=df_new['Purpose'])
for p in x.patches:
    x.annotate(f'\n{p.get_height()}', (p.get_x() + 0.2, p.get_height()),
               ha='left', va='top', color='black', size=12)
It is clearly evident that users ride mainly for Meeting and Meal/Entertainment activities rather than for Supplies, Customer Visit, Airport/Travel and similar purposes; Charity and other commute activities are the least common.
I. Trips and Driver rating
freq_table = pd.crosstab(df_new['Driver_Rating'], 'no_of_drivers')  # frequency of each driver rating
freq_table
plt.figure(figsize=(10,8))
sns.set_palette("rocket")
x = sns.countplot(x=df_new['Driver_Rating'])
plt.ylabel('Number of Drivers')
for p in x.patches:
    x.annotate(f'\n{p.get_height()}', (p.get_x() + 0.2, p.get_height()),
               ha='left', va='top', color='white', size=12)
From the above plot, the number of trips per driver rating lies between 169 and 192. Around 180 trips carry a 1-star driver rating and around 178 a 5-star rating, while 171 trips weren't rated. The 3-star rating is the most common and the 2-star rating the least common.
II. Month vs Number of Trips
sns.set_palette("Paired")
plt.figure(figsize=(15,8))
x=sns.countplot(x=df_new['Month'])
x.set(xlabel="Frequency",ylabel="Month")
for p in x.patches:
x.annotate(f'\n{p.get_height()}', (p.get_x()+0.2, p.get_height()), ha='left', va='top', color='black', size=12)
From the plot, January records the most trips and June the fewest.
III. Day vs Number of Trips
plt.figure(figsize=(10,8))
order = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
x = sns.countplot(x=df_new["Weekday"], order=order)
x.set(xlabel="Weekday", ylabel="Frequency")
for p in x.patches:
    x.annotate(f'\n{p.get_height()}', (p.get_x() + 0.2, p.get_height()),
               ha='left', va='top', color='black', size=12)
When aggregated by weekday, Sunday records the most trips, perhaps because people spend their weekends travelling; Wednesday comes second. Monday, on the other hand, records the fewest trips.
IV. Time vs Frequency
sns.set_palette("pastel")
plt.figure(figsize=(15,8))
x=sns.countplot(x=df_new['SHour'])
x.set(xlabel="Hour",ylabel="Frequency")
for p in x.patches:
x.annotate(f'\n{p.get_height()}', (p.get_x()+0.2, p.get_height()), ha='left', va='top', color='black', size=12)
The above plot shows roughly the expected distribution of ride frequency across the hours of the day.
V. Number of trips completed
x=sns.countplot(x=df_new["Status"])
for p in x.patches:
x.annotate(f'\n{p.get_height()}', (p.get_x()+0.2, p.get_height()), ha='left', va='top', color='black', size=12)
Of the 1,075 trips, 932 were completed and 143 were cancelled, a cancellation rate of about 13.3%.
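The shares can be recomputed directly from the Status column. A minimal sketch:
# percentage of trips per status value
df_new["Status"].value_counts(normalize=True).mul(100).round(2)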
VI. Revenue based on purpose of trips
purpose = df_new.groupby("Purpose")["Fare"].sum()
purpose
plt.figure(figsize=(17,8))
x=sns.barplot(x="Purpose", y="Fare", data=df_new,ci = None, estimator=sum)
Based on purpose, Uber collected the most revenue from Meeting (~140,000), Meal/Entertainment (~105,000) and Customer Visit (~80,000) trips.
VII. PAIRPLOTS
sns.pairplot(df_new)
VIII. CORRELATION MATRIX
correlation = df_new.corr(numeric_only=True)  # numeric_only restricts the matrix to numeric columns
plt.figure(figsize=(15,15))
sns.heatmap(correlation,cmap="coolwarm",annot=True,vmax=.5,vmin=-.5,center=0,square=True,linewidths=.5)
From the above correlation heatmap:
Highly correlated:
Positive correlation:
Negative correlation:
Moderately correlated:
Positive correlation:
Negative correlation:
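To help fill in these findings, the strongest numeric correlations can be extracted programmatically. A minimal sketch over the numeric columns of df_new:
corr = df_new.corr(numeric_only=True)
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)  # keep each pair once, skip the diagonal
pairs = corr.where(mask).stack().sort_values(key=abs, ascending=False)
print(pairs.head(10))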
!sudo apt-get install texlive-xetex texlive-fonts-recommended texlive-plain-generic
!jupyter nbconvert --to html /content/EDA_Uber.ipynb