# viruthik's picture
# Update eda.py
# b74633f
# raw
# history blame
# 3.55 kB
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image
# Page-wide Streamlit settings, gathered in one mapping and applied in a
# single call: browser-tab title, wide layout, sidebar expanded on load.
_PAGE_SETTINGS = {
    'page_title': 'Flight Price Prediction - EDA',
    'layout': 'wide',
    'initial_sidebar_state': 'expanded',
}
st.set_page_config(**_PAGE_SETTINGS)
def _render_figure(fig):
    """Send *fig* to the Streamlit page, then release it from pyplot."""
    st.pyplot(fig)
    # pyplot keeps every figure it created registered until it is closed;
    # without this, each call to runEDA would leave six more figures alive.
    plt.close(fig)


def runEDA():
    """Render the exploratory-data-analysis page for the flight price data.

    Loads ``flight_price_prediction.csv`` (path relative to the current
    working directory), shows the raw table, and then draws six charts, each
    followed by a short written observation:

    1. histogram of ``price``
    2. pie chart of flight counts per ``airline``
    3. pie chart of flight counts per ``stops`` value
    4. bar chart of mean ``price`` per ``departure_time``
    5. line chart of ``price`` against ``duration``, split by ``class``
    6. line chart of ``price`` against ``days_left``

    The first four charts use Matplotlib's default style and the last two use
    ``dark_background``. Both styles are applied through
    ``plt.style.context`` so the process-wide rcParams are restored when the
    function returns, and every figure is closed once it has been rendered.

    Returns:
        None. Everything is written to the Streamlit page.
    """
    # Title
    st.title('Flight Price Prediction')
    # Sub header
    st.subheader('EDA for Flight Price Prediction')
    st.markdown('---')

    # Show the raw dataframe
    st.title('Dataset')
    df = pd.read_csv('flight_price_prediction.csv')
    st.dataframe(df)

    # Light-themed charts. The style is scoped to this block (rendering
    # included, since some rcParams are read when the figure is saved).
    with plt.style.context('default'):
        st.write('## Histogram Price')
        fig = plt.figure(figsize=(15, 5))
        # histplot draws on the current axes, i.e. the figure created above.
        sns.histplot(df['price'], bins=20, kde=True).set(title='Price')
        _render_figure(fig)
        st.write('Based on the histogram plot, we can see that most of the flight having price less than 10k INR (Indian Rupee). But for few flight price is goes up to 120k INR, this probably the price of business class.')
        st.markdown('---')

        st.write('## Airlines Flights Count')
        # Rows per airline, largest first.
        df_airlines = (
            df.groupby(['airline'])
            .agg(counts=('flight', 'count'))
            .sort_values(by=['counts'], ascending=False)
        )
        fig, ax = plt.subplots(ncols=1, figsize=(15, 5))
        ax.pie(df_airlines['counts'], labels=df_airlines['counts'].index, autopct='%.2f%%')
        ax.set_title("Airlines Flight Count")
        _render_figure(fig)
        st.write('The flight is dominated by `Vistara` Airlines with more than 127k flights. Their biggest competitor is `Air India` with 80k flights.')
        st.markdown('---')

        st.write('## Flight Stops/Transit Count')
        # Rows per distinct value of the stops column.
        df_stops = df.groupby(['stops']).agg(counts=('airline', 'count'))
        fig, ax = plt.subplots(ncols=1, figsize=(15, 5))
        ax.pie(df_stops['counts'], labels=df_stops['counts'].index, autopct='%.2f%%')
        ax.set_title("Flight Stops (Transit) Count")
        _render_figure(fig)
        st.write('Most of flight on this dataset is having one transit. Only 12% of the data that is a direct flight.')
        st.markdown('---')

        # Mean price per departure-time bucket, cheapest first.
        df_departure = (
            df.groupby(['departure_time'])
            .agg({'price': 'mean'})
            .sort_values(by=['price'], ascending=True)
        )
        fig = plt.figure(figsize=(7, 5))
        sns.barplot(
            data=df_departure,
            x=df_departure.index.to_list(),
            y='price',
            orient='v',
        ).set(title='Average price per Departure Time')
        _render_figure(fig)
        st.write('From the bar plot above we can see that Late Night ticket average price is cheapest compared to other time. Meanwhile night and morning flight have the most expensive average price.')
        st.markdown('---')

    # Dark-themed charts: 'dark_background' layered on top of 'default', the
    # same combination the two consecutive style.use() calls used to produce.
    with plt.style.context(['default', 'dark_background']):
        fig = plt.figure(figsize=(20, 8))
        sns.lineplot(data=df, x='duration', y='price', hue='class', palette='hls')
        plt.title('Ticket Price Versus Flight Duration Based on Class', fontsize=20)
        plt.xlabel('Duration', fontsize=15)
        plt.ylabel('Price', fontsize=15)
        _render_figure(fig)
        st.write('Based on the line graph above, we can see that as the flight duration increase the ticket price is also increases in both the Economy and Business classes')
        st.markdown('---')

        fig = plt.figure(figsize=(20, 8))
        sns.lineplot(data=df, x='days_left', y='price', color='blue')
        plt.title('Days Left For Departure Versus Ticket Price', fontsize=20)
        plt.xlabel('Days Left for Departure', fontsize=15)
        plt.ylabel('Price', fontsize=15)
        _render_figure(fig)
        # This caption used to be a copy of the duration/class caption above,
        # which does not describe this chart.
        st.write('Based on the line graph above, we can see how the ticket price changes with the number of days left before departure.')
        st.markdown('---')
# Draw the EDA page only when this file is run as the entry script, not when
# it is imported by another module.
if __name__ == "__main__":
    runEDA()