Spaces:

ardian407
/

used_car_predictions

Sleeping

App Files Files Community

ardian407 committed on Nov 9, 2023

Commit

8f2ce94

•

1 Parent(s): 59cd3f0

Upload 10 files

Browse files

Files changed (10) hide show

app.py +9 -0
dataset.csv +0 -0
eda.py +162 -0
hot_encoder.pkl +3 -0
ord_encoder.pkl +3 -0
pipelines.pkl +3 -0
predictions.py +73 -0
requirements.txt +8 -0
rf_model.pkl +3 -0
scaler.pkl +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import streamlit as st
+import eda
+import predictions
+halaman = st.sidebar.selectbox('Halaman: ',('EDA', 'Prediction'))
+if halaman == 'EDA':
+    eda.run()
+else:
+    predictions.run()

dataset.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

eda.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import streamlit as st
+import seaborn as sns
+import plotly.express as px
+import matplotlib.pyplot as plt
+import pandas as pd
+import numpy as np
+from plotly.subplots import make_subplots
+import plotly.graph_objects as go
+def run():
+    # title
+    st.title('EDA Analysis Used Car Price')
+    st.write('EDA ini terfokus pada eksplorasi untuk memahami data lebih dalam dengan melihat distribusi data, korelasi atau hubungan antar kolom dan eksplorasi berdasarkan business domain.')
+    st.markdown('---')
+    dataset = pd.read_csv('dataset.csv')
+    st.markdown("<h4 style='text-align: center; color: white;'>Dataset Preview</h4>", unsafe_allow_html=True)
+    st.dataframe(dataset,hide_index=True, height=200, width=1000)
+    st.write("Deskripsi Kolom: ")
+    st.write('`model` = Tipe atau model mobil (categorical - nominal)')
+    st.write('`year` = Tahun pembuatan mobil (categorical - ordinal)')
+    st.write('`price` = Harga (dalam Pound Sterling - numerical)')
+    st.write('`transmission` = Tipe transmisi mobil (categorical - nominal)')
+    st.write('`mileage` = Jumlah jarak tempuh dalam miles (numerical)')
+    st.write('`fuelType` = Jenis bahan bakar (cateogrical - nominal)')
+    st.write('`tax` = Jumlah pajak dalam Pound Sterling')
+    st.write('`mpg`= Miles per Gallon (1 gallon = 3.78541 liter - numerical) ')
+    st.write('`engineSize` = Ukuran mesin (Numerical)')
+    st.write('`company` = Brand atau nama perusahaan pembuat mobil (categorical - nominal)')
+    st.markdown('---')
+    numeric_data = dataset[['mileage','tax','engineSize','mpg']]
+    st.markdown("<h3 style='text-align: center; color: white;'>Perkembangan Harga Pertahun</h3>", unsafe_allow_html=True)
+    year = dataset[['year', 'price']].groupby('year').mean()
+    audi = dataset[['year', 'price']][dataset['company'] == "Audi"].groupby('year').mean()
+    toyota = dataset[['year', 'price']][dataset['company'] == "Toyota"].groupby('year').mean()
+    hyundai = dataset[['year', 'price']][dataset['company'] == "Hyundai"].groupby('year').mean()
+    bmw = dataset[['year', 'price']][dataset['company'] == "BMW"].groupby('year').mean()
+    # Reindex the 'audi' DataFrame to match the years in the 'year' DataFrame
+    audi = audi.reindex(year.index, fill_value=0)
+    toyota = toyota.reindex(year.index, fill_value=0)
+    hyundai = hyundai.reindex(year.index, fill_value=0)
+    bmw = bmw.reindex(year.index, fill_value=0)
+    # Create a Plotly figure
+    fig = go.Figure()
+    # Plot the average price for all cars with a blue solid line
+    fig.add_trace(go.Scatter(x=year.index, y=year['price'], mode='lines', name='All Cars', line=dict(color='cyan')))
+    # Plot the average price for Audi cars with a red dashed line
+    fig.add_trace(go.Scatter(x=audi.index, y=audi['price'], mode='lines', name='Audi', line=dict(color='red', dash='dash')))
+    fig.add_trace(go.Scatter(x=toyota.index, y=toyota['price'], mode='lines', name='Toyota', line=dict(color='blue', dash='dash')))
+    fig.add_trace(go.Scatter(x=hyundai.index, y=hyundai['price'], mode='lines', name='Hyundai', line=dict(color='yellow', dash='dash')))
+    fig.add_trace(go.Scatter(x=bmw.index, y=bmw['price'], mode='lines', name='BMW', line=dict(color='white', dash='dash')))
+    # Customize the layout
+    fig.update_layout(
+        title='Average Price by Year',
+        xaxis_title='Year',
+        yaxis_title='Average Price',
+        xaxis=dict(tickvals=year.index, ticktext=year.index, tickangle=45),
+        legend=dict(x=0, y=1)
+    )
+    # Display the Plotly figure using Streamlit
+    st.plotly_chart(fig)
+    st.write('Dari chart di atas, kita bisa melihat bahwa:\n- Setiap perusahaan memiliki rata-rata yang berbeda dari tahun ke tahun.\n- Rata-rata harga pada semua perusahaan mengalami kenaikan yang mirip sebagaimana bertambahnya tahun. Ini menandakan semakin muda sebuah mobil, maka harga nya semakin tinggi.')
+    st.markdown('---')
+    st.markdown("<h3 style='text-align: center; color: white;'>Average Price berdasarkan Brand</h3>", unsafe_allow_html=True)
+    fig=plt.figure(figsize=(7,5))
+    ax=sns.barplot(dataset, x='company', y='price', estimator='mean', palette='viridis')
+    plt.bar_label(ax.containers[0], label_type='center', color='white')
+    st.pyplot(fig)
+    st.write('Terlihat bahwa company yang ada terbagi seperti terbagi menjadi 2 kelas. Kelas dengan rata-rata harga yang tinggi yaitu Audi dan BMW, lalu kelas dengan harga rata-rata yang rendah yaitu Toyota dan Hyundai.')
+    st.markdown('---')
+    st.markdown("<h3 style='text-align: center; color: white;'>Average Price Model berdasarkan Brand</h3>", unsafe_allow_html=True)
+    company = dataset['company'].unique().tolist()
+    fig=plt.figure(figsize=(20,30))
+    for i, comp in enumerate(company):
+        plt.subplot(3,2,i+1)
+        data = dataset[['model','price']][dataset['company']==comp].groupby('model').mean().sort_values("price", ascending=False)
+        ax=sns.barplot(data, x='price', y=data.index, palette='viridis')
+        plt.bar_label(ax.containers[0])
+        plt.title(f"Rata-Rata Harga {comp}")
+    st.pyplot(fig)
+    st.write('Kita bisa melihat bahwa setiap model dari setiap company memiliki harga rata-rata yang berbeda satu sama lain namun terdapat kemiripan antar model, hal ini akan digunakan sebagai tolak ukur pada proses pengurangan cardinality di feature engineering. ')
+    st.markdown('---')
+    st.markdown("<h3 style='text-align: center; color: white;'>Average Price berdasarkan Tipe Transmisi</h3>", unsafe_allow_html=True)
+    fig=plt.figure(figsize=(12,8))
+    ax=sns.barplot(dataset, x='transmission', y='price', estimator='mean', palette='Blues')
+    plt.bar_label(ax.containers[0], label_type='center', fontsize=14)
+    st.pyplot(fig)
+    st.write('Ketika melihat rata-rata harga setiap transmission, kita bisa lihat bahwa yang paling mahal justru adalah semi-auto, disusul automatic, lalu other, dan Manual. Ini bisa disebabkan oleh adanya outliers. Pada dunia nyata jika diurutkan dari yang termurah yaitu manual, semi-auto, automatic.')
+    st.markdown('---')
+    st.markdown("<h3 style='text-align: center; color: white;'>Rata-Rata Efisiensi Bahan Bakar berdasarkan Brand</h3>", unsafe_allow_html=True)
+    fig = plt.figure(figsize=(10, 8))
+    ax=sns.barplot(dataset, x='company', y='mpg', estimator='mean', palette='viridis')
+    plt.bar_label(ax.containers[0], label_type='center')
+    st.pyplot(fig)
+    st.write('Dari sini kita melihat bahwa Brand dengan efisiensi bahan bakar terbaik adalah Audi, disusul dengan Hyundai, BMW dan Toyota. Semakin kecil rata-rata efisiensi bahan bakar, semakin irit atau hemat konsumsi bahan bakar sebuah mobil.')
+    st.markdown('---')
+    st.markdown("<h3 style='text-align: center; color: white;'>Efisiensi Berdasarkan Transmission</h3>", unsafe_allow_html=True)
+    fig=plt.figure(figsize=(10,6))
+    ax = sns.barplot(data=dataset, x='transmission', y='mpg', estimator='mean')
+    plt.bar_label(ax.containers[0], label_type='center')
+    st.pyplot(fig)
+    st.write('Disini kita bisa melihat bahwa Semi-Auto memiliki efisiensi yang paling baik di antara tipe transmissi lain.')
+    st.markdown('---')
+    st.markdown("<h3 style='text-align: center; color: white;'>Data Distribution</h3>", unsafe_allow_html=True)
+    # list nama kolom
+    cols = dataset[['price', 'mileage', 'tax', 'mpg', 'engineSize']]
+    # figur size
+    fig=plt.figure(figsize=(30,13))
+    # iterasi untuk membuat chart
+    for i, col in enumerate(cols):
+        plt.subplot(2,3, i+1) # subplot
+        sns.histplot(dataset[col], kde=True, bins=30) #histogram plot
+        plt.title(f' Distribusi {col} \nskewness: {dataset[col].skew():.4f}') # judul plot beserta skewness
+        plt.xticks(rotation=20)
+    st.pyplot(fig)
+    st.write('Terlihat hanya kolom tax yang memiliki skewness mendekati normal.')
+    st.markdown('---')
+    st.markdown("<h3 style='text-align: center; color: white;'>Korelasi antara numerik Variables</h3>", unsafe_allow_html=True)
+    fig=plt.figure(figsize=(7,7))
+    sns.heatmap(numeric_data.corr("spearman"), annot=True, cmap='Blues')
+    st.pyplot(fig)
+    st.write('- Nilai korelasi terkuat terhadap price ada pada variable year dan engine Size.\n- Sedangkan variable yang paling lemah ada pada kolom Tax')
+    st.markdown('---')
+    st.markdown("<h3 style='text-align: center; color: white;'>Linearitas antara price dengan variable lain</h3>", unsafe_allow_html=True)
+    pair = ['mileage', 'year','engineSize', 'mpg']
+    fig=plt.figure(figsize=(15,10))
+    for i, col in enumerate(pair):
+        plt.subplot(2,2,i+1)
+        sns.scatterplot(dataset, y='price', x=col)
+        plt.title(f'Korelasi antara price dan {col}')
+    st.pyplot(fig)
+    st.write("Dari chart di atas dapat dikatakan:\n- Semakin tinggi jarak tempuh (mileage) sebuah mobil, maka semakin murah harganya. Mobil dengan jarak tempuh yang sedikit, cenderung memiliki harga yang tinggi.\n- Semakin muda usia mobil, harga nya pun semakin mahal. \n- Untuk engineSize, tidak terlalu terlihat pola pada scatter. Namun, kita bisa sedikitnya melihat bahwa ada kecenderungan ketika engineSize semakin besar, maka harga cenderung lebih mahal.\n- Hubungan antara price dan mpg menunjukan bahwa semakin efisien konsumsi bahan bakar mobil, maka harga cenderung bisa lebih mahal")
+    st.markdown('---')
+# Render the EDA page when this module is executed directly instead of being imported.
+if __name__ == '__main__':
+    run()

hot_encoder.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:24d936998400edb8909c98d04eb8d1391834490b914ed1755c85e265dd96e921
+size 2605

ord_encoder.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a62568e04a76876c9454105c97f25520efaa3e594b5dafaa7526cce6f7291f2
+size 1444

pipelines.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ad7b4bb4c8f8be475985afbfd1fa97992357a81666c12eaf1ce40095d37957e
+size 178683728

predictions.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import streamlit as st
+import pandas as pd
+import joblib
+# Load the pipeline with joblib; run() calls its predict() on the form input.
+# NOTE(review): joblib.load unpickles the file, so 'pipelines.pkl' must come from a trusted source.
+pipe = joblib.load('pipelines.pkl')
+# Reference data that run() reads to populate the form's selectable options.
+dataset = pd.read_csv('dataset.csv')
+# buat fungsi untuk run
+def run():
+    # buat form
+    st.write('## Predict Used Car Price')
+    my_dict = {"Audi": dataset['model'][dataset['company']=='Audi'].unique().tolist(),
+                "Toyota": dataset['model'][dataset['company']=='Toyota'].unique().tolist(),
+                "BMW": dataset['model'][dataset['company']=='BMW'].unique().tolist(),
+                "Hyundai": dataset['model'][dataset['company']=='Hyundai'].unique().tolist()}
+    col1, col2 = st.columns([1, 1])
+    with col1:
+        company = st.selectbox('Brand', options=my_dict.keys())
+    with col2:
+        model = st.selectbox('Pilih Model', options=my_dict[company])
+    column1, column2 = st.columns([2,2])
+    with column1:
+        year = st.selectbox('Tahun', options=[int(x) for x in range(2020, 1990, -1)])
+    with column2:
+        transimssion = st.selectbox('Tipe Transmisi', options=dataset['transmission'].unique().tolist())
+    kol1, kol2, kol3 = st.columns([3,1,1])
+    with kol1:
+        mileage = st.slider('Jarak Tempuh',0,100000,1000)
+    with kol2:
+        fueltype = st.selectbox('Jenis Bahan Bakar', options=dataset['fuelType'].unique().tolist())
+    with kol3:
+         mpg = st.number_input('Efisiensi', min_value=0, value=50, step=1, max_value=100, help="Input Efisiensi Bahan Bakar")
+    engineSize=st.selectbox('Ukuran Mesin', options=dataset['engineSize'].unique().tolist())
+    with st.form('Form Car Details'):
+        company=company
+        model=model
+        tahun=year
+        trans=transimssion
+        efisiensi = mpg
+        engine = engineSize
+        submitted = st.form_submit_button('Predict')
+    # dictionary hasil data input
+    data = {
+    'model': model,
+    'year': (int(tahun)),
+    'transmission': trans,
+    'mileage':mileage,
+    'fuelType':fueltype,
+    'mpg':efisiensi,
+    'engineSize': engine,
+    'company':company,
+    }
+    # data inference ke dataframe
+    data_inf = pd.DataFrame([data]).reset_index(drop=True)
+    # tampilkan hasil input dalam dataframe
+    st.dataframe(data_inf)
+    # if clause jika tombol sudah di tekan
+    if submitted:
+        prediction = pipe.predict(data_inf)
+        st.write(f'#### Prediction: GBP {str(int(prediction))} atau IDR {str(int(prediction)*19230.08)}')
+# Render the prediction page when this module is executed directly.
+if __name__ == '__main__':
+    run()

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+streamlit
+pandas
+seaborn
+matplotlib
+numpy
+scikit-learn==1.2.2
+Pillow
+plotly

rf_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eead2fd3069bfa560154c01ce9e3873f356cccc4d380340be057704aa67202b2
+size 178667521

scaler.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:739a49283d91fa1462cf15b19f2a0f81cf39798b0b3edff44dd67b1e320b5b78
+size 1135