File size: 8,524 Bytes
5426d51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter  # Import ScalarFormatter
import plotly.express as px
import numpy as np

# Silence Streamlit's deprecation notice for st.pyplot() calls that rely on
# the global pyplot figure.
# NOTE(review): newer Streamlit releases may no longer recognise this option
# -- confirm against the pinned Streamlit version.
st.set_option('deprecation.showPyplotGlobalUse', False)

# --- Location extraction: file paths ---
# NOTE(review): `input` shadows the builtin of the same name; it is kept
# because later module-level code reads `input` / `output`.
input, output = 'data_3/data_test.csv', 'data_3/data_test_city.csv'

# Raw listings. The 'UTF-8-SIG' codec also accepts files that start with a
# byte-order mark.
addresses_df = pd.read_csv(input, encoding='UTF-8-SIG')

# Reference table whose "City" and "District" columns are matched against
# each listing's address text.
cities_districts_df = pd.read_csv('data_3/Cities.csv', encoding='UTF-8-SIG')
# Function to find city and district for each address
def find_city_district(location, cities_df=None):
    """Return the first ``(City, District)`` pair whose names both occur in *location*.

    Args:
        location: Address text. Any value is accepted; it is converted with
            ``str`` before matching.
        cities_df: Optional lookup table with ``City`` and ``District``
            columns. Defaults to the module-level ``cities_districts_df``.

    Returns:
        ``(city, district)`` taken from the first matching row (table order),
        or ``(None, None)`` when no row matches.
    """
    if cities_df is None:
        cities_df = cities_districts_df

    location = str(location)  # Ensure location is a string
    for city, district in zip(cities_df["City"], cities_df["District"]):
        # A missing cell would stringify to "nan" and spuriously match any
        # address containing that substring, so incomplete rows are skipped.
        if pd.isna(city) or pd.isna(district):
            continue
        if str(city) in location and str(district) in location:
            return city, district
    return None, None

# Tag every listing with the (City, District) pair found in its address.
# The result frame is built from the list of tuples in one step instead of
# expanding each row with `.apply(pd.Series)`, which creates one Series per row.
city_district_pairs = addresses_df["Location"].apply(find_city_district)
addresses_df[["City", "District"]] = pd.DataFrame(
    city_district_pairs.tolist(),
    index=addresses_df.index,
    columns=["City", "District"],
)

# Save the enriched listings, then read them back from the same path so the
# dashboard works on exactly what was written to disk.
addresses_df.to_csv(output, index=False)

data = pd.read_csv(output)
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
data.info()

# Keep only listings that have both a price and a resolved city, and drop
# listings priced as 'Thỏa thuận' (a non-numeric placeholder).
df = data.dropna(subset=['Price', 'City'])
# .copy() makes `df` an independent frame so the column assignment below does
# not write into a slice of `data`.
df = df[df['Price'] != 'Thỏa thuận'].copy()
# Cast to str first so the `.str` accessor also works when pandas parsed the
# column as numeric; strip thousands separators; unparseable values become NaN.
df['Price'] = pd.to_numeric(
    df['Price'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
).astype(float)

df.info()

def plot_minmax_prices(selected_category):
    """Show a table of the lowest and highest price per city for one category.

    Reads the module-level ``df`` and renders a subheader plus a dataframe
    through Streamlit. When the category has no listing with a usable price,
    a warning is shown instead of the table.

    Args:
        selected_category: Value matched against the ``Category`` column.
    """
    # Filter the data based on the selected category
    filtered_data = df[df['Category'] == selected_category]

    # With nothing to aggregate, pivot_table yields no value columns and the
    # four-name column assignment below would fail with a length mismatch.
    if not filtered_data['Price'].notna().any():
        st.warning(f"No data available for {selected_category}.")
        return

    # One row per (City, Category) with the min and max of Price
    pivot_table = filtered_data.pivot_table(
        index=['City', 'Category'],
        values='Price',
        aggfunc=['min', 'max'],
    ).reset_index()
    # Flatten the two-level column header produced by the list of aggregations
    pivot_table.columns = ['City', 'Category', 'Min Price', 'Max Price']

    # Display the data table for the filtered data
    st.subheader('Tổng hợp Giá bất động sản cao nhất và thấp nhất ở các tỉnh thành')
    st.dataframe(pivot_table)

def _show_figure(fig):
    """Render a Matplotlib figure in Streamlit, then release it."""
    st.pyplot(fig)
    # The figure stays registered with pyplot until it is closed; close it so
    # figures do not pile up each time the script is re-executed.
    plt.close(fig)


def _show_pie(title, names, values):
    """Render a titled Plotly pie chart."""
    st.subheader(title)
    st.plotly_chart(px.pie(names=names, values=values))


def _plot_district_counts(listings, selected_category, selected_city):
    """Horizontal count plot: number of listings per district."""
    st.subheader(f'Số lượng bất động sản {selected_category} ở {selected_city}')
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.countplot(data=listings, y='District', ax=ax)
    ax.tick_params(axis='x', labelrotation=25)
    ax.set_xlabel('Số lượng')
    ax.set_ylabel('Quận/Huyện')
    _show_figure(fig)


def _plot_price_per_area(listings, selected_category, selected_city):
    """Horizontal bar plot of price per unit of area, by district."""
    st.subheader(f'Giá bất động sản {selected_category} theo M² ở {selected_city}')
    area = pd.to_numeric(listings['Area'], errors='coerce')
    # A zero (or negative) area would turn the ratio into +/-inf and distort
    # the per-district bar; such rows are treated as missing instead.
    price_per_area = listings['Price'] / area.where(area > 0)
    # assign() returns a new frame, so the caller's data is left untouched.
    priced = listings.assign(**{'Price per Area': price_per_area})

    fig, ax = plt.subplots(figsize=(6, 3))
    sns.barplot(data=priced, y='District', x='Price per Area', ax=ax)
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_xlabel('Giá trung bình')
    ax.set_ylabel('Quận/Huyện')
    # Show the full number of price instead of scientific notation
    ax.ticklabel_format(style='plain', axis='x')
    _show_figure(fig)


def _plot_certification_status(listings, selected_city):
    """Pie chart of certification status; blank or missing values are grouped."""
    status = listings['Certification status']
    # Missing cells and whitespace-only strings both count as "unknown".
    is_blank = status.isna() | (status.astype(str).str.strip() == '')
    status_counts = status.mask(is_blank, "Không xác định").value_counts()
    _show_pie(
        f'Tình trạng pháp lý của bất động sản ở {selected_city}',
        status_counts.index,
        status_counts.values,
    )


def _plot_directions(listings, selected_category, selected_city):
    """Count plot of listing directions; skipped when no direction is recorded."""
    if not listings['Direction'].notna().any():
        return
    st.subheader(f'Hướng bất động sản {selected_category} ở {selected_city}')
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.countplot(data=listings, x="Direction", palette="Spectral", ax=ax)
    ax.set_xlabel('Hướng')
    ax.set_ylabel('Số lượng')
    _show_figure(fig)


def _plot_parking(listings, selected_city):
    """Pie of listings with/without parking, plus a count plot of slot numbers."""
    slots = listings['Parking slot']
    # pandas' notna() works for both numeric and object columns (np.isnan
    # raises on object dtype). A lone-space cell is treated as "no value" in
    # both charts so the pie and the count plot agree.
    has_slot = slots.notna() & (slots != ' ')
    parking_slot_count = int(has_slot.sum())
    non_parking_slot_count = int((~has_slot).sum())

    _show_pie(
        f'Tỷ lệ bất động sản có chỗ đậu xe ở {selected_city}',
        ['Có chỗ đậu xe', 'Không có chỗ đậu xe'],
        [parking_slot_count, non_parking_slot_count],
    )

    if not parking_slot_count:
        return
    st.subheader(f'Số lượng chỗ đậu xe ở {selected_city}')
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.countplot(data=listings[has_slot], x="Parking slot", palette="Spectral", ax=ax)
    ax.set_xlabel('Số lượng chỗ đậu xe/bất động sản')
    ax.set_ylabel('Số lượng')
    # Pass the figure explicitly rather than relying on pyplot's global figure.
    _show_figure(fig)


def _plot_seller_types(listings, selected_city):
    """Pie chart comparing the two known seller types."""
    seller = listings['Seller type']
    personal_label = 'Cá Nhân - Chính Chủ'
    agency_label = 'Công Ty Nhà Đất - Môi Giới BĐS'
    _show_pie(
        f'Tỷ lệ người bán ở {selected_city}',
        [personal_label, agency_label],
        [int((seller == personal_label).sum()), int((seller == agency_label).sum())],
    )


def plot_by_category(selected_category):
    """Render the per-city dashboard for one property category.

    Adds a city selector to the Streamlit sidebar, filters the module-level
    ``df`` to the chosen city and *selected_category*, and renders charts for:
    listings per district, price per area by district, estate types,
    certification status, directions, parking, and seller types. If the
    filter matches no rows, a warning is shown and nothing else is rendered.

    Args:
        selected_category: Value matched against the ``Category`` column.
    """
    # Get the unique city names and sort them alphabetically
    unique_cities = sorted(df['City'].unique())
    selected_city = st.sidebar.selectbox('Chọn thành phố hoặc tỉnh', unique_cities)

    # Filter the data for the selected city
    listings = df[(df['City'] == selected_city) & (df['Category'] == selected_category)]
    if listings.empty:
        st.warning(f"No data available for {selected_category} in {selected_city}.")
        return

    # Apply the seaborn style once, up front, so every chart in this view is
    # drawn with the same theme regardless of which optional charts appear.
    sns.set(style='whitegrid')

    _plot_district_counts(listings, selected_category, selected_city)
    _plot_price_per_area(listings, selected_category, selected_city)

    estate_type_counts = listings['Estate type'].value_counts()
    _show_pie(
        f'Loại bất động sản ở {selected_city}',
        estate_type_counts.index,
        estate_type_counts.values,
    )

    _plot_certification_status(listings, selected_city)
    _plot_directions(listings, selected_category, selected_city)
    _plot_parking(listings, selected_city)
    _plot_seller_types(listings, selected_city)