File size: 3,155 Bytes
475dc77
 
 
 
 
caffa37
 
 
 
 
 
 
 
 
89af670
 
475dc77
89af670
475dc77
 
89af670
 
 
475dc77
 
 
 
89af670
475dc77
 
 
caffa37
 
 
475dc77
 
 
caffa37
 
 
89af670
475dc77
89af670
475dc77
 
89af670
475dc77
 
 
 
89af670
475dc77
 
89af670
475dc77
89af670
475dc77
 
caffa37
475dc77
 
89af670
475dc77
89af670
475dc77
 
 
89af670
475dc77
 
 
 
 
 
 
 
 
 
 
 
caffa37
89af670
caffa37
 
 
475dc77
89af670
475dc77
 
caffa37
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import numpy as np

def plot_zip_code_correlation(zip_codes_str, start_date, end_date):
    """Plot pairwise home-value correlations between ZIP codes as a 3D surface.

    Downloads the Zillow ZHVI per-ZIP CSV, keeps the rows whose
    ``RegionName`` matches one of the requested ZIP codes, restricts the
    monthly columns to the requested date range, and correlates the resulting
    price series against each other.

    Args:
        zip_codes_str: Comma-separated ZIP codes. Entries are stripped,
            left-padded with zeros to five characters, and de-duplicated
            (first occurrence wins); blank entries are ignored.
        start_date: First day of the range (inclusive), in any form
            ``pd.to_datetime`` accepts, e.g. ``"2015-01-01"``.
        end_date: Last day of the range (inclusive), same format.

    Returns:
        A ``plotly.graph_objects.Figure`` holding one ``Surface`` trace whose
        z-values are the correlation matrix of the usable ZIP codes.

    Raises:
        ValueError: If a date cannot be parsed, either date is before the
            year 2000, the start date is after the end date, no requested ZIP
            code is present in the data, no column falls inside the range, or
            fewer than two ZIP codes / overlapping observations remain.
    """
    # Validate dates. Parse once and compare real dates, not strings or years,
    # so "2020-06-01".."2020-01-01" is rejected and "2020-1-5" still works.
    try:
        start_ts = pd.to_datetime(start_date)
        end_ts = pd.to_datetime(end_date)
    except (ValueError, TypeError) as err:
        raise ValueError("Please enter valid dates in YYYY-MM-DD format.") from err
    # Empty / None input parses to NaT / None instead of raising.
    if pd.isna(start_ts) or pd.isna(end_ts):
        raise ValueError("Please enter valid dates in YYYY-MM-DD format.")
    if start_ts.year < 2000 or end_ts.year < 2000:
        raise ValueError("Please select dates no earlier than the year 2000.")
    first_day, last_day = start_ts.date(), end_ts.date()
    if first_day > last_day:
        raise ValueError("Start date must be before end date.")

    # Process ZIP codes (ensure 5-digit format). Blank entries from stray
    # commas are dropped and duplicates collapsed while keeping input order.
    raw_entries = (zip_codes_str or "").split(",")
    zip_codes = list(dict.fromkeys(
        entry.strip().zfill(5) for entry in raw_entries if entry.strip()
    ))
    not_enough_msg = f"Not enough data for correlation calculation. Ensure at least two valid ZIP codes with overlapping data between {start_date} and {end_date}."
    # Fail fast before the (large) download when the request cannot succeed.
    if not zip_codes:
        raise ValueError("No data found for the provided ZIP codes.")
    if len(zip_codes) < 2:
        raise ValueError(not_enough_msg)

    # Load data
    df = pd.read_csv('https://files.zillowstatic.com/research/public_csvs/zhvi/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')

    # Ensure ZIP codes in dataframe are strings with leading zeros
    df['RegionName'] = df['RegionName'].astype(str).str.zfill(5)
    df = df[df['RegionName'].isin(zip_codes)]

    if df.empty:
        raise ValueError("No data found for the provided ZIP codes.")

    # Extract date columns within the selected range. Column headers that do
    # not parse as dates (remaining metadata columns) are skipped.
    date_columns = []
    for col in df.columns[7:]:
        try:
            col_day = pd.to_datetime(col).date()
        except (ValueError, TypeError):
            continue
        if first_day <= col_day <= last_day:
            date_columns.append(col)

    if not date_columns:
        raise ValueError("No data available within the selected date range.")

    # Build price matrix: one row per requested ZIP (input order), one column
    # per month. Guard against a repeated RegionName by keeping its first row.
    prices = (
        df.drop_duplicates(subset='RegionName')
        .set_index('RegionName')
        .loc[:, date_columns]
    )
    present = [z for z in zip_codes if z in prices.index]
    # ZIP codes with no observation at all in the range carry no information.
    prices = prices.loc[present].dropna(how='all')
    valid_zip_list = prices.index.tolist()

    if len(valid_zip_list) < 2:
        raise ValueError(not_enough_msg)

    # Months as rows, ZIPs as columns; keep only months where every ZIP has data.
    price_matrix_df = prices.T.dropna()
    # A correlation needs at least two overlapping observations; otherwise the
    # matrix would be entirely NaN and the plot meaningless.
    if len(price_matrix_df) < 2:
        raise ValueError(not_enough_msg)

    # Calculate correlation matrix
    corr_matrix = price_matrix_df.corr()

    # Prepare 3D plot
    z_data = corr_matrix.values
    x_data, y_data = np.meshgrid(valid_zip_list, valid_zip_list)

    fig = go.Figure(data=[go.Surface(z=z_data, x=x_data, y=y_data)])
    fig.update_layout(
        title=f'3D Correlation Matrix of Housing Prices ({start_date} to {end_date})',
        scene=dict(
            xaxis_title='ZIP Code',
            yaxis_title='ZIP Code',
            zaxis_title='Correlation',
        ),
        autosize=True
    )

    return fig

# Input widgets, in the positional order plot_zip_code_correlation expects:
# ZIP code list, start date, end date. All three are free-text boxes.
_zip_codes_box = gr.Textbox(label="Enter comma-separated ZIP codes (e.g., 07001,07002,07003)")
_start_date_box = gr.Textbox(label="Start Date (YYYY-MM-DD) - No earlier than 2000")
_end_date_box = gr.Textbox(label="End Date (YYYY-MM-DD) - No earlier than 2000")

# Wire the plotting function to the inputs and a single plot output.
iface = gr.Interface(
    fn=plot_zip_code_correlation,
    inputs=[_zip_codes_box, _start_date_box, _end_date_box],
    outputs=gr.Plot(),
    title="3D ZIP Code Housing Price Correlation Matrix",
)

# Start the app when the module runs, with sharing off and debug mode on.
iface.launch(share=False, debug=True)