import gradio as gr
import numpy as np
from PIL import Image
import requests
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import hopsworks
from tqdm import tqdm
import xgboost as xgb
from geopy.geocoders import Nominatim
from datetime import date
from datetime import timedelta
from autogluon.tabular import TabularPredictor
import shutil
# Column order of the scraped apartment data:
# streetName;number;sqm;rooms;soldDate;monthlyFee;monthlyCost;floor;yearBuilt;brf;agency;lat;lon;gdp;unemployment;interestRate
columnHeaders = ['streetName','number','sqm','rooms','soldDate','monthlyFee','monthlyCost','floor','yearBuilt', 'brf','agency','lat','lon'] # ,'gdp','unemployment','interestRate'
featureToMinMax = {
    'sqm': (10, 800),
    'rooms': (1, 20),
    'monthlyFee': (0, 60000),
    'monthlyCost': (0, 20000),
    'floor': (-3, 35),
    'yearBuilt': (1850, 2023),
    'lat': (58.8, 60.2),
    'lon': (17.5, 19.1),
    'gdp': (505.1, 630.14),
    'unemployment': (6.36, 8.66),
    'interestRate': (-0.5, 2.64),
    'number': (0, 300),
    'soldDate': (2010, 2025)
}  # Extracted from the data
featureToName = {
    'number': 'Street number',
    'sqm': 'Size of the apartment in square meters',
    'rooms': 'Number of rooms',
    'monthlyFee': 'Monthly fee',
    'monthlyCost': 'Monthly operating cost',
    'floor': 'Floor',
    'yearBuilt': 'Year built',
    'streetName': 'Name of street',
}
topAgencies = [
    'Fastighetsbyrån', 'Notar', 'Svensk Fastighetsförmedling', 'HusmanHagberg',
    'Länsförsäkringar Fastighetsförmedling', 'Erik Olsson', 'SkandiaMäklarna',
    'Svenska Mäklarhuset', 'Bjurfors', 'Mäklarhuset', 'BOSTHLM', 'Innerstadsspecialisten',
    'MOHV', 'Mäklarringen', 'Historiska Hem', 'Södermäklarna', 'Karlsson & Uddare',
    'UNIK Fastighetsförmedling', 'Edward & Partners', 'Widerlöv'
]
def downloadAutogluonModel():
    # Download the saved AutoGluon model from Hopsworks
    project = hopsworks.login()
    mr = project.get_model_registry()
    temp = mr.get_model("ag_model_20230109", version=5)
    temp_ag_folder_path = temp.download()
    print(temp_ag_folder_path)
    moveFolder(temp_ag_folder_path)
    ag_model = TabularPredictor.load("AutogluonModels/ag_model_20230109")
    return ag_model
def moveFolder(temp_ag_folder_path):
    # Move Autogluon model folder to the correct folder
    original = temp_ag_folder_path
    target = "AutogluonModels/"
    shutil.move(original, target)
def downloadModel():
    # Download the saved XGBoost model from Hopsworks
    project = hopsworks.login()
    mr = project.get_model_registry()
    temp = mr.get_model("xgboost_model", version=5)
    model_path = temp.download()
    xgb_model = joblib.load(model_path + "/xgboost_model.pkl")
    return xgb_model
def getAddressInfo(streetName, number):
    streetName = cleanAddress(streetName)
    try:
        return getCoordinatesFromAddress(streetName, number)
    except AddressNotFound:
        return None, None
# Adds the financial data to the apartment data
def populateApartmentData(aptDf):
    print('Populating with financial data...')
    gdpDf = pd.read_csv('./data/historicalGDP.csv', sep=';')
    unemploymentDf = pd.read_csv('./data/historicalUnemployment.csv', sep=';')
    interestRateDf = pd.read_csv('./data/historicalInterest.csv', sep=';')
    gdpDf = interpolateTime(gdpDf)
    unemploymentDf = interpolateTime(unemploymentDf)
    interestRateDf = interpolateTime(interestRateDf)
    aptDf['gdp'] = aptDf['soldDate'].apply(getValueFromTime, args=(gdpDf,))
    aptDf['unemployment'] = aptDf['soldDate'].apply(getValueFromTime, args=(unemploymentDf,))
    aptDf['interestRate'] = aptDf['soldDate'].apply(getValueFromTime, args=(interestRateDf,))
    return aptDf
def interpolateTime(df):
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date')
    df = df.resample('MS').mean()
    df = df.interpolate(method='time')
    return fixChange(df)
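# Illustrative example (hypothetical values, assuming a quarterly input series with
# 'date' and 'value' columns): entries of 500 on 2022-01-01 and 530 on 2022-04-01 are
# resampled to month starts, and time-based interpolation fills February and March
# with roughly 510 and 520 before fixChange() adds the month-to-month 'change' column.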
def getValueFromTime(datetime, dataDf):
    # Get the value from the dataDf at the given datetime.
    # The financial series are monthly, so snap the date to the first day of the month.
    # If that month is not in the dataDf, fall back to the next month; if that is also
    # missing, print the datetime and return 0.
    datetime = pd.to_datetime(datetime)
    datetime = datetime.replace(day=1)
    try:
        return dataDf.loc[datetime, 'value']
    except KeyError:
        # Try adding one month
        nextMonth = datetime.month + 1
        if nextMonth > 12:
            datetime = datetime.replace(year=datetime.year + 1, month=1)
        else:
            datetime = datetime.replace(month=nextMonth)
        try:
            return dataDf.loc[datetime, 'value']
        except KeyError:
            print(f'No value found for {datetime}')
            return 0
def fixChange(df):
    # Set change to be the difference between the current and previous value
    df['change'] = df['value'].diff()
    # If the change is NaN (first row), set it to 0
    df['change'] = df['change'].fillna(0)
    return df
def cleanAddress(x):
    # Remove "-" from the street name
    x = ''.join(x.split('-'))
    # Remove all zero width spaces, non-breaking spaces and non-breaking hyphens
    x = x.replace('\u200b', '')
    x = x.replace('\u00a0', '')
    x = x.replace('\u2011', '')
    # Remove soft hyphens and zero width non-joiners
    x = x.replace('\xad', '')
    x = x.replace('\u200c', '')
    x = x.strip()
    return x
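# For instance, cleanAddress('Horns\xadgatan ') returns 'Hornsgatan':
# the soft hyphen and the trailing space are stripped before geocoding.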
class AddressNotFound(Exception):
    pass
def getCoordinatesFromAddress(streetName, number):
    HOST_ADDRESS = '165.227.162.37'
    HOST_PORT = '8080'
    EMAIL = 'nathan.allard@gmail.com'
    DOMAIN = HOST_ADDRESS + ':' + HOST_PORT
    LOCATOR = Nominatim(user_agent=EMAIL, domain=DOMAIN, scheme='http', timeout=10)
    number = str(int(float(number)))
    address = f'{streetName} {number}, Stockholm'
    if number == '0':
        address = f'{streetName}, Stockholm'
    location = LOCATOR.geocode(address)
    if location is None:
        raise AddressNotFound
    else:
        # Return with a precision of 6 decimals (accuracy of <1 meter)
        lat = round(location.latitude, 6)
        lon = round(location.longitude, 6)
        return lat, lon
def dateToFloat(date):
    year, month, day = str(date).split('-')
    day = day.split(' ')[0]
    return int(year) + int(month) / 12 + int(day) / 365
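# Rough worked example: dateToFloat('2023-01-09') = 2023 + 1/12 + 9/365 ≈ 2023.11,
# which is then min-max scaled against the (2010, 2025) range in featureToMinMax.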
def normalize(x, minVal, maxVal, feature):
    # Min-max scale x to [0, 1]; values outside the training range are clamped
    res = (float(x) - minVal) / (maxVal - minVal)
    return min(max(res, 0), 1)
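# Example with the 'sqm' bounds (10, 800): normalize(75, 10, 800, 'sqm') returns
# (75 - 10) / (800 - 10) ≈ 0.082, while normalize(900, 10, 800, 'sqm') is clamped to 1.0.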
def normalizeData(df):
    # Normalize selected numerical values to a value between 0 and 1
    print('Normalizing data...')
    for feature, minMax in tqdm(featureToMinMax.items()):
        minVal = minMax[0]
        maxVal = minMax[1]
        if feature == 'soldDate':
            df[feature] = df[feature].apply(lambda x: dateToFloat(x))
        df[feature] = df[feature].apply(lambda x: normalize(x, minVal, maxVal, feature))
    return df
def parsePrice(price):
    featureToMinMaxPrice = {
        'price': (1.5e5, 7e7)
    }
    MIN = featureToMinMaxPrice['price'][0]
    MAX = featureToMinMaxPrice['price'][1]
    price = float(price)
    price = price * (MAX - MIN) + MIN
    return f'{addDotsToPrice(int(price))} SEK'
def addDotsToPrice(price):
    # Takes an int like 1000000 and returns a string like 1.000.000
    toReturn = ''
    price = str(price)
    for i, c in enumerate(price):
        toReturn += c
        if (len(price) - i) % 3 == 1 and i != len(price) - 1 and c != '-':
            toReturn += '.'
    return toReturn
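# Worked example for the two helpers above: a normalized model output of 0.1 maps to
# 0.1 * (7e7 - 1.5e5) + 1.5e5 = 7,135,000, so parsePrice(0.1) returns '7.135.000 SEK'.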
def xgbFix(df):
    # Cast columns to categorical / float dtypes before feeding them to XGBoost
    features_to_categorical = ["streetName", "brf", "agency"]
    features_to_float = ["number", "sqm", "rooms", "monthlyFee",
                         "monthlyCost", "floor", "yearBuilt", "gdp", "unemployment",
                         "interestRate", "lat", "lon", "soldDate"]
    df[features_to_categorical] = df[features_to_categorical].astype("category")
    df[features_to_float] = df[features_to_float].astype(float)
    return df
# Download both models once at startup
model = downloadModel()
autoModel = downloadAutogluonModel()
def xgboostPred(df):
    # Drop categorical features
    df = df.drop(['streetName', 'brf', 'agency'], axis=1)
    # Predict each row separately and collect the results
    results = []
    for _, row in df.iterrows():
        input_list = row.to_numpy()
        res = model.predict(np.asarray(input_list).reshape(1, -1))
        results.append(res[0])
    return results
def addExtraAgencyFun(df):
    # Add 20 copies of the first row, one per top agency, so the model
    # can be queried for every agency in a single predict call
    firstRow = df.iloc[0]
    df2 = pd.DataFrame([firstRow] * len(topAgencies)).reset_index(drop=True)
    df2['agency'] = topAgencies
    # Concatenate the extra rows onto the original dataframe
    df = pd.concat([df, df2], ignore_index=True)
    return df
def autoPred(df):
    df = addExtraAgencyFun(df)
    res = autoModel.predict(df)
    # Convert to a list
    res = res.tolist()
    # The last 20 predictions correspond to the extra agency rows
    agencyResults = res[-20:]
    res = res[:-20]
    agencyToResult = {agency: result for agency, result in zip(topAgencies, agencyResults)}
    for agency, result in agencyToResult.items():
        print(agency, str(result))
    # Rank the agencies by predicted price and keep the top 5
    sortedAgencies = sorted(agencyToResult.items(), key=lambda x: x[1])
    meanPrice = sum(agencyResults) / len(agencyResults)
    top5 = sortedAgencies[-5:]
    top5.reverse()
    agencyString = parseAgencyResult(top5, meanPrice)
    return res, agencyString
def parseAgencyResult(top5, meanPrice):
    toReturn = 'To get the most money for your apartment, you should sell it with the help of one of these agencies:\n'
    toReturn += 'Top 5:\n'
    for agency, result in top5:
        diff = result - meanPrice
        toReturn += f'{agency}: {parsePrice(result)} ({parsePrice(diff)} above mean)\n'
    return toReturn
def isValidInput(streetName, number, sqm, rooms, monthlyFee, monthlyCost, floor, yearBuilt):
    # Street name is a string, all other values are numbers
    if streetName == '':
        return 'Street name is empty'
    # If the street name contains numbers it should fail
    if any(char.isdigit() for char in streetName):
        return 'Only letters are allowed in street name'
    toCheck = [number, sqm, rooms, monthlyFee, monthlyCost, floor, yearBuilt]
    toCheckName = ['number', 'sqm', 'rooms', 'monthlyFee', 'monthlyCost', 'floor', 'yearBuilt']
    for val, name in zip(toCheck, toCheckName):
        MIN = featureToMinMax[name][0]
        MAX = featureToMinMax[name][1]
        if val < MIN:
            return f'{featureToName.get(name)} is too low'
        if val > MAX:
            return f'{featureToName.get(name)} is too high'
    return None
def getDates():
    today = date.today()
    # inAMonth = today + timedelta(days=30)
    inAYear = today + timedelta(days=365)
    lastYear = today - timedelta(days=365)
    beforeUkraineWar = '2022-02-24'
    threeYearsAgo = today - timedelta(days=365 * 3)
    dateToExplanation = {
        today.strftime("%Y-%m-%d"): 'today',
        # inAMonth.strftime("%Y-%m-%d"): 'in a month',
        inAYear.strftime("%Y-%m-%d"): 'in a year',
        lastYear.strftime("%Y-%m-%d"): 'last year',
        threeYearsAgo.strftime("%Y-%m-%d"): 'three years ago',
        beforeUkraineWar: 'before Russia invaded Ukraine',
    }
    return dateToExplanation
def sthlm(streetName, number, sqm, rooms, monthlyFee, monthlyCost, floor, yearBuilt, auto):
    inputErrors = isValidInput(streetName, number, sqm, rooms, monthlyFee, monthlyCost, floor, yearBuilt)
    if inputErrors is not None:
        return '0', '', '', inputErrors
    lat, lon = getAddressInfo(streetName, number)
    if lat is None or lon is None:
        return '0', '', '', 'Address not found in the OpenStreetMap dataset (Nominatim), please try another address'
    agency = 'Notar'  # Placeholder agency; the AutoGluon path predicts across all top agencies
    brf = 'BRF Kartboken 1'  # TODO: remove
    dates = getDates()
    # Create one input row per date so the model can be queried for several points in time
    rows = []
    for soldDate in dates.keys():
        rows.append(
            pd.DataFrame(
                [[streetName, number, sqm, rooms, soldDate, monthlyFee, monthlyCost, floor, yearBuilt, brf, agency, lat, lon]],
                columns=columnHeaders))
    input_variables = pd.concat(rows, ignore_index=True)
    df = populateApartmentData(input_variables)
    df = normalizeData(df)
    pricePred = None
    agencyInfo = 'Please use AutoGluon instead of XGBoost to get information about agencies'
    if auto:
        pricePred, agencyInfo = autoPred(df)
    else:
        df = xgbFix(df)
        pricePred = xgboostPred(df)
    explanations = list(dates.values())
    result = []
    mainPred = None
    mainExplanation = None
    for i, pred in enumerate(pricePred):
        explanation = explanations[i]
        if i == 0:
            mainExplanation = explanation
            mainPred = pred
        else:
            diff = pred - mainPred
            if diff > 0:
                result.append(f'If sold {explanation} it would have been worth more: {parsePrice(pred)} (+{parsePrice(diff)})')
            else:
                result.append(f'If sold {explanation} it would have been worth less: {parsePrice(pred)} ({parsePrice(diff)})')
    return f'Predicted price of the apartment {mainExplanation}: {parsePrice(mainPred)}', '\n'.join(result), agencyInfo, ''
# All features present in the sthlm dataset
numericalInputs = ['number', 'sqm', 'rooms', 'monthlyFee', 'monthlyCost', 'floor', 'yearBuilt']
inputs = [gr.inputs.Textbox(lines=1, label='streetName')]
# Generate the input form
for feature in numericalInputs:
    minVal = featureToMinMax[feature][0]
    maxVal = featureToMinMax[feature][1]
    theLabel = f'{featureToName.get(feature)} (min: {minVal}, max: {maxVal})'
    inputs.append(gr.inputs.Number(default=0, label=theLabel))
# Add a switch to choose between xgboost and autogluon
inputs.append(gr.inputs.Checkbox(label='Use AutoGluon instead of XGBoost', default=False))
# Create the interface
resultOutputs = [
    gr.outputs.Label(label='Price if sold today'),
    gr.outputs.Textbox(label='If sold at a different time'),
    gr.outputs.Textbox(label='Best agencies to use'),
    gr.outputs.Textbox(label='Error'),
]
demo = gr.Interface(
    fn=sthlm,
    title="Stockholm Housing Valuation",
    description="Predict the price of an apartment in Stockholm\nTo get information about which agency to use, please select AutoGluon",
    allow_flagging="never",
    inputs=inputs,
    outputs=resultOutputs)

demo.launch()
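# Note: running this app locally should only require `python app.py`, assuming the
# Hopsworks API key is available to hopsworks.login() (for example via the
# HOPSWORKS_API_KEY environment variable) and the ./data/historical*.csv files exist.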