# Nathanotal's picture
# asdf
# 9f26ac9
# raw
# history blame
# 7.87 kB
import gradio as gr
import numpy as np
from PIL import Image
import requests
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import joblib
import hopsworks
from tqdm import tqdm
import xgboost as xgb
from geopy.geocoders import Nominatim
from datetime import date
from datetime import timedelta
# Login to hopsworks and get the feature store
# TODO: Remove brf
# area;streetName;number;sqm;rooms;soldDate;monthlyFee;monthlyCost;floor;yearBuilt;brf;agency;lat;lon;gdp;unemployment;interestRate
columnHeaders = ['area','streetName','number','sqm','rooms','soldDate','monthlyFee','monthlyCost','floor','yearBuilt', 'brf','agency','lat','lon','gdp','unemployment','interestRate']
featureToMinMax = {
'sqm': (10, 800),
'rooms': (1, 20),
'monthlyFee': (0, 60000),
'monthlyCost': (0, 20000),
'floor': (-3, 35),
'yearBuilt': (1850, 2023),
'lat': (58.8, 60.2),
'lon': (17.5, 19.1),
'gdp': (505.1, 630.14),
'unemployment': (6.36, 8.66),
'interestRate': (-0.5, 2.64),
'number': (0, 300),
'soldDate': (2010, 2025)
} # Extracted from the data
def downloadModel():
    """Download the saved XGBoost model from the Hopsworks model registry.

    The Hopsworks API key is read from the ``HOPSWORKS_API_KEY`` environment
    variable rather than being embedded in the source. When the variable is
    unset, ``None`` is passed as ``api_key_value``; how ``hopsworks.login``
    then resolves credentials is up to that library (see its documentation).

    Returns:
        The object loaded by ``joblib`` from ``xgboost_model.pkl`` inside
        the downloaded directory of ``xgboost_model`` version 5.
    """
    import os  # local import: only this function needs it

    # SECURITY: a key was previously committed in plain text on this line;
    # it should be treated as leaked and revoked.
    api_key = os.environ.get('HOPSWORKS_API_KEY')
    project = hopsworks.login(api_key_value=api_key)
    mr = project.get_model_registry()
    registered_model = mr.get_model("xgboost_model", version=5)
    model_path = registered_model.download()
    # NOTE(review): joblib.load unpickles the file, so the registry content
    # must be trusted.
    xgb_model = joblib.load(os.path.join(model_path, "xgboost_model.pkl"))
    return xgb_model
def getAddressInfo(streetName, number):
    """Geocode a street address, tolerating unknown addresses.

    Args:
        streetName: Street name as typed by the user; it is passed through
            ``cleanAddress`` before the lookup.
        number: House number, forwarded unchanged to the geocoder helper.

    Returns:
        The ``(lat, lon)`` tuple from ``getCoordinatesFromAddress``, or
        ``(None, None)`` when that helper raises ``AddressNotFound``.
    """
    cleaned = cleanAddress(streetName)
    try:
        coordinates = getCoordinatesFromAddress(cleaned, number)
    except AddressNotFound:
        coordinates = (None, None)
    return coordinates
def cleanAddress(x):
    """Return the street name with hyphens and invisible characters removed.

    The following characters are deleted wherever they occur: ``-``,
    zero-width space (U+200B), non-breaking space (U+00A0), non-breaking
    hyphen (U+2011), soft hyphen (U+00AD) and zero-width non-joiner (U+200C).
    Leading and trailing whitespace is then stripped.

    Args:
        x: Raw street name string.

    Returns:
        The cleaned street name.
    """
    unwanted = '-\u200b\u00a0\u2011\xad\u200c'
    cleaned = x.translate(str.maketrans('', '', unwanted))
    # str.strip() returns a new string; the original discarded that result,
    # so surrounding whitespace was never actually removed.
    return cleaned.strip()
class AddressNotFound(Exception):
    """Raised when the geocoder returns no match for the requested address."""
def getCoordinatesFromAddress(streetName, number):
    """Look up the coordinates of a Stockholm address via a Nominatim server.

    Args:
        streetName: Street name used in the query string.
        number: House number; anything accepted by ``float()``. A value of
            0 means the query is made for the street only.

    Returns:
        ``(lat, lon)`` rounded to six decimals.

    Raises:
        AddressNotFound: If the geocoder returns ``None`` for the query.
    """
    host, port = '165.227.162.37', '8080'
    contact = 'nathan.allard@gmail.com'
    geocoder = Nominatim(
        user_agent=contact,
        domain=host + ':' + port,
        scheme='http',
        timeout=10,
    )

    # Normalise e.g. 12.0 or "12" to the plain string "12".
    houseNumber = str(int(float(number)))
    if houseNumber == '0':
        query = f'{streetName}, Stockholm'
    else:
        query = f'{streetName} {houseNumber}, Stockholm'

    match = geocoder.geocode(query)
    if match is None:
        raise AddressNotFound

    # Six decimals of precision (accuracy of <1 meter).
    return round(match.latitude, 6), round(match.longitude, 6)
def getFinancialInfo(date):
    """Return macro-economic figures for a sale date.

    The values are currently fixed constants; ``date`` is accepted but not
    used.

    Args:
        date: Sale date (ignored).

    Returns:
        Tuple ``(gdp, unemployment, interestRate)``, always
        ``(600.0, 7.0, 0)``.
    """
    # The former `gdp, unemployment, interestRate = None, None, None`
    # assignment was dead code and has been dropped.
    return 600.0, 7.0, 0
def dateToFloat(date):
    """Convert a ``YYYY-MM-DD[ time]`` value to a fractional year.

    Args:
        date: Any object whose ``str()`` looks like ``YYYY-MM-DD``,
            optionally followed by a space and a time part.

    Returns:
        ``year + month / 12 + day / 365`` as a float.
    """
    year, month, remainder = str(date).split('-')
    # Discard a trailing time component such as " 00:00:00".
    day = remainder.partition(' ')[0]
    yearPart = int(year)
    monthPart = int(month) / 12
    dayPart = int(day) / 365
    return yearPart + monthPart + dayPart
def normalize(x, minVal, maxVal, feature):
    """Min-max scale ``x`` into the closed interval [0, 1].

    Args:
        x: Value convertible with ``float()``.
        minVal: Lower bound of the feature range.
        maxVal: Upper bound of the feature range.
        feature: Feature name (not used in the computation).

    Returns:
        ``(x - minVal) / (maxVal - minVal)``, clamped to 0 below the range
        and to 1 above it.
    """
    scaled = (float(x) - minVal) / (maxVal - minVal)
    if scaled < 0:
        return 0
    if scaled > 1:
        return 1
    return scaled
def normalizeData(df):
    """Scale every feature listed in ``featureToMinMax`` to [0, 1], in place.

    Done by hand so the UI can transform its input the same way. The
    ``soldDate`` column is first converted to a fractional year with
    ``dateToFloat``.

    Args:
        df: DataFrame containing a column for each key of
            ``featureToMinMax``; it is modified in place.

    Returns:
        The same DataFrame object.
    """
    print('Normalizing data...')
    for feature, (lower, upper) in tqdm(featureToMinMax.items()):
        if feature == 'soldDate':
            df[feature] = df[feature].apply(dateToFloat)
        df[feature] = df[feature].apply(
            lambda value: normalize(value, lower, upper, feature))
    return df
def parsePrice(price):
    """Convert a normalized price prediction back to a SEK string.

    Inverts the min-max scaling ``(p - MIN) / (MAX - MIN)`` that
    ``normalize`` in this file applies to features, i.e.
    ``price * (MAX - MIN) + MIN``. The previous ``price * MAX + MIN``
    overshot by ``price * MIN``.
    NOTE(review): assumes the training target was scaled with that same
    formula and these bounds; confirm against the training pipeline.

    Args:
        price: Normalized price, nominally in [0, 1].

    Returns:
        The price truncated to an integer, formatted as ``'<amount> SEK'``.
    """
    PRICE_MIN, PRICE_MAX = 1.5e5, 7e7
    sek = price * (PRICE_MAX - PRICE_MIN) + PRICE_MIN
    return f'{int(sek)} SEK'
def xgbFix(df):
    """Cast the columns of ``df`` to the dtypes used for prediction, in place.

    The text columns become ``category`` and the numerical columns become
    ``float``.

    Args:
        df: DataFrame holding all of the columns listed below.

    Returns:
        The same DataFrame object.
    """
    dtypeToColumns = (
        ("category", ["area", "streetName", "brf", "agency"]),
        (float, ["number", "sqm", "rooms", "monthlyFee", "monthlyCost",
                 "floor", "yearBuilt", "gdp", "unemployment",
                 "interestRate", "lat", "lon", "soldDate"]),
    )
    for dtype, columns in dtypeToColumns:
        df[columns] = df[columns].astype(dtype)
    return df
# Fetch the model once at import time; xgboostPred reads this module-level
# object for every prediction.
model = downloadModel()
def xgboostPred(df, explanation=None):
    """Run the module-level ``model`` on every row of ``df``.

    Previously only the first row was predicted and ``explanation`` was a
    required argument, although the caller in this file passes only ``df``
    and expects one prediction per row.

    Args:
        df: DataFrame containing the columns 'area', 'streetName', 'brf'
            and 'agency' (which are dropped) plus the numerical features.
        explanation: Unused; optional so that both one- and two-argument
            calls work.

    Returns:
        The result of ``model.predict`` on the remaining columns,
        presumably an array with one normalized price per row.
    """
    # Drop categorical features; the model is fed numerical values only.
    numeric = df.drop(['area', 'streetName', 'brf', 'agency'], axis=1)
    # For a single-row frame this is the same (1, n_features) array the
    # original built via iloc[0].reshape(1, -1).
    return model.predict(numeric.to_numpy())
def autoPred():
    """Placeholder with no implementation; always returns ``None``."""
    return None
def getDates():
    """Build the sale dates to predict for, each with a human-readable label.

    Returns:
        Dict mapping ``YYYY-MM-DD`` strings to labels, in this order: today,
        +30 days, +365 days, -365 days, -3*365 days, and the fixed date
        2022-02-24.
    """
    fmt = "%Y-%m-%d"
    now = date.today()
    relativeDates = [
        (now, 'today'),
        (now + timedelta(days=30), 'in a month'),
        (now + timedelta(days=365), 'in a year'),
        (now - timedelta(days=365), 'last year'),
        (now - timedelta(days=365 * 3), 'three years ago'),
    ]
    dateToExplanation = {day.strftime(fmt): label for day, label in relativeDates}
    dateToExplanation['2022-02-24'] = 'before Russia invaded Ukraine'
    return dateToExplanation
def sthlm(streetName, area, number, sqm, rooms, monthlyFee, monthlyCost, floor, yearBuilt):
    """Predict the apartment price for each date returned by ``getDates``.

    Fixes relative to the previous version: rows are collected in a list
    instead of using ``DataFrame.append`` (removed in pandas 2.0, and it
    produced a duplicate index); ``xgboostPred`` is called with both of its
    arguments; and the labels are iterated directly instead of indexing a
    ``dict_values`` view, which raises ``TypeError``.

    Args:
        streetName: Street name typed by the user.
        area: Area selected in the dropdown.
        number: House number.
        sqm: Living area.
        rooms: Number of rooms.
        monthlyFee: Monthly fee.
        monthlyCost: Monthly cost.
        floor: Floor of the apartment.
        yearBuilt: Construction year.

    Returns:
        Tuple ``(predictions, error)``. On success ``predictions`` holds one
        line per date and ``error`` is ``''``; if the address cannot be
        geocoded ``predictions`` is ``''`` and ``error`` explains why.
    """
    lat, lon = getAddressInfo(streetName, number)
    if lat is None or lon is None:
        return '', 'Address not found in the OpenStreetMap dataset (Nominatim), please try another address'

    # Fixed placeholder values; these columns are dropped before prediction.
    agency = 'Notar'
    brf = 'BRF Kartboken 1'  # TODO: remove

    dates = getDates()

    # One input row per candidate sale date.
    rows = []
    for soldDate in dates:
        gdp, unemployment, interestRate = getFinancialInfo(soldDate)
        rows.append([area, streetName, number, sqm, rooms, soldDate,
                     monthlyFee, monthlyCost, floor, yearBuilt, brf, agency,
                     lat, lon, gdp, unemployment, interestRate])
    input_variables = pd.DataFrame(rows, columns=columnHeaders)

    df = normalizeData(input_variables)
    df = xgbFix(df)

    result = []
    for i, explanation in enumerate(dates.values()):
        # Predict one row at a time so the output does not depend on
        # whether xgboostPred handles multi-row frames.
        pred = xgboostPred(df.iloc[[i]], explanation)[0]
        result.append(f'Predicted price of the apartment {explanation}: {parsePrice(pred)}')
    return '\n'.join(result), ''
# Features the user supplies through the form. The widgets are appended in
# the same order as the parameters of sthlm: streetName, the categorical
# inputs, then the numerical inputs.
numericalInputs = ['number', 'sqm', 'rooms', 'monthlyFee', 'monthlyCost', 'floor', 'yearBuilt']
categoricalInputs = ['area']
inputs = [gr.inputs.Textbox(lines=1, label='streetName')]

# Dropdown choices per categorical feature. The dict is keyed by the feature
# name; it used to have a single literal key 'feature', so every categorical
# input would have received the same choices.
catToInput = {
    'area': ['Bromma', 'Abrahamsberg', 'Akalla'],
}

# Generate the input form
for feature in categoricalInputs:
    choices = catToInput[feature]
    # Default to an actual choice (the old default "a" was not in the list).
    inputs.append(gr.inputs.Dropdown(
        choices=choices, default=choices[0], label=feature))

for feature in numericalInputs:
    minVal, maxVal = featureToMinMax[feature]
    # NOTE(review): confirm that the installed gradio version accepts
    # minimum/maximum on gr.inputs.Number.
    inputs.append(gr.inputs.Number(default=0, label=feature, minimum=minVal, maximum=maxVal))
# Wire the generated form to sthlm and start serving the UI. The two text
# outputs correspond to the (predictions, error) tuple that sthlm returns.
demo = gr.Interface(
    fn=sthlm,
    inputs=inputs,
    outputs=['text', 'text'],
    title="Stockholm Housing Valuation",
    description="Predict the price of an apartment in Stockholm",
    allow_flagging="never",
)
demo.launch()