add CustomNextPlaceModel
Browse files- CustomNextPlaceModel.py +189 -0
CustomNextPlaceModel.py
ADDED
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Tuple, TypedDict, Optional
|
2 |
+
import datetime
|
3 |
+
from datetime import datetime, timedelta
|
4 |
+
import pandas as pd
|
5 |
+
from next_place_ai.classes import DataPreparation, DatasetManager, AzureScore
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
import os
|
8 |
+
|
9 |
+
load_dotenv()
|
10 |
+
|
11 |
+
|
12 |
+
class ProcessedSynapse(TypedDict):
    """Typed view of one property-listing record passed to the model.

    Every field is annotated ``Optional``: the value may be ``None``.
    """

    # Identifiers
    id: Optional[str]
    nextplace_id: Optional[str]
    property_id: Optional[str]
    listing_id: Optional[str]

    # Postal address
    address: Optional[str]
    city: Optional[str]
    state: Optional[str]
    zip_code: Optional[str]

    # Listing attributes
    price: Optional[float]
    beds: Optional[int]
    baths: Optional[float]
    sqft: Optional[int]
    lot_size: Optional[int]
    year_built: Optional[int]
    days_on_market: Optional[int]

    # Coordinates
    latitude: Optional[float]
    longitude: Optional[float]

    # Remaining descriptive fields (dates are carried as strings)
    property_type: Optional[str]
    last_sale_date: Optional[str]
    hoa_dues: Optional[float]
    query_date: Optional[str]
|
34 |
+
|
35 |
+
|
36 |
+
class CustomNextPlaceModel:
    """Staged predictor returning a sale price and a sale date for a listing.

    The pipeline chains four groups of ``AzureScore`` models: stage A, stage B
    (one model per A class), stage C (one price model per group of B classes)
    and a final time model. All of them are loaded once, at construction time.
    """

    def __init__(self):
        """Read ``REPO_ID`` / ``HF_TOKEN`` from the environment and load the models.

        Raises:
            ValueError: If any model fails to load (see ``_load_model``).
        """
        self.repo_id = os.getenv('REPO_ID')
        self.hf_token = os.getenv('HF_TOKEN')
        self._load_model()

    def _load_model(self):
        """Load all required models for the prediction pipeline.

        Raises:
            ValueError: Wraps any exception raised while building the scorers;
                the original exception is kept as ``__cause__``.
        """

        def load(model_filename, scored_labels):
            # Every scorer shares the same repository and token.
            return AzureScore(
                repo_id=self.repo_id,
                token=self.hf_token,
                model_filename=model_filename,
                scored_labels=scored_labels,
            )

        try:
            # Model A scoring
            self.score_a = load('A', 'A')

            # Model B scorings (one per A class)
            self.score_b_1 = load('B_1', 'B')
            self.score_b_2 = load('B_2', 'B')
            self.score_b_3 = load('B_3', 'B')

            # Model C scorings, keyed by the group of B classes they serve
            self.score_c_models = {
                '1': load('model_[1]', 'price'),
                '2': load('model_[2]', 'price'),
                '3_4': load('model_[3, 4]', 'price'),
                '5_6': load('model_[5, 6]', 'price'),
                '7': load('model_[7]', 'price'),
                # NOTE(review): this filename does not follow the 'model_[...]'
                # pattern of its siblings - confirm it matches the repository.
                '8_9': load('model_C_8_9', 'price'),
            }

            # Time model
            self.score_t_1 = load('model_T_1', 'days')

            # Data preparation module
            self.data_manager = DatasetManager(repo_id=self.repo_id, token=self.hf_token)

        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"Error loading models: {str(e)}") from e

    def predict(self, validators_data: pd.DataFrame) -> pd.DataFrame:
        """Run the full A -> B -> C -> time pipeline on the input rows.

        Args:
            validators_data: Non-empty input dataset.

        Returns:
            The stage-C ``price`` column concatenated column-wise with the
            output of the time model.

        Raises:
            ValueError: If the input is not a non-empty pandas DataFrame.
        """
        if not isinstance(validators_data, pd.DataFrame) or validators_data.empty:
            raise ValueError("Input must be a non-empty pandas DataFrame")

        dp = DataPreparation(validators_data)
        dp.prepare_data()

        # Stage A: score the prepared features, then rebuild the feature set.
        score_A = self.score_a.predict_proba_dataset(dp.X)
        combined_dataset = dp.combine_datasets(score_A, dp.X)
        combined_dataset = combined_dataset.drop(columns=['0'])
        combined_dataset, _ = dp.create_convolution_features(
            combined_dataset, combined_dataset.columns.to_list(), 3
        )

        # Stage B: each A class is scored by its own model.
        b_models = (
            (1, self.score_b_1),
            (2, self.score_b_2),
            (3, self.score_b_3),
        )
        b_frames = [
            model.predict_proba_dataset(combined_dataset[combined_dataset['A'] == a_class])
            for a_class, model in b_models
        ]
        # NOTE(review): rows are now grouped by A class rather than in input
        # order; confirm combine_datasets does not pair them positionally
        # with dp.X when more than one row is scored.
        df_B = pd.concat(b_frames, ignore_index=True)

        combined_dataset = dp.combine_datasets(df_B, dp.X)
        combined_dataset = combined_dataset.drop(columns=['0'])
        combined_dataset, _ = dp.create_convolution_features(
            combined_dataset, combined_dataset.columns.to_list(), 3
        )

        # Stage C: one price model per group of B classes.
        c_groups = {
            '1': [1],
            '2': [2],
            '3_4': [3, 4],
            '5_6': [5, 6],
            '7': [7],
            '8_9': [8, 9],
        }
        c_scores = {}
        for key, b_classes in c_groups.items():
            subset = combined_dataset[combined_dataset['B'].isin(b_classes)]
            if subset.empty:
                # Placeholder row; removed again by the price != 0 filter below.
                c_scores[key] = pd.DataFrame({'price': [0]})
            else:
                c_scores[key] = self.score_c_models[key].predict_dataset(subset)

        df_C = pd.concat(
            [
                frame[['price']]
                for frame in c_scores.values()
                if isinstance(frame, pd.DataFrame)
                and 'price' in frame.columns
                and not frame.empty
            ],
            ignore_index=True,
        )

        # Drops the placeholders (and any prediction that is exactly 0).
        df_C_ = df_C[df_C['price'] != 0].copy()

        # Attach the prices to the features by position and predict the time.
        t_df_ = pd.concat(
            [combined_dataset.reset_index(drop=True), df_C_.reset_index(drop=True)],
            axis=1,
        )
        score_t_1 = self.score_t_1.predict_dataset(t_df_).astype(int)

        result = pd.concat(
            [df_C_.reset_index(drop=True), score_t_1.reset_index(drop=True)],
            axis=1,
        )

        return result

    @staticmethod
    def _predicted_sale_date(days_on_market, predicted_days, today=None) -> str:
        """Turn a predicted number of days into a ``YYYY-MM-DD`` sale date.

        Args:
            days_on_market: Days the listing has been on the market so far;
                ``None`` or NaN is treated as 0.
            predicted_days: Predicted days, counted from the listing date.
            today: Reference "now"; defaults to ``datetime.now()``.

        Returns:
            The predicted sale date formatted as ``'%Y-%m-%d'``.
        """
        if days_on_market is None or pd.isna(days_on_market):
            days_on_market = 0
        reference = datetime.now() if today is None else today

        # Date the property was listed on the market.
        date_listed = reference - timedelta(days=int(days_on_market))

        # Predicted sale date. int() is required because timedelta rejects
        # NumPy integer scalars.
        return (date_listed + timedelta(days=int(predicted_days))).strftime('%Y-%m-%d')

    # The annotation is quoted so it is resolved lazily and the class does not
    # depend on the definition order of ProcessedSynapse.
    def run_inference(self, input_data: "ProcessedSynapse") -> Tuple[float, str]:
        """Predict the sale price and sale date for a single listing.

        Args:
            input_data: Mapping describing one listing.

        Returns:
            ``(predicted_sale_price, predicted_sale_date)`` where the date is
            a ``'%Y-%m-%d'`` string.
        """
        # Read the scalar from the mapping itself: once the record is wrapped
        # in a DataFrame, .get() yields a Series whose truth value is ambiguous.
        current_days_on_market = input_data.get('days_on_market')

        result = self.predict(pd.DataFrame([input_data]))

        # Convert NumPy scalars to plain Python values.
        predicted_sale_price = float(result['price'].iloc[0])
        predicted_days = int(result['days'].iloc[0])

        predicted_sale_date = self._predicted_sale_date(
            current_days_on_market, predicted_days
        )

        return predicted_sale_price, predicted_sale_date
|