vavelychko committed on
Commit
9cfbab9
·
verified ·
1 Parent(s): 8ca52ab

add CustomNextPlaceModel

Browse files
Files changed (1) hide show
  1. CustomNextPlaceModel.py +189 -0
CustomNextPlaceModel.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple, TypedDict, Optional
2
+ import datetime
3
+ from datetime import datetime, timedelta
4
+ import pandas as pd
5
+ from next_place_ai.classes import DataPreparation, DatasetManager, AzureScore
6
+ from dotenv import load_dotenv
7
+ import os
8
+
9
+ load_dotenv()
10
+
11
+
12
class ProcessedSynapse(TypedDict):
    """Typed mapping describing a single property listing passed to the model.

    Every field is optional (may be ``None``); the key names and value types
    below are the complete schema declared by this file.
    """

    # Identifiers
    id: Optional[str]
    nextplace_id: Optional[str]
    property_id: Optional[str]
    listing_id: Optional[str]

    # Postal address
    address: Optional[str]
    city: Optional[str]
    state: Optional[str]
    zip_code: Optional[str]

    # Listing / physical attributes
    price: Optional[float]
    beds: Optional[int]
    baths: Optional[float]
    sqft: Optional[int]
    lot_size: Optional[int]
    year_built: Optional[int]
    days_on_market: Optional[int]

    # Geographic coordinates
    latitude: Optional[float]
    longitude: Optional[float]

    # Remaining listing metadata
    property_type: Optional[str]
    last_sale_date: Optional[str]
    hoa_dues: Optional[float]
    query_date: Optional[str]
34
+
35
+
36
class CustomNextPlaceModel:
    """Staged price / days-to-sale predictor assembled from ``AzureScore`` models.

    The pipeline runs an "A" scorer, then one of three "B" scorers selected by
    the ``A`` label, then one of six "C" (price) scorers selected by the ``B``
    label, and finally a "T" scorer that produces the ``days`` column.

    The repository id and access token are read from the ``REPO_ID`` and
    ``HF_TOKEN`` environment variables when the instance is created.
    """

    # (lookup key, B labels routed to that scorer, model filename).
    # Filenames are kept exactly as they were hard-coded originally; note the
    # last entry follows a different naming pattern than the others.
    _C_MODEL_SPECS = (
        ('1', (1,), 'model_[1]'),
        ('2', (2,), 'model_[2]'),
        ('3_4', (3, 4), 'model_[3, 4]'),
        ('5_6', (5, 6), 'model_[5, 6]'),
        ('7', (7,), 'model_[7]'),
        ('8_9', (8, 9), 'model_C_8_9'),
    )

    def __init__(self):
        """Read credentials from the environment and load every scorer."""
        self.repo_id = os.getenv('REPO_ID')
        self.hf_token = os.getenv('HF_TOKEN')
        self._load_model()

    def _make_scorer(self, model_filename, scored_labels):
        """Build one ``AzureScore`` bound to this instance's repo and token."""
        return AzureScore(
            repo_id=self.repo_id,
            token=self.hf_token,
            model_filename=model_filename,
            scored_labels=scored_labels,
        )

    def _load_model(self):
        """Load all required models for the prediction pipeline.

        Raises:
            ValueError: if constructing any scorer or the dataset manager
                fails; the original exception is attached as ``__cause__``.
        """
        try:
            # Model A scoring
            self.score_a = self._make_scorer('A', 'A')

            # Model B scorings (one per A label)
            self.score_b_1 = self._make_scorer('B_1', 'B')
            self.score_b_2 = self._make_scorer('B_2', 'B')
            self.score_b_3 = self._make_scorer('B_3', 'B')

            # Model C scorings (price), keyed by B-label bucket
            self.score_c_models = {
                key: self._make_scorer(filename, 'price')
                for key, _labels, filename in self._C_MODEL_SPECS
            }

            # Time model
            self.score_t_1 = self._make_scorer('model_T_1', 'days')

            # Data preparation module
            self.data_manager = DatasetManager(repo_id=self.repo_id, token=self.hf_token)

        except Exception as e:
            # Chain the cause so the underlying failure stays visible.
            raise ValueError(f"Error loading models: {str(e)}") from e

    def predict(self, validators_data: pd.DataFrame) -> pd.DataFrame:
        """Run the full A -> B -> C -> T scoring pipeline.

        Args:
            validators_data: Non-empty input DataFrame.

        Returns:
            DataFrame formed by column-wise concatenation of the non-zero
            ``price`` predictions and the time model's output cast to int.

        Raises:
            ValueError: if ``validators_data`` is not a DataFrame or is empty.
        """
        if not isinstance(validators_data, pd.DataFrame) or validators_data.empty:
            raise ValueError("Input must be a non-empty pandas DataFrame")

        dp = DataPreparation(validators_data)
        dp.prepare_data()  # presumably populates dp.X - confirm in DataPreparation

        # Stage A
        score_A = self.score_a.predict_proba_dataset(dp.X)

        combined_dataset = dp.combine_datasets(score_A, dp.X)
        combined_dataset = combined_dataset.drop(columns=['0'])
        combined_dataset, _ = dp.create_convolution_features(
            combined_dataset, combined_dataset.columns.to_list(), 3
        )

        # Stage B: each B scorer only sees the rows carrying its A label.
        b_scorers = ((1, self.score_b_1), (2, self.score_b_2), (3, self.score_b_3))
        b_parts = [
            scorer.predict_proba_dataset(combined_dataset[combined_dataset['A'] == label])
            for label, scorer in b_scorers
        ]
        # NOTE(review): rows are now grouped by A label with a fresh index; whether
        # that lines up with dp.X depends on combine_datasets - confirm.
        df_B = pd.concat(b_parts, ignore_index=True)

        combined_dataset = dp.combine_datasets(df_B, dp.X)
        combined_dataset = combined_dataset.drop(columns=['0'])
        combined_dataset, _ = dp.create_convolution_features(
            combined_dataset, combined_dataset.columns.to_list(), 3
        )

        # Stage C: route rows to the price scorer for their B bucket.
        price_parts = []
        for key, labels, _filename in self._C_MODEL_SPECS:
            subset = combined_dataset[combined_dataset['B'].isin(list(labels))]
            if subset.empty:
                # Placeholder row for an unused bucket; removed by the
                # ``price != 0`` filter below.
                scored = pd.DataFrame({'price': [0]})
            else:
                scored = self.score_c_models[key].predict_dataset(subset)
            if isinstance(scored, pd.DataFrame) and 'price' in scored.columns and not scored.empty:
                price_parts.append(scored[['price']])
        df_C = pd.concat(price_parts, ignore_index=True)

        # Drops the placeholders (and any genuine prediction equal to 0).
        df_C_ = df_C[df_C['price'] != 0].copy()

        # NOTE(review): prices are ordered by B bucket while combined_dataset keeps
        # its own order; this positional concat assumes they match - confirm.
        t_df_ = pd.concat(
            [combined_dataset.reset_index(drop=True), df_C_.reset_index(drop=True)],
            axis=1,
        )

        # Stage T
        score_t_1 = self.score_t_1.predict_dataset(t_df_).astype(int)

        result = pd.concat(
            [df_C_.reset_index(drop=True), score_t_1.reset_index(drop=True)],
            axis=1,
        )

        return result

    def run_inference(self, input_data: "ProcessedSynapse") -> Tuple[float, str]:
        """Predict the sale price and sale date for one listing.

        Args:
            input_data: Mapping describing a single listing. Its
                ``days_on_market`` value (missing, ``None`` or NaN counts as
                0) is used to back-date the listing from the current time.

        Returns:
            ``(predicted_sale_price, predicted_sale_date)`` where the date is
            formatted as ``YYYY-MM-DD``.
        """
        # Keep the original mapping intact: it is read again below, and a
        # DataFrame column cannot be used in a boolean ``or`` expression.
        frame = pd.DataFrame([input_data])
        result = self.predict(frame)

        # Convert to built-in types: timedelta rejects NumPy integers.
        predicted_sale_price = float(result['price'].iloc[0])
        predicted_days = int(result['days'].iloc[0])

        raw_days_on_market = input_data.get('days_on_market')
        if raw_days_on_market is None or pd.isna(raw_days_on_market):
            current_days_on_market = 0
        else:
            current_days_on_market = int(raw_days_on_market)

        # Date the property was listed on the market
        date_listed = datetime.now() - timedelta(days=current_days_on_market)

        # Predicted sale date, as a string
        predicted_sale_date = (date_listed + timedelta(days=predicted_days)).strftime('%Y-%m-%d')

        return predicted_sale_price, predicted_sale_date