Jacaranda committed on
Commit
6de6e27
1 Parent(s): c8f83c0

Upload facility_predict.py

Browse files
Files changed (1) hide show
  1. facility_predict.py +26 -9
facility_predict.py CHANGED
@@ -1,6 +1,6 @@
 
1
  import os
2
  import random
3
- import gradio as gr
4
  import json
5
  import numpy as np
6
  import torch
@@ -13,7 +13,8 @@ from torch.utils.data import TensorDataset, DataLoader
13
 
14
  class Preprocess:
15
  def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
16
- self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path, use_auth_token='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')
 
17
  self.max_len = tokenizer_max_len
18
 
19
  def clean_text(self, text):
@@ -59,13 +60,15 @@ class Preprocess:
59
  data = TensorDataset(input_ids, attention_mask)
60
  return data
61
 
 
62
  class Facility_Model:
63
  def __init__(self, facility_model_path: any,
64
  max_len: int):
65
  self.max_len = max_len
66
  self.softmax = torch.nn.Softmax(dim=1)
67
  self.gpu = False
68
- self.model = AutoModelForSequenceClassification.from_pretrained(facility_model_path, use_auth_token='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')
 
69
  self.model.eval() # set pytorch model for inference mode
70
 
71
  if torch.cuda.device_count() > 1:
@@ -105,9 +108,10 @@ class Facility_Model:
105
  """
106
  output_dict = {}
107
  # transform the relation table(between label and intent)
108
- path_table = pd.read_csv('dhis_label_relation_14357.csv')
109
 
110
- label_intent_dict = path_table[["label", "corresponding_label"]].set_index("corresponding_label").to_dict()['label']
 
111
 
112
  # transform the output into dictionary(between intent and probability)
113
  for intent in range(pred.shape[1]):
@@ -157,17 +161,30 @@ def predict_batch_from_csv(input_file, output_file):
157
  # Initialize predictions list
158
  predictions = []
159
 
 
160
  # Iterate over rows with tqdm for progress tracking
161
  for _, row in tqdm(batch_data.iterrows(), total=len(batch_data)):
162
- text = row['facility_name'] # Replace 'facility_name' with the actual column name containing the text data
163
- cleaned_text = processor.clean_text(text)
 
 
 
 
 
164
  prepared_data = processor.process_tokenizer(cleaned_text)
165
- prediction = obj_Facility_Model.inference(prepared_data)
 
 
 
 
 
166
  predictions.append(prediction)
167
 
168
  # Create DataFrame for predictions
169
  output_data = pd.DataFrame({'prediction': predictions})
 
170
  # Merge with input DataFrame
171
- pred_output_df = pd.concat([batch_data, output_data], axis=1)
 
172
  # Save predictions to CSV
173
  pred_output_df.to_csv(output_file, index=False)
 
1
+ # writefile facility_predict.py
2
  import os
3
  import random
 
4
  import json
5
  import numpy as np
6
  import torch
 
13
 
14
  class Preprocess:
15
  def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
16
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
17
+ use_auth_token='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')
18
  self.max_len = tokenizer_max_len
19
 
20
  def clean_text(self, text):
 
60
  data = TensorDataset(input_ids, attention_mask)
61
  return data
62
 
63
+
64
  class Facility_Model:
65
  def __init__(self, facility_model_path: any,
66
  max_len: int):
67
  self.max_len = max_len
68
  self.softmax = torch.nn.Softmax(dim=1)
69
  self.gpu = False
70
+ self.model = AutoModelForSequenceClassification.from_pretrained(facility_model_path,
71
+ use_auth_token='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')
72
  self.model.eval() # set pytorch model for inference mode
73
 
74
  if torch.cuda.device_count() > 1:
 
108
  """
109
  output_dict = {}
110
  # transform the relation table(between label and intent)
111
+ path_table = pd.read_csv('/content/drive/MyDrive/dhis14000/dhis_label_relation_14357.csv')
112
 
113
+ label_intent_dict = path_table[["label", "corresponding_label"]].set_index("corresponding_label").to_dict()[
114
+ 'label']
115
 
116
  # transform the output into dictionary(between intent and probability)
117
  for intent in range(pred.shape[1]):
 
161
  # Initialize predictions list
162
  predictions = []
163
 
164
+ # Iterate over rows with tqdm for progress tracking
165
  # Iterate over rows with tqdm for progress tracking
166
  for _, row in tqdm(batch_data.iterrows(), total=len(batch_data)):
167
+ text = row['pnc_fac_name'] # Replace 'pnc_fac_name' with the actual column name containing the text data
168
+
169
+ if pd.isnull(text):
170
+ cleaned_text = ""
171
+ else:
172
+ cleaned_text = processor.clean_text(text)
173
+
174
  prepared_data = processor.process_tokenizer(cleaned_text)
175
+
176
+ if cleaned_text == "":
177
+ prediction = "" # Set prediction as empty string
178
+ else:
179
+ prediction = obj_Facility_Model.inference(prepared_data)
180
+
181
  predictions.append(prediction)
182
 
183
  # Create DataFrame for predictions
184
  output_data = pd.DataFrame({'prediction': predictions})
185
+
186
  # Merge with input DataFrame
187
+ pred_output_df = pd.concat([batch_data.reset_index(drop=True), output_data], axis=1)
188
+
189
  # Save predictions to CSV
190
  pred_output_df.to_csv(output_file, index=False)