Jacaranda committed
Commit 7dbe05a
Parent: 719f5b1

uploaded main endpoint file

Files changed (1): facility_predict.py (+172, -0)
facility_predict.py ADDED

import random
import numpy as np
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader


class Preprocess:
    def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
                                                       use_auth_token='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')
        self.max_len = tokenizer_max_len

    def clean_text(self, text):
        """Lowercase the text and drop common English/Swahili filler words."""
        text = text.lower()
        stopwords = ["i", "was", "transferred", "from", "to", "nilienda",
                     "kituo", "cha", "lakini", "saa", "hii", "niko", "at",
                     "nikahudumiwa", "pole", "deliver", "na", "ni", "baada",
                     "ya", "kutumwa", "kutoka", "ndipo", "nikapewa", "hiyo",
                     "linda", "mama", "nikawa", "mgonjwa", "nikatibiwa",
                     "in", "had", "a", "visit", "gynaecologist", "ndio",
                     "karibu", "mimi", "sehemu", "hospitali", "serikali",
                     "delivered", "katika", "kaunti", "kujifungua", "huko",
                     "nilipoenda", "kwa", "bado", "naedelea", "sija",
                     "maliza", "mwisho", "nilianza", "kliniki", "yangu",
                     "nilianzia", "nilijifungua"]
        text_single = ' '.join(word for word in text.split() if word not in stopwords)
        return text_single

    def encode_fn(self, text_single):
        """
        Use the tokenizer to preprocess the text.
        Example of text_single: 'Nairobi Hospital'
        """
        encoding = self.tokenizer(text_single,
                                  padding=True,
                                  truncation=True,
                                  max_length=self.max_len,
                                  return_tensors='pt')
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        return input_ids, attention_mask

    def process_tokenizer(self, text_single):
        """
        Preprocess text and prepare a dataset for a single new sentence.
        """
        input_ids, attention_mask = self.encode_fn(text_single)
        data = TensorDataset(input_ids, attention_mask)
        return data

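# Illustrative usage sketch (not part of the original commit); assumes the
# same Hugging Face repo that is loaded further below:
# processor = Preprocess(tokenizer_vocab_path="Jacaranda/dhis_14000_600k_Test_Model",
#                        tokenizer_max_len=128)
# dataset = processor.process_tokenizer(processor.clean_text("Nilienda Nairobi Hospital"))
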
class Facility_Model:
    def __init__(self, facility_model_path: str, max_len: int):
        self.max_len = max_len
        self.softmax = torch.nn.Softmax(dim=1)
        self.gpu = False  # set to True to run inference on CUDA
        self.model = AutoModelForSequenceClassification.from_pretrained(facility_model_path,
                                                                        use_auth_token='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')
        self.model.eval()  # put the PyTorch model in inference mode

        if self.gpu:
            seed = 42
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            torch.backends.cudnn.deterministic = True
            self.device = torch.device('cuda')
            if torch.cuda.device_count() > 1:
                # wrap in DataParallel only when CUDA devices are in use
                self.model = torch.nn.DataParallel(self.model)
        else:
            self.device = 'cpu'

        self.model = self.model.to(self.device)

    def predict_single(self, model, pred_data):
        """
        Model inference for a new single sentence.
        """
        pred_dataloader = DataLoader(pred_data, batch_size=10, shuffle=False)
        probability_list = None
        for batch in pred_dataloader:
            with torch.no_grad():
                outputs = model(input_ids=batch[0].to(self.device),
                                attention_mask=batch[1].to(self.device))
                # no labels are supplied, so only the logits are meaningful
                probability = self.softmax(outputs.logits)
                probability_list = probability.detach().cpu().numpy()
        return probability_list

    def output_intent_probability(self, pred: np.ndarray) -> dict:
        """
        Convert the model output into a dictionary mapping every intent
        (facility label) to its probability.
        """
        output_dict = {}
        # load the label/intent relation table once and cache it on the instance
        if not hasattr(self, 'label_intent_dict'):
            path_table = pd.read_csv('/content/drive/MyDrive/dhis14000/dhis_label_relation_14357.csv')
            self.label_intent_dict = (path_table[["label", "corresponding_label"]]
                                      .set_index("corresponding_label")
                                      .to_dict()['label'])

        # transform the output into a dictionary (between intent and probability)
        for intent in range(pred.shape[1]):
            output_dict[self.label_intent_dict[intent]] = pred[0][intent]

        return output_dict

    def inference(self, prepared_data):
        """
        Make a prediction on one new sentence and return the top facility name.
        """
        prob_distribution = self.predict_single(self.model, prepared_data)
        prediction_results = self.output_intent_probability(prob_distribution.astype(float))

        # filter out predictions containing "dental", "optical" or "eye" keywords
        filtered_results = {intent: prob for intent, prob in prediction_results.items()
                            if "dental" not in intent.lower()
                            and "optical" not in intent.lower()
                            and "eye" not in intent.lower()}

        # keep only the top result
        sorted_pred_intent_results = sorted(filtered_results.items(), key=lambda x: x[1], reverse=True)
        top_results = dict(sorted_pred_intent_results[:1])
        final_preds = ', '.join(top_results.keys())
        final_preds = final_preds.replace("'", "")
        return final_preds

jacaranda_hugging_face_model = "Jacaranda/dhis_14000_600k_Test_Model"

obj_Facility_Model = Facility_Model(facility_model_path=jacaranda_hugging_face_model,
                                    max_len=128)

processor = Preprocess(tokenizer_vocab_path=jacaranda_hugging_face_model,
                       tokenizer_max_len=128)

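# Illustrative single-sentence sketch (the example input is hypothetical):
# cleaned = processor.clean_text("nilijifungua katika Kenyatta National Hospital")
# prepared = processor.process_tokenizer(cleaned)
# print(obj_Facility_Model.inference(prepared))
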
def predict_batch_from_csv(input_file, output_file):
    # load batch data from CSV
    batch_data = pd.read_csv(input_file)

    # initialize predictions list
    predictions = []

    # iterate over rows with tqdm for progress tracking
    for _, row in tqdm(batch_data.iterrows(), total=len(batch_data)):
        text = row['facility_name']  # replace 'facility_name' with the actual column name containing the text data
        cleaned_text = processor.clean_text(text)
        prepared_data = processor.process_tokenizer(cleaned_text)
        prediction = obj_Facility_Model.inference(prepared_data)
        predictions.append(prediction)

    # create DataFrame for predictions
    output_data = pd.DataFrame({'prediction': predictions})
    # merge with input DataFrame
    pred_output_df = pd.concat([batch_data, output_data], axis=1)
    # save predictions to CSV
    pred_output_df.to_csv(output_file, index=False)
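
# Illustrative invocation (file names are placeholders, not part of the
# original commit); the input CSV must contain a 'facility_name' column:
# predict_batch_from_csv("facility_mentions.csv", "facility_predictions.csv")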