Jacaranda committed
Commit 7dbe05a
Parent: 719f5b1

uploaded main endpoint file

Files changed (1): facility_predict.py (+172, -0)
facility_predict.py ADDED

import random
import numpy as np
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader


class Preprocess:
    def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
                                                       use_auth_token='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')
        self.max_len = tokenizer_max_len

    def clean_text(self, text):
        """Lowercase the text and drop common English/Swahili filler words."""
        text = text.lower()
        stopwords = ["i", "was", "transferred", "from", "to", "nilienda",
                     "kituo", "cha", "lakini", "saa", "hii", "niko", "at",
                     "nikahudumiwa", "pole", "deliver", "na", "ni", "baada",
                     "ya", "kutumwa", "kutoka", "ndipo", "nikapewa", "hiyo",
                     "linda", "mama", "nikawa", "mgonjwa", "nikatibiwa",
                     "in", "had", "a", "visit", "gynaecologist", "ndio",
                     "karibu", "mimi", "sehemu", "hospitali", "serikali",
                     "delivered", "katika", "kaunti", "kujifungua", "huko",
                     "nilipoenda", "kwa", "bado", "naedelea", "sija",
                     "maliza", "mwisho", "nilianza", "kliniki", "yangu",
                     "nilianzia", "nilijifungua"]
        text_single = ' '.join(word for word in text.split() if word not in stopwords)
        return text_single

    def encode_fn(self, text_single):
        """
        Use the tokenizer to preprocess the text.
        Example of text_single: 'Nairobi Hospital'
        """
        encoding = self.tokenizer(text_single,
                                  padding=True,
                                  truncation=True,
                                  max_length=self.max_len,
                                  return_tensors='pt')
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        return input_ids, attention_mask

    def process_tokenizer(self, text_single):
        """
        Preprocess text and prepare a dataset for a single new sentence.
        """
        input_ids, attention_mask = self.encode_fn(text_single)
        data = TensorDataset(input_ids, attention_mask)
        return data

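# Illustrative usage sketch (not part of the original commit); assumes the
# same Hugging Face repo that is loaded further below:
# processor = Preprocess(tokenizer_vocab_path="Jacaranda/dhis_14000_600k_Test_Model",
#                        tokenizer_max_len=128)
# dataset = processor.process_tokenizer(processor.clean_text("Nilienda Nairobi Hospital"))
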
class Facility_Model:
    def __init__(self, facility_model_path: str, max_len: int):
        self.max_len = max_len
        self.softmax = torch.nn.Softmax(dim=1)
        self.gpu = False  # set to True to run inference on CUDA
        self.model = AutoModelForSequenceClassification.from_pretrained(facility_model_path,
                                                                        use_auth_token='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')
        self.model.eval()  # put the PyTorch model in inference mode

        if self.gpu:
            seed = 42
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            torch.backends.cudnn.deterministic = True
            self.device = torch.device('cuda')
            if torch.cuda.device_count() > 1:
                # wrap in DataParallel only when CUDA devices are in use
                self.model = torch.nn.DataParallel(self.model)
        else:
            self.device = 'cpu'

        self.model = self.model.to(self.device)

    def predict_single(self, model, pred_data):
        """
        Model inference for a new single sentence.
        """
        pred_dataloader = DataLoader(pred_data, batch_size=10, shuffle=False)
        probability_list = None
        for batch in pred_dataloader:
            with torch.no_grad():
                outputs = model(input_ids=batch[0].to(self.device),
                                attention_mask=batch[1].to(self.device))
                # no labels are supplied, so only the logits are meaningful
                probability = self.softmax(outputs.logits)
                probability_list = probability.detach().cpu().numpy()
        return probability_list

    def output_intent_probability(self, pred: np.ndarray) -> dict:
        """
        Convert the model output into a dictionary mapping every intent
        (facility label) to its probability.
        """
        output_dict = {}
        # load the label/intent relation table once and cache it on the instance
        if not hasattr(self, 'label_intent_dict'):
            path_table = pd.read_csv('/content/drive/MyDrive/dhis14000/dhis_label_relation_14357.csv')
            self.label_intent_dict = (path_table[["label", "corresponding_label"]]
                                      .set_index("corresponding_label")
                                      .to_dict()['label'])

        # transform the output into a dictionary (between intent and probability)
        for intent in range(pred.shape[1]):
            output_dict[self.label_intent_dict[intent]] = pred[0][intent]

        return output_dict

    def inference(self, prepared_data):
        """
        Make a prediction on one new sentence and return the top facility name.
        """
        prob_distribution = self.predict_single(self.model, prepared_data)
        prediction_results = self.output_intent_probability(prob_distribution.astype(float))

        # filter out predictions containing "dental", "optical" or "eye" keywords
        filtered_results = {intent: prob for intent, prob in prediction_results.items()
                            if "dental" not in intent.lower()
                            and "optical" not in intent.lower()
                            and "eye" not in intent.lower()}

        # keep only the top result
        sorted_pred_intent_results = sorted(filtered_results.items(), key=lambda x: x[1], reverse=True)
        top_results = dict(sorted_pred_intent_results[:1])
        final_preds = ', '.join(top_results.keys())
        final_preds = final_preds.replace("'", "")
        return final_preds

jacaranda_hugging_face_model = "Jacaranda/dhis_14000_600k_Test_Model"

obj_Facility_Model = Facility_Model(facility_model_path=jacaranda_hugging_face_model,
                                    max_len=128)

processor = Preprocess(tokenizer_vocab_path=jacaranda_hugging_face_model,
                       tokenizer_max_len=128)

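# Illustrative single-sentence sketch (the example input is hypothetical):
# cleaned = processor.clean_text("nilijifungua katika Kenyatta National Hospital")
# prepared = processor.process_tokenizer(cleaned)
# print(obj_Facility_Model.inference(prepared))
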
def predict_batch_from_csv(input_file, output_file):
    # load batch data from CSV
    batch_data = pd.read_csv(input_file)

    # initialize predictions list
    predictions = []

    # iterate over rows with tqdm for progress tracking
    for _, row in tqdm(batch_data.iterrows(), total=len(batch_data)):
        text = row['facility_name']  # replace 'facility_name' with the actual column name containing the text data
        cleaned_text = processor.clean_text(text)
        prepared_data = processor.process_tokenizer(cleaned_text)
        prediction = obj_Facility_Model.inference(prepared_data)
        predictions.append(prediction)

    # create DataFrame for predictions
    output_data = pd.DataFrame({'prediction': predictions})
    # merge with input DataFrame
    pred_output_df = pd.concat([batch_data, output_data], axis=1)
    # save predictions to CSV
    pred_output_df.to_csv(output_file, index=False)
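
# Illustrative invocation (file names are placeholders, not part of the
# original commit); the input CSV must contain a 'facility_name' column:
# predict_batch_from_csv("facility_mentions.csv", "facility_predictions.csv")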