Spaces:

Jacaranda
/

Facility_Batch_Predict

Runtime error

App Files Community

Jacaranda committed on Jun 7, 2023

Commit

5fb1207

1 Parent(s): 27a2265

Upload facility_predict.py

Browse files

Files changed (1) hide show

facility_predict.py +24 -22

facility_predict.py CHANGED Viewed

@@ -13,36 +13,36 @@ from torch.utils.data import TensorDataset, DataLoader
 class Preprocess:
     def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
                                                        use_auth_token='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')
         self.max_len = tokenizer_max_len
     def clean_text(self, text):
         text = text.lower()
-        stopwords = ["i", "was", "transferred",
-                     "from", "to", "nilienda", "kituo",
-                     "cha", "lakini", "saa", "hii", "niko",
-                     "at", "nilienda", "nikahudumiwa", "pole",
-                     "deliver", "na", "ni", "baada", "ya",
-                     "kutumwa", "kutoka", "nilienda",
-                     "ndipo", "nikapewa", "hiyo", "lindam ama", "nikawa",
-                     "mgonjwa", "nikatibiwa", "in", "had", "a",
-                     "visit", "gynaecologist", "ndio",
-                     "karibu", "mimi", "niko", "sehemu", "hospitali",
-                     "serikali", "delivered", "katika", "kaunti", "kujifungua",
-                     "katika", "huko", "nilipoenda", "kwa", "bado", "naedelea",
-                     "sija", "maliza", "mwisho",
-                     "nilianza", "kliniki", "yangu",
-                     "nilianzia", "nilijifungua"]
-        text_single = ' '.join(word for word in text.split() if word not in stopwords)
-        return text_single
-    def encode_fn(self, text_single):
         """
         Using tokenizer to preprocess the text
         example of text_single:'Nairobi Hospital'
         """
-        tokenizer = self.tokenizer(text_single,
                                    padding=True,
                                    truncation=True,
                                    max_length=self.max_len,
@@ -52,15 +52,17 @@ class Preprocess:
         attention_mask = tokenizer['attention_mask']
         return input_ids, attention_mask
-    def process_tokenizer(self, text_single):
         """
         Preprocess text and prepare dataloader for a single new sentence
         """
-        input_ids, attention_mask = self.encode_fn(text_single)
         data = TensorDataset(input_ids, attention_mask)
         return data
 class Facility_Model:
     def __init__(self, facility_model_path: any,
                  max_len: int):

 class Preprocess:
     def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
+        self.stopwords = ["i", "was", "transferred",
+                          "from", "to", "nilienda", "kituo",
+                          "cha", "lakini", "saa", "hii", "niko",
+                          "at", "nilienda", "nikahudumiwa", "pole",
+                          "deliver", "na", "ni", "baada", "ya",
+                          "kutumwa", "kutoka", "nilienda",
+                          "ndipo", "nikapewa", "hiyo", "lindam ama", "nikawa",
+                          "mgonjwa", "nikatibiwa", "in", "had", "a",
+                          "visit", "gynaecologist", "ndio",
+                          "karibu", "mimi", "niko", "sehemu", "hospitali",
+                          "serikali", "delivered", "katika", "kaunti", "kujifungua",
+                          "katika", "huko", "nilipoenda", "kwa", "bado", "naedelea",
+                          "sija", "maliza", "mwisho",
+                          "nilianza", "kliniki", "yangu",
+                          "nilianzia", "nilijifungua"]
         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
                                                        use_auth_token='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')
         self.max_len = tokenizer_max_len
     def clean_text(self, text):
         text = text.lower()
+        self.text_single = ' '.join(word for word in text.split() if word not in self.stopwords)
+        return self.text_single
+    def encode_fn(self):
         """
         Using tokenizer to preprocess the text
         example of text_single:'Nairobi Hospital'
         """
+        tokenizer = self.tokenizer(self.text_single,
                                    padding=True,
                                    truncation=True,
                                    max_length=self.max_len,
         attention_mask = tokenizer['attention_mask']
         return input_ids, attention_mask
+    def process_tokenizer(self, data):
         """
         Preprocess text and prepare dataloader for a single new sentence
         """
+        self.clean_text(data)
+        input_ids, attention_mask = self.encode_fn()
         data = TensorDataset(input_ids, attention_mask)
         return data
 class Facility_Model:
     def __init__(self, facility_model_path: any,
                  max_len: int):