Jacaranda committed on
Commit
5fb1207
1 Parent(s): 27a2265

Upload facility_predict.py

Browse files
Files changed (1) hide show
  1. facility_predict.py +24 -22
facility_predict.py CHANGED
@@ -13,36 +13,36 @@ from torch.utils.data import TensorDataset, DataLoader
13
 
14
  class Preprocess:
15
  def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
17
  use_auth_token='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')
18
  self.max_len = tokenizer_max_len
19
 
20
  def clean_text(self, text):
21
  text = text.lower()
22
- stopwords = ["i", "was", "transferred",
23
- "from", "to", "nilienda", "kituo",
24
- "cha", "lakini", "saa", "hii", "niko",
25
- "at", "nilienda", "nikahudumiwa", "pole",
26
- "deliver", "na", "ni", "baada", "ya",
27
- "kutumwa", "kutoka", "nilienda",
28
- "ndipo", "nikapewa", "hiyo", "lindam ama", "nikawa",
29
- "mgonjwa", "nikatibiwa", "in", "had", "a",
30
- "visit", "gynaecologist", "ndio",
31
- "karibu", "mimi", "niko", "sehemu", "hospitali",
32
- "serikali", "delivered", "katika", "kaunti", "kujifungua",
33
- "katika", "huko", "nilipoenda", "kwa", "bado", "naedelea",
34
- "sija", "maliza", "mwisho",
35
- "nilianza", "kliniki", "yangu",
36
- "nilianzia", "nilijifungua"]
37
- text_single = ' '.join(word for word in text.split() if word not in stopwords)
38
- return text_single
39
-
40
- def encode_fn(self, text_single):
41
  """
42
  Using tokenizer to preprocess the text
43
  example of text_single:'Nairobi Hospital'
44
  """
45
- tokenizer = self.tokenizer(text_single,
46
  padding=True,
47
  truncation=True,
48
  max_length=self.max_len,
@@ -52,15 +52,17 @@ class Preprocess:
52
  attention_mask = tokenizer['attention_mask']
53
  return input_ids, attention_mask
54
 
55
- def process_tokenizer(self, text_single):
56
  """
57
  Preprocess text and prepare dataloader for a single new sentence
58
  """
59
- input_ids, attention_mask = self.encode_fn(text_single)
 
60
  data = TensorDataset(input_ids, attention_mask)
61
  return data
62
 
63
 
 
64
  class Facility_Model:
65
  def __init__(self, facility_model_path: any,
66
  max_len: int):
 
13
 
14
  class Preprocess:
15
  def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
16
+ self.stopwords = ["i", "was", "transferred",
17
+ "from", "to", "nilienda", "kituo",
18
+ "cha", "lakini", "saa", "hii", "niko",
19
+ "at", "nilienda", "nikahudumiwa", "pole",
20
+ "deliver", "na", "ni", "baada", "ya",
21
+ "kutumwa", "kutoka", "nilienda",
22
+ "ndipo", "nikapewa", "hiyo", "lindam ama", "nikawa",
23
+ "mgonjwa", "nikatibiwa", "in", "had", "a",
24
+ "visit", "gynaecologist", "ndio",
25
+ "karibu", "mimi", "niko", "sehemu", "hospitali",
26
+ "serikali", "delivered", "katika", "kaunti", "kujifungua",
27
+ "katika", "huko", "nilipoenda", "kwa", "bado", "naedelea",
28
+ "sija", "maliza", "mwisho",
29
+ "nilianza", "kliniki", "yangu",
30
+ "nilianzia", "nilijifungua"]
31
  self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
32
  use_auth_token='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')
33
  self.max_len = tokenizer_max_len
34
 
35
  def clean_text(self, text):
36
  text = text.lower()
37
+ self.text_single = ' '.join(word for word in text.split() if word not in self.stopwords)
38
+ return self.text_single
39
+
40
+ def encode_fn(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  """
42
  Using tokenizer to preprocess the text
43
  example of text_single:'Nairobi Hospital'
44
  """
45
+ tokenizer = self.tokenizer(self.text_single,
46
  padding=True,
47
  truncation=True,
48
  max_length=self.max_len,
 
52
  attention_mask = tokenizer['attention_mask']
53
  return input_ids, attention_mask
54
 
55
+ def process_tokenizer(self, data):
56
  """
57
  Preprocess text and prepare dataloader for a single new sentence
58
  """
59
+ self.clean_text(data)
60
+ input_ids, attention_mask = self.encode_fn()
61
  data = TensorDataset(input_ids, attention_mask)
62
  return data
63
 
64
 
65
+
66
  class Facility_Model:
67
  def __init__(self, facility_model_path: any,
68
  max_len: int):