meliascosta committed on
Commit
c864490
1 Parent(s): 73e1547

Initial version of the app

Files changed (2)
  1. app.py +137 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,137 @@
+ import os
+ import re
+ import gradio as gr
+ import torch
+ from huggingface_hub import hf_hub_download
+ from torch.utils.data import DataLoader, Dataset
+
+
+
+
+ model_file_path = hf_hub_download("meliascosta/wiki_hiagm", "hiagm_dyn_quant.p",
+                                   use_auth_token=os.environ['TOKEN'])
+ vocab_file_path = hf_hub_download("meliascosta/wiki_hiagm", "vocab/word.dict",
+                                   use_auth_token=os.environ['TOKEN'])
+ label_file_path = hf_hub_download("meliascosta/wiki_hiagm", "vocab/label.dict",
+                                   use_auth_token=os.environ['TOKEN'])
+
+ # Load model
+ # m = torch.load('./temp_dyn.p')
+ model = torch.load(model_file_path, map_location=torch.device('cpu'))
+
+ model.eval()
+
+ MAX_INPUT_LENGTH = 256
+ K_TOP = 3
+ BATCH_SIZE = 512
+ P_THRESHOLD = 0.5
+
+ class CustomImageDataset(Dataset):
+     def __init__(self, samples):
+         self.samples = samples
+
+     def __len__(self):
+         return len(self.samples)
+
+     def __getitem__(self, idx):
+         return self.samples[idx]
+
+
+ # Load vocab mappings
+
+ v2i = {}
+ i2v = {}
+ with open(vocab_file_path) as f:
+     for i, line in enumerate(f):
+         data = line.rstrip().split('\t')
+         assert len(data) == 2
+         v2i[data[0]] = i
+         i2v[i] = data[0]
+
+ v2i_lab = {}
+ i2v_lab = {}
+ with open(label_file_path) as f:
+     for i, line in enumerate(f):
+         data = line.rstrip().split('\t')
+         assert len(data) == 2
+         v2i_lab[data[0]] = i
+         i2v_lab[i] = data[0]
+
+
+ ### PREPROCESSING
+
+ english_stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
+                      "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's",
+                      'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',
+                      'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am',
+                      'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
+                      'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
+                      'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during',
+                      'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
+                      'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
+                      'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor',
+                      'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don',
+                      "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren',
+                      "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
+                      "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't",
+                      'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
+                      'won', "won't", 'wouldn', "wouldn't", '\\.', '\\?', ',', '\\!', "'s", '']
+
+
+ def clean_stopwords(sample):
+     """
+     :param sample: List[Str], lower case
+     :return: List[Str]
+     """
+     return [token for token in sample if token not in english_stopwords]
+
+
+ def clean_str(string):
+     """
+     Original Source: https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
+     :param string: Str
+     :return -> Str
+     """
+     string = string.strip().strip('"')
+     string = re.sub(r"[^A-Za-z(),!?\.\'\`]", " ", string)
+     string = re.sub(r"\'s", " \'s", string)
+     string = re.sub(r"\'ve", " \'ve", string)
+     string = re.sub(r"n\'t", " n\'t", string)
+     string = re.sub(r"\'re", " \'re", string)
+     string = re.sub(r"\'d", " \'d", string)
+     string = re.sub(r"\'ll", " \'ll", string)
+     string = re.sub(r",", " , ", string)
+     string = re.sub(r"\.", " \. ", string)
+     string = re.sub(r"\"", " , ", string)
+     string = re.sub(r"!", " ! ", string)
+     string = re.sub(r"\(", " \( ", string)
+     string = re.sub(r"\)", " \) ", string)
+     string = re.sub(r"\?", " \? ", string)
+     string = re.sub(r"\s{2,}", " ", string)
+     return string.strip().lower()
+
+
+ def preprocess_line(sample):
+     """
+     :param sample: Str, "The sample would be tokenized and filtered according to the stopwords list"
+     :return: token_list -> List[Str]
+     """
+     sample = clean_str(sample.lstrip().rstrip())
+     token_list = clean_stopwords(sample.split(' '))
+     return {'token': token_list, 'label': []}
+
+
+
+ def predict(line):
+     preprocessed_line = preprocess_line(line)
+     sample = {}
+     sample["token"] = [v2i.get(v.lower(), v2i['<OOV>']) for v in preprocessed_line['token']]
+     sample["token_len"] = len(sample['token'])
+     sample["token"] = torch.tensor(sample["token"])
+     sample["token_len"] = torch.tensor(sample["token_len"])
+     output_logits = model(next(iter(DataLoader(CustomImageDataset([sample])))))
+     output_probs = torch.sigmoid(output_logits).cpu().tolist()
+     return {i2v_lab[i]:p for i, p in enumerate(output_probs[0]) if p > P_THRESHOLD}
+
+ iface = gr.Interface(fn=predict, inputs="text", outputs=gr.Label(num_top_classes=5))
+ iface.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ gradio
+ torch
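
Usage note (a sketch, not part of the commit): to try the app from this commit locally, export a Hugging Face token with read access to meliascosta/wiki_hiagm as TOKEN, install the requirements, and run

    TOKEN=<your-hf-token> python app.py

app.py also imports huggingface_hub, which is not pinned in requirements.txt, so it must already be available in the environment (recent gradio releases install it as a dependency). By default the Gradio interface is served locally on port 7860.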