Spaces (status: Runtime error)

Furkan Akkurt committed • Commit f4d6740 • 1 Parent(s): bb21314

update with app

Browse files:
- app.py +7 -3
- bap_preprocessing.py +109 -0
- chardict.pickle +3 -0
- classifier_bidirectional_emb100_hid256_epoch20.pt +3 -0
- requirements.txt +3 -0
app.py
CHANGED
@@ -1,7 +1,11 @@
 import gradio as gr
+import bap_preprocessing
 
-def …
-    …
+def tokenize(data):
+    json_data = json.loads(data)
+    response = bap_preprocessing.tokenize(json_data['text'])
+    result = { "tokens": response }
+    return result
 
-iface = gr.Interface(fn=…
+iface = gr.Interface(fn=tokenize, inputs="text", outputs="text")
 iface.launch()
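Note: the new tokenize handler calls json.loads, but app.py never imports json, so every request raises a NameError at runtime (consistent with the Space's "Runtime error" status). A minimal sketch of the fix, assuming the textbox receives a JSON payload such as {"text": "Merhaba."}; returning json.dumps(...) instead of the raw dict is a small deviation from the committed code, which relies on Gradio's str() conversion:

import json

import gradio as gr

import bap_preprocessing

def tokenize(data):
    # parse the incoming JSON payload; 'text' holds the raw sentence
    json_data = json.loads(data)
    response = bap_preprocessing.tokenize(json_data['text'])
    # serialize so the "text" output component gets a plain string
    return json.dumps({"tokens": response})

iface = gr.Interface(fn=tokenize, inputs="text", outputs="text")
iface.launch()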
bap_preprocessing.py
ADDED
@@ -0,0 +1,109 @@
+# -*- coding: utf-8 -*-
+
+
+import pickle
+
+char_to_ix = pickle.load(open("chardict.pickle", "rb"))
+# print(char_to_ix)
+
+tag_to_ix = {'N':0, 'B':1, 'I':2}
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import torch.autograd as autograd
+torch.manual_seed(1)
+
+class LSTMTagger(nn.Module):
+
+    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
+        super(LSTMTagger, self).__init__()
+        self.hidden_dim = hidden_dim
+        #self.batch_size = batch_size
+
+        self.char_embeddings = nn.Embedding(vocab_size, embedding_dim)
+
+        # The LSTM takes word embeddings as inputs, and outputs hidden states
+        # with dimensionality hidden_dim.
+        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True) # <- change here
+
+        # The linear layer that maps from hidden state space to tag space
+        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)
+
+    def forward(self, sentence):
+        embeds = self.char_embeddings(sentence)
+        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
+        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
+        tag_scores = F.log_softmax(tag_space, dim=1)
+        return tag_scores
+
+#Train the model
+
+model = LSTMTagger(100, 256, len(char_to_ix), len(tag_to_ix))
+loss_function = nn.NLLLoss()
+optimizer = optim.SGD(model.parameters(), lr = 0.1)
+
+model_save_name = 'classifier_bidirectional_emb100_hid256_epoch20.pt'
+
+model.load_state_dict(torch.load(model_save_name))
+
+def prepare_sequence(seq, to_ix):
+    idxs = [to_ix[ch] for ch in seq]
+    return torch.tensor(idxs, dtype = torch.long)
+
+def prob_to_tag(out):
+    _sentence_tag_list = []
+    #for sentence in tag_scores
+    _prob_to_tag = []
+    for ch in out:
+        chlist = list(ch)
+        #print(chlist)
+        maxi = max(chlist)
+        ind = chlist.index(maxi)
+        _prob_to_tag.append((list(tag_to_ix.keys())[ind]))
+    _sentence_tag_list.append(_prob_to_tag)
+    return _sentence_tag_list
+
+def _char_to_token(samplesent, _sentence_tag_list):
+    token_list = []
+    token = []
+    for j in range(len(_sentence_tag_list[0])): #for each character of a sentence
+        ch = _sentence_tag_list[0][j]
+        ach = samplesent[j]
+
+        if ch == 'I':
+            token.append(ach)
+            if j == len(_sentence_tag_list[0]) -1:
+                token_list.append(token)
+
+        else:
+            if ch =='B':
+                if j == 0:
+                    token.append(ach)
+                else:
+                    token_list.append(token)
+                    token=[]
+                    token.append(ach)
+                if j == len(_sentence_tag_list[0]) -1:
+                    token_list.append(token)
+            elif ch == 'N':
+                continue
+
+    return token_list
+
+def char_unifier(_token_list):
+    for item in range(len(_token_list)):
+        _token_list[item]= ''.join(_token_list[item])
+    return _token_list
+
+def tokenize(sentence):
+    input = prepare_sequence(sentence, char_to_ix)
+    out= model(input)
+    sentence_tag_list = prob_to_tag(out)
+    token_char_list = _char_to_token(sentence, sentence_tag_list)
+    token_list = char_unifier(token_char_list)
+    # print(token_list)
+    return token_list
+
+print(tokenize("Merhaba, ben okula gidiyorum."))
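For readers of the new module: the model tags each character of the input with one of the three labels in tag_to_ix — 'B' (begins a token), 'I' (inside a token), 'N' (not part of any token, e.g. whitespace) — and _char_to_token folds the tagged characters back into tokens. A hand-written illustration of that decoding step, independent of the trained model (the tags below are made up, not real model output):

# B/I/N tags written by hand for "ab cd." -- decoding convention only
sentence = "ab cd."
tags = [['B', 'I', 'N', 'B', 'I', 'B']]  # '.' begins its own token

token_chars = _char_to_token(sentence, tags)   # [['a', 'b'], ['c', 'd'], ['.']]
tokens = char_unifier(token_chars)             # ['ab', 'cd', '.']
print(tokens)

Two further observations: since the module only runs inference, calling model.eval() and wrapping the forward pass in torch.no_grad() would be the usual refinement, and prepare_sequence will raise a KeyError on any character absent from chardict.pickle.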
chardict.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a0192035fff749374ccb5c6e9df77ca8a2f2505cc3f3c428ae82779d0a97983
+size 662
classifier_bidirectional_emb100_hid256_epoch20.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1af1f1a0ea5ae6bc3c21dc7a1cadf92370c89c5af2f3aee25c42c32631d7374
+size 2983172
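The two binary files above (chardict.pickle and the .pt checkpoint) are tracked with Git LFS: the repository stores only these three-line pointer files (spec version, SHA-256 object id, payload size), while the actual blobs live in LFS storage.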
requirements.txt
ADDED
@@ -0,0 +1,3 @@
+numpy==1.21.6
+torch==1.11
+Flask==2.1.0
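Note on the pins: app.py imports gradio, which is not listed here; on a Gradio Space the SDK supplies it, so the omission is harmless. Flask, by contrast, is not imported anywhere in the committed code, so a leaner set would plausibly be (an assumption, not part of the commit):

numpy==1.21.6
torch==1.11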