Furkan Akkurt committed on
Commit
f4d6740
1 Parent(s): bb21314

update with app

app.py CHANGED
@@ -1,7 +1,13 @@
 import gradio as gr
+import json
+import bap_preprocessing
 
-def greet(name):
-    return "Hello " + name + "!!"
+def tokenize(data):
+    # The input is a JSON string carrying the sentence under "text".
+    json_data = json.loads(data)
+    response = bap_preprocessing.tokenize(json_data['text'])
+    result = {"tokens": response}
+    return result
 
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+iface = gr.Interface(fn=tokenize, inputs="text", outputs="text")
 iface.launch()
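For context (not part of the commit): the new `tokenize` wrapper calls json.loads() on its single text input, so a caller has to send a JSON string with a "text" field. A rough client sketch, assuming the Space exposes Gradio's standard /api/predict endpoint; the URL here is hypothetical:

import json
import requests

SPACE_URL = "https://example-space.hf.space"  # hypothetical URL

# The fn parses its one text input as JSON, so the sentence travels as a
# JSON string nested inside Gradio's own {"data": [...]} envelope.
inner = json.dumps({"text": "Merhaba, ben okula gidiyorum."})
resp = requests.post(f"{SPACE_URL}/api/predict", json={"data": [inner]})
print(resp.json()["data"][0])  # the stringified {"tokens": [...]} result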
bap_preprocessing.py ADDED
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+
+import pickle
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+torch.manual_seed(1)
+
+# Character-to-index mapping built during training.
+char_to_ix = pickle.load(open("chardict.pickle", "rb"))
+
+# B = beginning of a token, I = inside a token, N = not part of any token.
+tag_to_ix = {'N': 0, 'B': 1, 'I': 2}
+
+class LSTMTagger(nn.Module):
+
+    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
+        super(LSTMTagger, self).__init__()
+        self.hidden_dim = hidden_dim
+
+        self.char_embeddings = nn.Embedding(vocab_size, embedding_dim)
+
+        # The LSTM takes character embeddings as inputs and outputs hidden
+        # states with dimensionality hidden_dim per direction.
+        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
+
+        # The linear layer that maps from hidden state space to tag space;
+        # the input is hidden_dim * 2 because the LSTM is bidirectional.
+        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)
+
+    def forward(self, sentence):
+        embeds = self.char_embeddings(sentence)
+        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
+        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
+        tag_scores = F.log_softmax(tag_space, dim=1)
+        return tag_scores
+
+# Build the model and load the pretrained weights.
+model = LSTMTagger(100, 256, len(char_to_ix), len(tag_to_ix))
+model_save_name = 'classifier_bidirectional_emb100_hid256_epoch20.pt'
+model.load_state_dict(torch.load(model_save_name))
+model.eval()
+
+def prepare_sequence(seq, to_ix):
+    idxs = [to_ix[ch] for ch in seq]
+    return torch.tensor(idxs, dtype=torch.long)
+
+def prob_to_tag(out):
+    # Pick the highest-scoring tag for each character.
+    sentence_tag_list = []
+    char_tags = []
+    for scores in out:
+        ind = int(torch.argmax(scores))
+        char_tags.append(list(tag_to_ix.keys())[ind])
+    sentence_tag_list.append(char_tags)
+    return sentence_tag_list
+
+def _char_to_token(samplesent, sentence_tag_list):
+    # Group characters into tokens according to their B/I/N tags.
+    tags = sentence_tag_list[0]
+    token_list = []
+    token = []
+    for j in range(len(tags)):
+        ch = tags[j]
+        ach = samplesent[j]
+
+        if ch == 'I':
+            token.append(ach)
+            if j == len(tags) - 1:
+                token_list.append(token)
+        elif ch == 'B':
+            if j == 0:
+                token.append(ach)
+            else:
+                if token:
+                    token_list.append(token)
+                token = [ach]
+            if j == len(tags) - 1:
+                token_list.append(token)
+        # 'N' characters (e.g. whitespace) are skipped.
+
+    return token_list
+
+def char_unifier(token_list):
+    # Join each token's characters into a single string.
+    for item in range(len(token_list)):
+        token_list[item] = ''.join(token_list[item])
+    return token_list
+
+def tokenize(sentence):
+    inputs = prepare_sequence(sentence, char_to_ix)
+    with torch.no_grad():
+        out = model(inputs)
+    sentence_tag_list = prob_to_tag(out)
+    token_char_list = _char_to_token(sentence, sentence_tag_list)
+    token_list = char_unifier(token_char_list)
+    return token_list
+
+print(tokenize("Merhaba, ben okula gidiyorum."))
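For orientation (not part of the commit): `_char_to_token` decodes the tagger's per-character labels with a simple scheme, where 'B' opens a new token, 'I' extends the current one, and 'N' marks characters that belong to no token, such as whitespace. A minimal sketch of that grouping, with hand-written tags standing in for model output:

# Minimal sketch of the B/I/N decoding; the tags are made up, not predicted.
sentence = "ben okula"
tags = ['B', 'I', 'I', 'N', 'B', 'I', 'I', 'I', 'I']

tokens, current = [], []
for ch, tag in zip(sentence, tags):
    if tag == 'B':
        if current:
            tokens.append(''.join(current))
        current = [ch]
    elif tag == 'I':
        current.append(ch)
    # 'N' characters are dropped.
if current:
    tokens.append(''.join(current))

print(tokens)  # ['ben', 'okula']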
chardict.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a0192035fff749374ccb5c6e9df77ca8a2f2505cc3f3c428ae82779d0a97983
+size 662
classifier_bidirectional_emb100_hid256_epoch20.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1af1f1a0ea5ae6bc3c21dc7a1cadf92370c89c5af2f3aee25c42c32631d7374
+size 2983172
requirements.txt ADDED
@@ -0,0 +1,3 @@
+numpy==1.21.6
+torch==1.11
+Flask==2.1.0