First commit
- .gitignore +1 -0
- README.md +1 -1
- app.py +22 -0
- data.py +45 -0
- model.py +114 -0
- requirements.txt +2 -0
- tj-fa-3dmzfi52.pt +3 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__/
README.md
CHANGED
@@ -9,4 +9,4 @@ app_file: app.py
 pinned: false
 ---
 
-
+Tajiki-Farsi Transliteration
app.py
ADDED
@@ -0,0 +1,22 @@
+import torch
+import streamlit as st
+from model import init_model, predict
+from data import Tokenizer, load_config
+
+config = load_config('tj-fa-3dmzfi52.pt')
+print('Config:', config)
+tokenizer = Tokenizer(config)
+
+# Pick the device first so the model can be loaded straight onto it
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Load the model onto that device (init_model defaults to CPU otherwise)
+model = init_model('tj-fa-3dmzfi52.pt', device)
+
+# Create a text area box where the user can enter their text
+user_input = st.text_area("Enter some text here", value="Халқро тақлидашон барбод дод,\nЭй дусад лаънат бар он тақлид бод")
+
+# Run the model on the user's text and display the transliteration
+model_output = predict(model, tokenizer, user_input, device)
+
+st.text_area('Transliteration:', value=model_output, max_chars=None, key=None)
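
For quick testing outside Streamlit, the same pipeline can be driven headlessly. A minimal sketch, assuming the LFS checkpoint tj-fa-3dmzfi52.pt has been pulled into the working directory:

import torch

from data import Tokenizer, load_config
from model import init_model, predict

device = "cuda" if torch.cuda.is_available() else "cpu"
config = load_config('tj-fa-3dmzfi52.pt')          # config dict stored inside the checkpoint
tokenizer = Tokenizer(config)
model = init_model('tj-fa-3dmzfi52.pt', device)    # rebuilds Seq2SeqCNN and loads the weights
print(predict(model, tokenizer, 'салом', device))  # prints the Perso-Arabic transliteration of the Tajik input

The app itself launches locally with: streamlit run app.py
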
data.py
ADDED
@@ -0,0 +1,45 @@
+import torch
+
+from torch.nn.utils.rnn import pad_sequence
+
+
+def load_config(path):
+    d = torch.load(path, map_location='cpu')
+    return d['config']
+
+
+class Tokenizer:
+    def __init__(self, config) -> None:
+        self.src_vocab = config['src_vocab']
+        self.trg_vocab = config['trg_vocab']
+
+        self.src_char_index = {char: i for i, char in enumerate(self.src_vocab)}
+        self.trg_char_index = {char: i for i, char in enumerate(self.trg_vocab)}
+        self.trg_null_idx = self.trg_char_index['<NULL>']
+        self.src_null_idx = self.src_char_index['<NULL>']
+        self.src_pad_idx = self.src_char_index['<PAD>']
+        self.trg_pad_idx = self.trg_char_index['<PAD>']
+        self.trg_unk_idx = self.trg_char_index['<UNK>']
+        self.src_unk_idx = self.src_char_index['<UNK>']
+
+
+    def encode_src(self, text: str):
+        src = [self.src_char_index.get(src_char, self.src_unk_idx) for src_char in text]
+        src = torch.tensor(src, dtype=torch.long)
+        return src
+
+    def decode_src(self, src: torch.Tensor):
+        return [self.src_vocab[i] for i in src]
+
+    def decode_trg(self, trg: torch.Tensor):
+        # Drop <NULL> predictions before mapping indices back to characters
+        trg = trg.flatten().tolist()
+        trg = [r for r in trg if r != self.trg_null_idx]
+        return [self.trg_vocab[i] for i in trg]
+
+    def collate_fn(self, batch):
+        src = [x for x, _ in batch]
+        trg = [y for _, y in batch]
+        src_padded = pad_sequence(src, batch_first=True, padding_value=self.src_pad_idx)
+        trg_padded = pad_sequence(trg, batch_first=True, padding_value=self.trg_pad_idx)
+        return src_padded, trg_padded
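
The Tokenizer is a plain character-level lookup over the vocabularies stored in the checkpoint; its collate_fn pads a batch of (src, trg) tensor pairs to a rectangle using the respective <PAD> indices, ready for a DataLoader. A small illustration with made-up toy vocabularies (the real ones live in tj-fa-3dmzfi52.pt; only the <NULL>/<PAD>/<UNK> tokens required by __init__ are fixed):

from data import Tokenizer

toy_config = {
    'src_vocab': ['<NULL>', '<PAD>', '<UNK>', 'с', 'а', 'л', 'о', 'м'],
    'trg_vocab': ['<NULL>', '<PAD>', '<UNK>', 'س', 'ل', 'ا', 'م'],
}
tok = Tokenizer(toy_config)
ids = tok.encode_src('салом!')  # '!' is out of vocabulary
print(ids)                      # tensor([3, 4, 5, 6, 7, 2]) -- the trailing 2 is <UNK>
print(tok.decode_src(ids))      # ['с', 'а', 'л', 'о', 'м', '<UNK>']
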
model.py
ADDED
@@ -0,0 +1,114 @@
+import torch
+from torch import nn
+
+from data import Tokenizer
+
+
+class ResidualBlock(nn.Module):
+    def __init__(self, num_channels, dropout=0.5):
+        super(ResidualBlock, self).__init__()
+        self.conv1 = nn.Conv1d(num_channels, num_channels, kernel_size=3, padding=1)
+        self.bn1 = nn.BatchNorm1d(num_channels)
+        self.conv2 = nn.Conv1d(num_channels, num_channels, kernel_size=3, padding=1)
+        self.bn2 = nn.BatchNorm1d(num_channels)
+        self.prelu = nn.PReLU()
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        residual = x
+        x = self.prelu(self.bn1(self.conv1(x)))
+        x = self.dropout(x)
+        x = self.bn2(self.conv2(x))
+        x = self.prelu(x)
+        x = self.dropout(x)
+        x += residual  # note: the original ResNet adds the residual before the final activation, not after it
+        return x
+
+
+class Seq2SeqCNN(nn.Module):
+    # def __init__(self, dict_size_src, dict_size_trg, embedding_dim, num_channels, num_residual_blocks, dropout=0.5):
+    def __init__(self, config):
+        super(Seq2SeqCNN, self).__init__()
+        dict_size_src = config['dict_size_src']
+        dict_size_trg = config['dict_size_trg']
+        embedding_dim = config['embedding_dim']
+        num_channels = config['num_channels']
+        num_residual_blocks = config['num_residual_blocks']
+        dropout = config['dropout']
+        many_to_one = config['many_to_one']
+
+        self.config = config
+
+        self.embedding = nn.Embedding(dict_size_src, embedding_dim)
+        self.conv = nn.Conv1d(embedding_dim, num_channels, kernel_size=3, padding=1)
+        self.bn = nn.BatchNorm1d(num_channels)
+
+        self.residual_blocks = nn.Sequential(
+            *(ResidualBlock(num_channels, dropout) for _ in range(num_residual_blocks))
+            # Add as many blocks as required
+        )
+        self.fc = nn.Linear(num_channels, dict_size_trg * many_to_one)  # many_to_one target chars per source position
+        self.dropout = nn.Dropout(dropout)
+        self.dict_size_trg = dict_size_trg
+
+    def forward(self, src):
+        # src: (batch_size, seq_len)
+        batch_size = src.size(0)
+
+        embedded = self.embedding(src).permute(0, 2, 1)  # (bsize, emb_dim, seq_len)
+        # print('embedded:', embedded.shape)
+        conv_out0 = self.conv(embedded)  # (bsize, num_channels, seq_len)
+        # print('conv_out0:', conv_out0.shape)
+        # conv_out = embedded
+        conv_out = self.dropout(torch.relu(self.bn(conv_out0)))
+        # conv_out = conv_out0
+        res_out = self.residual_blocks(conv_out)
+        # print('res_out:', res_out.shape)
+        res_out = res_out + conv_out  # long skip connection around the whole residual stack
+        # res_out = torch.cat([res_out, embedded], dim=1)
+        out = self.fc(self.dropout(res_out.permute(0, 2, 1)))  # permute back to (bsize, seq_len, num_channels)
+        out = out.view(batch_size, -1, self.config['many_to_one'], self.dict_size_trg)
+        return out
+
+
+def init_model(path, device="cpu"):
+    d = torch.load(path, map_location=device)
+    state_dict = d['state_dict']
+    model = Seq2SeqCNN(d['config']).to(device)
+    model.load_state_dict(state_dict)
+    return model
+
+
+
+@torch.no_grad()
+def _predict(model, src, device):
+
+    model.eval()
+    src = src.to(device)
+    output = model(src)
+    _, pred = torch.max(output, dim=-1)  # greedy decoding: argmax over the target vocabulary
+
+    # output = torch.softmax(output, dim=3)
+    # print(output.shape)
+    # pred = torch.multinomial(output.view(-1, output.size(-1)), 1)
+    # pred = pred.reshape(output.size()[:-1])
+    # print(pred.shape)
+
+    return pred
+
+
+@torch.no_grad()
+def predict(model, tokenizer: Tokenizer, text: str, device):
+    print('text:', text)
+    if not text: return ''
+    text_encoded = tokenizer.encode_src(text)
+
+    batch = text_encoded.unsqueeze(0)  # single-example batch
+
+    prd = _predict(model, batch, device)[0]
+    prd = prd[batch[0] != tokenizer.src_pad_idx, :]  # keep only non-padding positions
+
+    predicted_text = ''.join(tokenizer.decode_trg(prd))
+    print('predicted_text:', repr(predicted_text))
+    return predicted_text  # .replace('\u200c', '')
+
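
Seq2SeqCNN is a one-shot (non-autoregressive) transliterator: each source position emits logits for many_to_one target characters, which is how a single Cyrillic letter can map to zero, one, or several Perso-Arabic letters (unused slots are predicted as <NULL> and stripped by Tokenizer.decode_trg). A quick shape check with made-up hyperparameters (the real values come from the checkpoint's config):

import torch

from model import Seq2SeqCNN

toy_config = {
    'dict_size_src': 40, 'dict_size_trg': 50, 'embedding_dim': 32,
    'num_channels': 64, 'num_residual_blocks': 2, 'dropout': 0.1,
    'many_to_one': 2,
}
model = Seq2SeqCNN(toy_config).eval()  # eval() so BatchNorm uses running statistics
src = torch.randint(0, 40, (1, 7))     # (batch_size, seq_len)
out = model(src)
print(out.shape)                       # torch.Size([1, 7, 2, 50]) = (batch, seq_len, many_to_one, dict_size_trg)
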
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+torch
+altair==4.0
tj-fa-3dmzfi52.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b5a43e9ea2d16238d025a19955e06a9a2d045cacb8dfe42cc79886864448fac
+size 12029775
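
This file is a Git LFS pointer, not the weights themselves. Judging by load_config and init_model above, the ~12 MB object behind it is a torch.save'd dict with at least 'config' and 'state_dict' keys; after git lfs pull it can be inspected with a snippet like:

import torch

d = torch.load('tj-fa-3dmzfi52.pt', map_location='cpu')
print(sorted(d.keys()))  # expect 'config' and 'state_dict' among them
print(sum(p.numel() for p in d['state_dict'].values()), 'parameters')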