Commit • c9c7b63
1 Parent(s): 781ba83
deploy app

Files changed:
- pages/page_1.py +3 -0
- streamlit_test.py +140 -0
- wolof-translate/setup.py +5 -0
- wolof-translate/wolof_translate.egg-info/PKG-INFO +9 -0
- wolof-translate/wolof_translate.egg-info/SOURCES.txt +59 -0
- wolof-translate/wolof_translate.egg-info/dependency_links.txt +1 -0
- wolof-translate/wolof_translate.egg-info/top_level.txt +1 -0
- wolof-translate/wolof_translate/__init__.py +0 -0
- wolof-translate/wolof_translate/__pycache__/__init__.cpython-310.pyc +0 -0
- wolof-translate/wolof_translate/__pycache__/dataset_v1.cpython-310.pyc +0 -0
- wolof-translate/wolof_translate/__pycache__/sent_transformers.cpython-310.pyc +0 -0
- wolof-translate/wolof_translate/checkpoints/t5_small_custom_train_results_fw_v3/best_checkpoints.json +13 -0
- wolof-translate/wolof_translate/checkpoints/t5_small_custom_train_results_fw_v3/best_checkpoints.pth +3 -0
pages/page_1.py
ADDED
@@ -0,0 +1,3 @@
+import streamlit as st
+
+st.markdown("Page 1")
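Note: because this file sits in a pages/ directory, Streamlit should pick it up automatically as an extra page of a multipage app, with streamlit_test.py presumably serving as the entry-point script.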
streamlit_test.py
ADDED
@@ -0,0 +1,140 @@
+from transformers import T5ForConditionalGeneration, T5TokenizerFast
+from torch.utils.data import DataLoader
+import streamlit as st
+import torch
+import os
+
+
+# Define the main page
+st.markdown("Translation page 🔠")
+
+# Dropdown for the translation type
+translation_type = st.sidebar.selectbox("Translation Type", options=["French ➡️ Wolof", "Wolof ➡️ French"])
+
+# Define a dictionary of model versions
+models = {
+    "Version ✌️": {
+        "French ➡️ Wolof": {
+            "checkpoints": "wolof-translate/wolof_translate/checkpoints/t5_small_custom_train_results_fw_v4",
+            "tokenizer": "wolof-translate/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v4.json",
+            "max_len": None
+        }
+    },
+    "Version ☝️": {
+        "French ➡️ Wolof": {
+            "checkpoints": "wolof-translate/wolof_translate/checkpoints/t5_small_custom_train_results_fw_v3",
+            "tokenizer": "wolof-translate/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v3.json",
+            "max_len": 51
+        }
+    }
+}
+
+# Dropdown for the model version
+version = st.sidebar.selectbox("Model version", options=["Version ☝️", "Version ✌️"])
+
+# Retrieve the number of sentences to provide
+number = st.sidebar.number_input("Give the number of sentences that you want to provide", min_value=1,
+                                 max_value=100)
+
+# Retrieve the sampling temperature
+temperature = st.sidebar.slider("How random do you want the translated sentences to be (from 0% to 100%)?", min_value=0,
+                                max_value=100)
+
+
+# Run the translation process
+try:
+
+    # Load the checkpoints
+    checkpoints = torch.load(os.path.join(models[version][translation_type]['checkpoints'], "best_checkpoints.pth"))
+
+    # Retrieve the tokenizer file
+    tokenizer_file = models[version][translation_type]['tokenizer']
+
+    # Retrieve the max length
+    max_len = models[version][translation_type]['max_len']
+
+    # Build the best model
+    @st.cache_resource
+    def get_model():
+
+        # Initialize the tokenizer
+        tokenizer = T5TokenizerFast(tokenizer_file=tokenizer_file)
+
+        # Initialize the model
+        model_name = 't5-small'
+
+        model = T5ForConditionalGeneration.from_pretrained(model_name)
+
+        # Resize the token embeddings to match the custom tokenizer
+        model.resize_token_embeddings(len(tokenizer))
+
+        # Load the fine-tuned weights
+        model.load_state_dict(checkpoints['model_state_dict'])
+
+        return model, tokenizer
+
+    model, tokenizer = get_model()
+
+    # Set the model to eval mode
+    _ = model.eval()
+
+    # Add a title
+    st.header("Translate French sentences into Wolof 👌")
+
+    # Create two columns
+    left, right = st.columns(2)
+
+    # Collect the sentences
+    left.subheader('Give me some sentences in French: ')
+
+    for i in range(number):
+
+        left.text_input(f"- Sentence number {i + 1}", key=f"sentence{i}")
+
+    # Run model inference on the provided sentences
+    original_translations, predicted_translations, original_texts, scores = [], [], [], {}
+
+    # Print the translations recovered from the session state
+    right.subheader("Translation to Wolof:")
+
+    for i in range(number):
+
+        sentence = st.session_state[f"sentence{i}"]
+
+        if not sentence == "":
+
+            # Append the end-of-sequence token
+            sentence += tokenizer.eos_token
+
+            # Encode the sentence
+            encoding = tokenizer([sentence], return_tensors='pt', max_length=max_len, padding='max_length', truncation=True)
+
+            # Retrieve the input ids
+            input_ids = encoding.input_ids
+
+            # Retrieve the attention mask
+            mask = encoding.attention_mask
+
+            # Retrieve the pad token id
+            pad_token_id = tokenizer.pad_token_id
+
+            # Perform the prediction
+            predictions = model.generate(input_ids, do_sample=False, top_k=50, max_length=max_len, top_p=0.90,
+                                         temperature=temperature/100, num_return_sequences=1,
+                                         attention_mask=mask, pad_token_id=pad_token_id)
+
+            # Decode the predictions
+            predicted_sentence = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+
+            # Display the prediction
+            right.write(f"{i+1}. {predicted_sentence[0]}")
+
+        else:
+
+            # Display an empty entry for an empty input
+            right.write(f"{i+1}. ")
+
+except Exception as e:
+
+    st.warning("The chosen model is not available yet!", icon="⚠️")
+
+    # st.write(e)
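Note: the same inference path can be exercised outside Streamlit. The snippet below is an illustrative sketch, not part of this commit; it assumes the Version ☝️ checkpoint and tokenizer paths from the models dictionary above and uses a made-up input sentence.

    import torch
    from transformers import T5ForConditionalGeneration, T5TokenizerFast

    CKPT_DIR = "wolof-translate/wolof_translate/checkpoints/t5_small_custom_train_results_fw_v3"
    TOKENIZER_FILE = "wolof-translate/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v3.json"

    # Rebuild the tokenizer and the resized t5-small model, then load the fine-tuned weights
    tokenizer = T5TokenizerFast(tokenizer_file=TOKENIZER_FILE)
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    model.resize_token_embeddings(len(tokenizer))
    checkpoints = torch.load(f"{CKPT_DIR}/best_checkpoints.pth", map_location="cpu")
    model.load_state_dict(checkpoints["model_state_dict"])
    model.eval()

    # Translate one French sentence (greedy decoding, max length 51 as in Version ☝️)
    sentence = "Bonjour, comment allez-vous ?" + tokenizer.eos_token
    encoding = tokenizer([sentence], return_tensors="pt", max_length=51,
                         padding="max_length", truncation=True)
    with torch.no_grad():
        output = model.generate(encoding.input_ids, attention_mask=encoding.attention_mask,
                                max_length=51, pad_token_id=tokenizer.pad_token_id)
    print(tokenizer.batch_decode(output, skip_special_tokens=True)[0])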
wolof-translate/setup.py
ADDED
@@ -0,0 +1,5 @@
+from setuptools import setup
+
+setup(name="wolof_translate", version="0.0.1", author="Oumar Kane", author_email="oumar.kane@univ-thies.sn",
+      description="Contains functions and classes to process corpora for translation between Wolof text and other languages.",
+      requires=['spacy', 'nltk', 'gensim'])
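Note: in the call above, requires= only records the Requires: metadata shown in PKG-INFO below; it does not make pip install spacy, nltk, or gensim. A minimal sketch of the more conventional install_requires form (illustrative only, not part of this commit):

    from setuptools import setup, find_packages

    # install_requires, unlike requires, makes pip install the dependencies;
    # find_packages() picks up the wolof_translate package, which the committed setup() does not declare
    setup(name="wolof_translate", version="0.0.1",
          packages=find_packages(),
          install_requires=["spacy", "nltk", "gensim"])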
wolof-translate/wolof_translate.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,9 @@
+Metadata-Version: 2.1
+Name: wolof-translate
+Version: 0.0.1
+Summary: Contains functions and classes to process corpora for translation between Wolof text and other languages.
+Author: Oumar Kane
+Author-email: oumar.kane@univ-thies.sn
+Requires: spacy
+Requires: nltk
+Requires: gensim
wolof-translate/wolof_translate.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,59 @@
+setup.py
+wolof_translate/__init__.py
+wolof_translate.egg-info/PKG-INFO
+wolof_translate.egg-info/SOURCES.txt
+wolof_translate.egg-info/dependency_links.txt
+wolof_translate.egg-info/top_level.txt
+wolof_translate/__pycache__/__init__.cpython-310.pyc
+wolof_translate/__pycache__/dataset_v1.cpython-310.pyc
+wolof_translate/__pycache__/sent_transformers.cpython-310.pyc
+wolof_translate/data/__init__.py
+wolof_translate/data/dataset_v1.py
+wolof_translate/data/dataset_v2.py
+wolof_translate/data/dataset_v3.py
+wolof_translate/data/__pycache__/__init__.cpython-310.pyc
+wolof_translate/data/__pycache__/dataset_v1.cpython-310.pyc
+wolof_translate/data/__pycache__/dataset_v2.cpython-310.pyc
+wolof_translate/models/__init__.py
+wolof_translate/models/__pycache__/__init__.cpython-310.pyc
+wolof_translate/models/transformers/__init__.py
+wolof_translate/models/transformers/main.py
+wolof_translate/models/transformers/optimization.py
+wolof_translate/models/transformers/position.py
+wolof_translate/models/transformers/size.py
+wolof_translate/models/transformers/__pycache__/__init__.cpython-310.pyc
+wolof_translate/models/transformers/__pycache__/main.cpython-310.pyc
+wolof_translate/models/transformers/__pycache__/optimization.cpython-310.pyc
+wolof_translate/models/transformers/__pycache__/position.cpython-310.pyc
+wolof_translate/models/transformers/__pycache__/size.cpython-310.pyc
+wolof_translate/pipe/__init__.py
+wolof_translate/pipe/nlp_pipeline.py
+wolof_translate/tokenizers/__init__.py
+wolof_translate/tokenizers/adverse_tokenizer.json
+wolof_translate/tokenizers/tokenizer_v1.json
+wolof_translate/tokenizers/__pycache__/__init__.cpython-310.pyc
+wolof_translate/tokenizers/t5_tokenizers/fr_tokenizer_v1.json
+wolof_translate/tokenizers/t5_tokenizers/tokenizer_v1.json
+wolof_translate/tokenizers/t5_tokenizers/tokenizer_v2.json
+wolof_translate/tokenizers/t5_tokenizers/tokenizer_v3.json
+wolof_translate/tokenizers/t5_tokenizers/wf_tokenizer_v1.json
+wolof_translate/trainers/__init__.py
+wolof_translate/trainers/transformer_trainer.py
+wolof_translate/trainers/__pycache__/__init__.cpython-310.pyc
+wolof_translate/trainers/__pycache__/transformer_trainer.cpython-310.pyc
+wolof_translate/utils/__init__.py
+wolof_translate/utils/evaluation.py
+wolof_translate/utils/extract_poems.py
+wolof_translate/utils/extract_sentences.py
+wolof_translate/utils/sent_corrections.py
+wolof_translate/utils/sent_transformers.py
+wolof_translate/utils/sent_unification.py
+wolof_translate/utils/split_with_valid.py
+wolof_translate/utils/tokenize_text.py
+wolof_translate/utils/__pycache__/__init__.cpython-310.pyc
+wolof_translate/utils/__pycache__/evaluation.cpython-310.pyc
+wolof_translate/utils/__pycache__/sent_corrections.cpython-310.pyc
+wolof_translate/utils/__pycache__/sent_transformers.cpython-310.pyc
+wolof_translate/utils/__pycache__/sent_unification.cpython-310.pyc
+wolof_translate/utils/__pycache__/split_with_valid.cpython-310.pyc
+wolof_translate/utils/__pycache__/tokenize_text.cpython-310.pyc
wolof-translate/wolof_translate.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
+
wolof-translate/wolof_translate.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+wolof_translate
wolof-translate/wolof_translate/__init__.py
ADDED
File without changes
wolof-translate/wolof_translate/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (244 Bytes)
wolof-translate/wolof_translate/__pycache__/dataset_v1.cpython-310.pyc
ADDED
Binary file (2.82 kB)
wolof-translate/wolof_translate/__pycache__/sent_transformers.cpython-310.pyc
ADDED
Binary file (949 Bytes)
wolof-translate/wolof_translate/checkpoints/t5_small_custom_train_results_fw_v3/best_checkpoints.json
ADDED
@@ -0,0 +1,13 @@
+{
+    "metrics": {
+        "train_loss": 0.004466977413735216,
+        "test_loss": 0.5528496630489826,
+        "bleu": 24.9553,
+        "gen_len": 7.774,
+        "current_epoch": 759
+    },
+    "best_performance": {
+        "best_score": 24.9553,
+        "best_epoch": 759
+    }
+}
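Note: the app itself only loads best_checkpoints.pth; this JSON is a training summary (best BLEU of 24.9553 reached at epoch 759). An illustrative way to read it, not part of the commit:

    import json

    path = ("wolof-translate/wolof_translate/checkpoints/"
            "t5_small_custom_train_results_fw_v3/best_checkpoints.json")
    with open(path) as f:
        summary = json.load(f)
    print(summary["best_performance"]["best_score"])  # 24.9553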
wolof-translate/wolof_translate/checkpoints/t5_small_custom_train_results_fw_v3/best_checkpoints.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a5d9617eba185df15b75da1871c2f2b5d2ab32eb089c21438228e8bcfac1595
+size 540763111
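Note: what is committed here is a Git LFS pointer; the actual ~540 MB state dict is only present on disk once the LFS object has been pulled. A quick illustrative check (not part of the commit) that the real weights, rather than the pointer text, were fetched:

    import os

    path = ("wolof-translate/wolof_translate/checkpoints/"
            "t5_small_custom_train_results_fw_v3/best_checkpoints.pth")
    # The pointer file is only a few hundred bytes; the LFS object recorded above is 540763111 bytes.
    print(os.path.getsize(path) == 540763111)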