Update README.md
Browse files
README.md
CHANGED
@@ -2,25 +2,169 @@
|
|
2 |
library_name: keras
|
3 |
tags:
|
4 |
- translation
|
|
|
5 |
---
|
|
|
6 |
|
7 |
-
|
8 |
|
9 |
-
|
10 |
|
11 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
-
|
14 |
|
15 |
-
|
|
|
16 |
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
-
|
|
|
|
|
|
|
20 |
|
21 |
-
|
22 |
-
|
|
|
23 |
|
24 |
-
|
|
|
25 |
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
library_name: keras
|
3 |
tags:
|
4 |
- translation
|
5 |
+
license: apache-2.0
|
6 |
---
|
7 |
+
# Model Description
|
8 |
|
9 |
+
The GRU-eng-por model is used to translate English to Portuguese automatically.
|
10 |
|
11 |
+
The model was trained on a translation dataset.
|
12 |
|
13 |
+
## Details
|
14 |
+
- Size: 42,554,912 parameters
|
15 |
+
- Dataset: [`English-to-Portuguese`](https://www.kaggle.com/datasets/nageshsingh/englishportuguese-translation)
|
16 |
+
- Languages: English, Portuguese
|
17 |
+
- Number of Training Steps: 15
|
18 |
+
- Batch size: 32
|
19 |
+
- Optimizer: rmsprop
|
20 |
+
- Learning Rate: 0.001
|
21 |
+
- GPU: T4
|
22 |
+
- The [source code](https://github.com/Nkluge-correa/teeny-tiny_castle/blob/master/ML%20Intro%20Course/16_sequence_to_sequence.ipynb) used to train this model is available on GitHub.
|
23 |
|
24 |
+
## Usage
|
25 |
|
26 |
+
```
|
27 |
+
!pip install huggingface_hub["tensorflow"] -q
|
28 |
|
29 |
+
from huggingface_hub import from_pretrained_keras
|
30 |
+
from huggingface_hub import hf_hub_download
|
31 |
+
import tensorflow as tf
|
32 |
+
import numpy as np
|
33 |
+
import string
|
34 |
+
import re
|
35 |
|
36 |
+
# Characters removed during standardization: every ASCII punctuation mark
# except "[" and "]", which are kept so tokens such as "[start]" and "[end]"
# are not altered.
strip_chars = "".join(ch for ch in string.punctuation if ch not in "[]")
|
40 |
|
41 |
+
def custom_standardization(input_string):
    """Lowercase the input and delete every character listed in `strip_chars`.

    Args:
        input_string: the text to standardize; passed directly to
            `tf.strings.lower`.

    Returns:
        The lowercased text with all `strip_chars` characters removed.
    """
    # Escape the characters so each one is matched literally inside the
    # regex character class.
    punctuation_class = f"[{re.escape(strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, punctuation_class, "")
|
44 |
|
45 |
+
# The model and both vocabulary files are fetched from the same Hub
# repository, so the repository id is defined once.
HUB_REPO_ID = "AiresPucrs/GRU-eng-por"

# Load the `seq2seq_rnn` from the Hub
seq2seq_rnn = from_pretrained_keras(HUB_REPO_ID)

# Download the Portuguese vocabulary file into the current directory
portuguese_vocabulary_path = hf_hub_download(
    repo_id=HUB_REPO_ID,
    filename="portuguese_vocabulary.txt",
    repo_type="model",
    local_dir="./",
)

# Download the English vocabulary file into the current directory
english_vocabulary_path = hf_hub_download(
    repo_id=HUB_REPO_ID,
    filename="english_vocabulary.txt",
    repo_type="model",
    local_dir="./",
)
|
61 |
+
|
62 |
+
# Read one vocabulary entry per line, stripping surrounding whitespace.
# The `with` statement closes each file on exit, so no explicit `close()`
# call is needed.
with open(portuguese_vocabulary_path, encoding='utf-8', errors='backslashreplace') as fp:
    portuguese_vocab = [line.strip() for line in fp]

with open(english_vocabulary_path, encoding='utf-8', errors='backslashreplace') as fp:
    english_vocab = [line.strip() for line in fp]
|
69 |
+
|
70 |
+
# Rebuild the two text vectorizers from the downloaded vocabularies.
# The target (Portuguese) layer uses `custom_standardization` and emits 21
# positions; the source (English) layer keeps the layer's default
# standardization and emits 20 positions.
target_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20000,
    output_mode="int",
    output_sequence_length=21,
    standardize=custom_standardization,
    vocabulary=portuguese_vocab,
)

source_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20000,
    output_mode="int",
    output_sequence_length=20,
    vocabulary=english_vocab,
)
|
81 |
+
|
82 |
+
# Map each integer token id (its position in the vocabulary list) to the
# corresponding Portuguese word.
portuguese_index_lookup = dict(enumerate(portuguese_vocab))

# Upper bound on the number of tokens generated for one translation.
max_decoded_sentence_length = 20
|
85 |
+
|
86 |
+
def decode_sequence(input_sentence):
    """Translate a sentence with the seq2seq RNN, one token at a time.

    Args:
        input_sentence (str): the source sentence to translate.

    Returns:
        str: the generated sentence. It always begins with "[start]" and
        ends with "[end]" if the model emits that token within
        `max_decoded_sentence_length` steps.
    """
    source_tokens = source_vectorization([input_sentence])
    translation = "[start]"

    for step in range(max_decoded_sentence_length):
        # Re-vectorize everything generated so far and feed it back to the
        # model together with the source sentence.
        target_tokens = target_vectorization([translation])
        predictions = seq2seq_rnn.predict([source_tokens, target_tokens], verbose=0)
        # Greedy choice: take the highest-scoring token at the current step.
        best_index = np.argmax(predictions[0, step, :])
        word = portuguese_index_lookup[best_index]
        translation += " " + word
        if word == "[end]":
            break
    return translation
|
109 |
+
|
110 |
+
# Sample English inputs used to demonstrate the model.
eng_sentences = [
    "What is its name?",
    "How old are you?",
    "I know you know where Mary is.",
    "We will show Tom.",
    "What do you all do?",
    "Don't do it!",
]

# Print each sentence followed by its translation and a separator line.
for sentence in eng_sentences:
    print(f"English sentence:\n{sentence}")
    translation = decode_sequence(sentence)
    print(f'Portuguese translation:\n{translation}')
    print('-' * 50)
|
121 |
+
```
|
122 |
+
|
123 |
+
This will output the following:
|
124 |
+
```
|
125 |
+
English sentence:
|
126 |
+
What is its name?
|
127 |
+
Portuguese translation:
|
128 |
+
[start] qual é o nome [end]
|
129 |
+
--------------------------------------------------
|
130 |
+
English sentence:
|
131 |
+
How old are you?
|
132 |
+
Portuguese translation:
|
133 |
+
[start] quantos anos você tem [end]
|
134 |
+
--------------------------------------------------
|
135 |
+
English sentence:
|
136 |
+
I know you know where Mary is.
|
137 |
+
Portuguese translation:
|
138 |
+
[start] eu sei que você sabe onde maria está [end]
|
139 |
+
--------------------------------------------------
|
140 |
+
English sentence:
|
141 |
+
We will show Tom.
|
142 |
+
Portuguese translation:
|
143 |
+
[start] nós vamos tom [end]
|
144 |
+
--------------------------------------------------
|
145 |
+
English sentence:
|
146 |
+
What do you all do?
|
147 |
+
Portuguese translation:
|
148 |
+
[start] o que vocês faz [end]
|
149 |
+
--------------------------------------------------
|
150 |
+
English sentence:
|
151 |
+
Don't do it!
|
152 |
+
Portuguese translation:
|
153 |
+
[start] não faça isso [end]
|
154 |
+
--------------------------------------------------
|
155 |
+
```
|
156 |
+
|
157 |
+
# Cite as 🤗
|
158 |
+
```
|
159 |
+
@misc{teenytinycastle,
|
160 |
+
doi = {10.5281/zenodo.7112065},
|
161 |
+
url = {https://huggingface.co/AiresPucrs/GRU-eng-por},
|
162 |
+
author = {Nicholas Kluge Corr{\^e}a},
|
163 |
+
title = {Teeny-Tiny Castle},
|
164 |
+
year = {2023},
|
165 |
+
publisher = {HuggingFace},
|
166 |
+
journal = {HuggingFace repository},
|
167 |
+
}
|
168 |
+
```
|
169 |
+
## License
|
170 |
+
The GRU-eng-por model is licensed under the Apache License, Version 2.0. See the LICENSE file for more details.
|