dieineb commited on
Commit
65e0eb6
1 Parent(s): 345ebb8

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +149 -1
README.md CHANGED
@@ -1,3 +1,151 @@
1
  ---
2
- license: mit
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ license: apache-2.0
3
+ library_name: transformers
4
  ---
5
+ # Model Description
6
+
7
+ The transformer-eng-por model is a sequence-to-sequence Transformer used to translate English sentences into Portuguese. More information is needed.
8
+
9
+ The model was trained with the rmsprop optimizer (learning rate 0.001) for 30 steps with a batch size of 32. More information is needed.
10
+
11
+ ## Details
12
+ - Size: 23,805,216 parameters
13
+ - Dataset:
14
+ - Languages: English, Portuguese
15
+ - Number of Training Steps: 30
16
+ - Batch size: 32
17
+ - Optimizer: rmsprop
18
+ - Learning Rate: 0.001
19
+ - GPU: T4
20
+ - This repository has the source [code used](https://github.com/Nkluge-correa/teeny-tiny_castle/blob/master/ML%20Intro%20Course/16_sequence_to_sequence.ipynb) to train this model.
21
+
22
+ ## Usage
23
+
24
+ ```python
25
# Dependencies for the usage example.
import re
import string

import keras
import numpy as np
import tensorflow as tf
# hf_hub_download is called below to fetch files from the Hub; the original
# example used it without importing it.
from huggingface_hub import hf_hub_download

# Punctuation to strip during text standardization. "[" and "]" are kept
# because the target sequences use the special tokens "[start]" and "[end]".
strip_chars = string.punctuation.replace("[", "").replace("]", "")
34
+
35
+
36
def custom_standardization(input_string):
    """Lowercase *input_string* and strip punctuation (square brackets kept)."""
    lowered = tf.strings.lower(input_string)
    punctuation_pattern = f"[{re.escape(strip_chars)}]"
    return tf.strings.regex_replace(lowered, punctuation_pattern, "")
39
+
40
# Download the module that defines the custom Transformer layers. The value
# returned is the local path of keras_transformer_blocks.py — the original
# variable name ("portuguese_vocabulary_path") was misleading.
transformer_blocks_path = hf_hub_download(
    repo_id="AiresPucrs/transformer-eng-por",
    filename="keras_transformer_blocks.py",
    repo_type="model",
    local_dir="./")

from keras_transformer_blocks import TransformerEncoder, PositionalEmbedding, TransformerDecoder
47
+
48
# Load the trained model. The custom layer classes must be supplied so Keras
# can deserialize them from the saved file.
transformer = keras.models.load_model(
    "/content/transformer-eng-por/transformer-eng-por.h5",
    custom_objects={"TransformerEncoder": TransformerEncoder,
                    "PositionalEmbedding": PositionalEmbedding,
                    "TransformerDecoder": TransformerDecoder})

# Read the saved vocabularies (one token per line). The `with` statement
# closes each file automatically, so the explicit fp.close() calls in the
# original example were redundant and have been removed.
with open('portuguese_vocabulary.txt', encoding='utf-8', errors='backslashreplace') as fp:
    portuguese_vocab = [line.strip() for line in fp]

with open('english_vocabulary.txt', encoding='utf-8', errors='backslashreplace') as fp:
    english_vocab = [line.strip() for line in fp]
60
+
61
+
62
# Portuguese (target) vectorizer. Uses one extra output position (21) —
# the decoder input below drops the last position with [:, :-1].
target_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20000,
    output_mode="int",
    output_sequence_length=21,
    standardize=custom_standardization,
    vocabulary=portuguese_vocab)

# English (source) vectorizer.
source_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20000,
    output_mode="int",
    output_sequence_length=20,
    vocabulary=english_vocab)

# Map token ids back to Portuguese tokens for decoding.
portuguese_index_lookup = dict(enumerate(portuguese_vocab))
max_decoded_sentence_length = 20
75
+
76
+
77
def decode_sequence(input_sentence):
    """Greedily translate *input_sentence* (English) into Portuguese.

    Generates one token per step until "[end]" is produced or the maximum
    target length is reached; returns the decoded string including the
    "[start]"/"[end]" markers.
    """
    encoded_source = source_vectorization([input_sentence])
    translation = "[start]"

    for step in range(max_decoded_sentence_length):
        # Re-vectorize the partial translation, dropping the final position
        # so the decoder input keeps the length the model expects.
        partial_target = target_vectorization([translation])[:, :-1]
        step_predictions = transformer([encoded_source, partial_target])
        next_token_id = np.argmax(step_predictions[0, step, :])
        next_token = portuguese_index_lookup[next_token_id]
        translation += " " + next_token
        if next_token == "[end]":
            break
    return translation
90
+
91
+
92
# Demonstrate the model on a few English sentences.
eng_sentences = [
    "What is its name?",
    "How old are you?",
    "I know you know where Mary is.",
    "We will show Tom.",
    "What do you all do?",
    "Don't do it!",
]

for sentence in eng_sentences:
    translation = decode_sequence(sentence)
    print(f"English sentence:\n{sentence}")
    print(f"Portuguese translation:\n{translation}")
    print("-" * 50)
103
+ ```
104
+ This will output the following:
105
+
106
+ ```text
107
+ English sentence:
108
+ What is its name?
109
+ Portuguese translation:
110
+ [start] qual é o nome dele [end]
111
+ --------------------------------------------------
112
+ English sentence:
113
+ How old are you?
114
+ Portuguese translation:
115
+ [start] quantos anos você tem [end]
116
+ --------------------------------------------------
117
+ English sentence:
118
+ I know you know where Mary is.
119
+ Portuguese translation:
120
+ [start] eu sei que você sabe onde mary está [end]
121
+ --------------------------------------------------
122
+ English sentence:
123
+ We will show Tom.
124
+ Portuguese translation:
125
+ [start] vamos ligar para o tom [end]
126
+ --------------------------------------------------
127
+ English sentence:
128
+ What do you all do?
129
+ Portuguese translation:
130
+ [start] o que vocês todos nós têm feito [end]
131
+ --------------------------------------------------
132
+ English sentence:
133
+ Don't do it!
134
+ Portuguese translation:
135
+ [start] não faça isso [end]
136
+ --------------------------------------------------
137
+ ```
138
+ # Cite as 🤗
139
+ ```
140
+ @misc{teenytinycastle,
141
+ doi = {10.5281/zenodo.7112065},
142
+ url = {https://huggingface.co/AiresPucrs/transformer-eng-por},
143
+ author = {Nicholas Kluge Corr{\^e}a},
144
+ title = {Teeny-Tiny Castle},
145
+ year = {2023},
146
+ publisher = {HuggingFace},
147
+ journal = {HuggingFace repository},
148
+ }
149
+ ```
150
+ ## License
151
+ The transformer-eng-por model is licensed under the Apache License, Version 2.0. See the LICENSE file for more details.