hash-map committed · verified
Commit 9d11688 · 1 Parent(s): 988f0c9

Upload 10 files

.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ full_transformer[[:space:]](1).keras filter=lfs diff=lfs merge=lfs -text
+ full_transformer[[:space:]](2).keras filter=lfs diff=lfs merge=lfs -text
+ full_transformer.keras filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,19 @@
+ from model import decode_sequence
+ import gradio as gr
+ 
+ def translate_english_to_telugu(english_sentence):
+     telugu_text, token_ids = decode_sequence(english_sentence)
+     return telugu_text, token_ids
+ 
+ iface = gr.Interface(
+     fn=translate_english_to_telugu,
+     inputs=gr.Textbox(label="Enter English Sentence"),
+     outputs=[
+         gr.Textbox(label="Translated Telugu Text"),
+         gr.Textbox(label="Token IDs")
+     ],
+     title="English to Telugu Translator",
+     description="Enter an English sentence to get its Telugu translation and token IDs."
+ )
+ 
+ # Launch the interface
+ if __name__ == "__main__":
+     iface.launch()
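For reference, the translation function exposed by this interface can also be called directly in Python, without the Gradio UI — a minimal sketch, assuming model.py and the .keras / .model files from this commit sit next to the script:

    from model import decode_sequence
    telugu_text, token_ids = decode_sequence("He had tested positive for coronavirus")
    print(telugu_text)
    print(token_ids)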
full_transformer (1).keras ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c3c7cbdfd37fa4d1764dbfb0e9199dc37dfd51383070212c9be024ac513472c
+ size 281360984
full_transformer (2).keras ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a008b7eca65e919a4b0b8d1a6913bdca39ad4a378bbae1bf17e7b04b107f74d9
+ size 281360984
full_transformer.keras ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f06503049fc5b8f390e42755b9fd5cbe71f34f61d7cffcf351689cfd72375db8
+ size 281361169
model.py ADDED
@@ -0,0 +1,355 @@
+ import gradio as gr
+ import tensorflow as tf
+ import tensorflow_text as tf_text
+ import sentencepiece as spm
+ import numpy as np
+ from tensorflow import keras
+ from tensorflow.keras import layers
+ import os
+ 
+ text_pairs = [
+     ("Farmers fear that the elephant will destroy the crops", "వర్షాలకు చేతికి వచ్చిన పంట దెబ్బతిన్నదని రైతులు వాపోతున్నారు"),
+     ("The death toll in the state stands at 9,863", "దీంతో రాష్ట్రంలో ఇప్పటి వరకు మొత్తం డిశ్చార్జ్‌ల సంఖ్య 9,15,626కి చేరింది"),
+     ("Koo is available in Hindi, Kannada, Telugu, Tamil, Bengali, Gujarati and Marathi", "ప్రశ్నలతో రూపొందించిన వీడియోలు మాత్రం ఆంగ్లం, హిందీ, మరాఠీ, కన్నడ, గుజరాతీ, బెంగాల్ భాషల్లో చూడోచ్చు"),
+     ("How can the court direct the government to do this?", "ప్రభుత్వం ఎలా వ్యవహరించి ఉండాల్సింది?"),
+     ("America is safer today", "అమెరికాలో పరిస్థితి రోజురోజుకూ దారుణంగా మారుతోంది"),
+     ("I don't look into that, to be president", "నేను ముఖ్యమంత్రిని కావాలని అనుకోలేదన్నారు"),
+     ("He had tested positive for coronavirus", "కరోనా లక్షణాలు కనిపించడంతో టెస్ట్ చేసుకున్న ఆయనకు పాజిటివ్ గా నిర్దారణ అయ్యింది"),
+     ("New Delhi: Amid the novel coronavirus situation in the country, locals in Delhi are taking precautionary measures in Delhi", "న్యూడిల్లీ: దేశవ్యాప్తంగా కరోనా మహమ్మారి విజృంభిస్తున్న నేపథ్యంలో కేంద్ర ప్రభుత్వం మరింత అప్రమత్తమైంది"),
+     ("She was rescued yesterday and admitted to a hospital", "శనివారం నాడు ఆమె ఆసుపత్రి నుండి డిశ్చార్జ్ అయ్యారు")
+ ]
+ 
+ # -----------------------
+ # 3. Load SentencePiece models in TensorFlow
+ # -----------------------
+ def load_spm(path):
+     with open(path, "rb") as f:
+         return f.read()
+ 
+ spm_model_en = load_spm("spm_en.model")
+ spm_model_te = load_spm("spm_te.model")
+ 
+ tokenizer_en = tf_text.SentencepieceTokenizer(model=spm_model_en)
+ tokenizer_te = tf_text.SentencepieceTokenizer(model=spm_model_te)
+ 
+ # -----------------------
+ # 4. Encode text pairs
+ # -----------------------
+ sequence_length = 50
+ 
+ def encode_source(texts):
+     return tokenizer_en.tokenize(texts).to_tensor(shape=(None, sequence_length))
+ 
+ def encode_target(texts):
+     return tokenizer_te.tokenize(texts).to_tensor(shape=(None, sequence_length + 1))
+ 
+ # Example: build dataset
+ english_texts = [pair[0] for pair in text_pairs]
+ telugu_texts = [pair[1] for pair in text_pairs]
+ 
+ X = encode_source(tf.constant(english_texts))
+ Y = encode_target(tf.constant(telugu_texts))
+ 
+ import random
+ for i in range(5):
+     print(random.choice(text_pairs))
+ len(text_pairs)
+ 
+ # Wrap each target sentence with the [start] / [end] markers used during decoding
+ for idx in range(len(text_pairs)):
+     english, telugu = text_pairs[idx]
+     telugu = "[start] " + telugu + " [end]"
+     text_pairs[idx] = (english, telugu)
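+ # Note: to_tensor() pads each tokenized batch with 0s up to sequence_length; id 0 is
+ # treated as padding and is ignored by masked_loss / masked_accuracy further below.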
+ class TransformerDecoder(layers.Layer):
+     def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
+         super().__init__(**kwargs)
+         self.embed_dim = embed_dim
+         self.dense_dim = dense_dim
+         self.num_heads = num_heads
+         self.attention_1 = layers.MultiHeadAttention(
+             num_heads=num_heads, key_dim=embed_dim)
+         self.attention_2 = layers.MultiHeadAttention(
+             num_heads=num_heads, key_dim=embed_dim)
+         self.dense_proj = keras.Sequential(
+             [layers.Dense(dense_dim, activation="relu"),
+              layers.Dense(embed_dim),]
+         )
+         self.layernorm_1 = layers.LayerNormalization()
+         self.layernorm_2 = layers.LayerNormalization()
+         self.layernorm_3 = layers.LayerNormalization()
+         self.supports_masking = True
+ 
+     def get_config(self):
+         config = super().get_config()
+         config.update({
+             "embed_dim": self.embed_dim,
+             "num_heads": self.num_heads,
+             "dense_dim": self.dense_dim,
+         })
+         return config
+ 
+     def get_causal_attention_mask(self, inputs):
+         input_shape = tf.shape(inputs)
+         batch_size, sequence_length = input_shape[0], input_shape[1]
+         i = tf.range(sequence_length)[:, tf.newaxis]
+         j = tf.range(sequence_length)
+         mask = tf.cast(i >= j, dtype="int32")
+         mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
+         mult = tf.concat(
+             [tf.expand_dims(batch_size, -1),
+              tf.constant([1, 1], dtype=tf.int32)], axis=0)
+         return tf.tile(mask, mult)
+ 
+     def call(self, inputs, encoder_outputs, mask=None):
+         causal_mask = self.get_causal_attention_mask(inputs)
+         if mask is not None:
+             padding_mask = tf.cast(
+                 mask[:, tf.newaxis, :], dtype="int32")
+             padding_mask = tf.minimum(padding_mask, causal_mask)
+         else:
+             padding_mask = mask
+         attention_output_1 = self.attention_1(
+             query=inputs,
+             value=inputs,
+             key=inputs,
+             attention_mask=causal_mask)
+         attention_output_1 = self.layernorm_1(inputs + attention_output_1)
+         attention_output_2 = self.attention_2(
+             query=attention_output_1,
+             value=encoder_outputs,
+             key=encoder_outputs,
+             attention_mask=padding_mask,
+         )
+         attention_output_2 = self.layernorm_2(
+             attention_output_1 + attention_output_2)
+         proj_output = self.dense_proj(attention_output_2)
+         return self.layernorm_3(attention_output_2 + proj_output)
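+ # The causal mask built above is lower-triangular; for a target length of 4 it is
+ # [[1, 0, 0, 0],
+ #  [1, 1, 0, 0],
+ #  [1, 1, 1, 0],
+ #  [1, 1, 1, 1]]
+ # so position t can only attend to positions <= t during decoder self-attention.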
+ import tensorflow as tf
+ from tensorflow import keras
+ from tensorflow.keras import layers
+ 
+ # Define the PositionalEmbedding layer
+ class PositionalEmbedding(layers.Layer):
+     def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
+         super().__init__(**kwargs)
+         self.token_embeddings = layers.Embedding(
+             input_dim=vocab_size, output_dim=embed_dim
+         )
+         self.position_embeddings = layers.Embedding(
+             input_dim=sequence_length, output_dim=embed_dim
+         )
+         self.sequence_length = sequence_length
+         self.vocab_size = vocab_size
+         self.embed_dim = embed_dim
+ 
+     def call(self, inputs):
+         length = tf.shape(inputs)[-1]
+         positions = tf.range(start=0, limit=length, delta=1)
+         embedded_tokens = self.token_embeddings(inputs)
+         embedded_positions = self.position_embeddings(positions)
+         return embedded_tokens + embedded_positions
+ 
+     def compute_mask(self, inputs, mask=None):
+         # Properly handle mask computation within Keras
+         if mask is None:
+             return None
+         return mask
+ 
+     def get_config(self):
+         config = super().get_config()
+         config.update({
+             "sequence_length": self.sequence_length,
+             "vocab_size": self.vocab_size,
+             "embed_dim": self.embed_dim,
+         })
+         return config
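+ # Shapes in call(): embedded_tokens is (batch, seq_len, embed_dim) while
+ # embedded_positions is (seq_len, embed_dim); the addition broadcasts the learned
+ # position vectors across the batch, so each token gets token + position embedding.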
+ 
+ # Define the TransformerEncoder layer (example implementation)
+ class TransformerEncoder(layers.Layer):
+     def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
+         super().__init__(**kwargs)
+         self.embed_dim = embed_dim
+         self.dense_dim = dense_dim
+         self.num_heads = num_heads
+         self.attention = layers.MultiHeadAttention(
+             num_heads=num_heads, key_dim=embed_dim
+         )
+         self.dense_proj = keras.Sequential([
+             layers.Dense(dense_dim, activation="relu"),
+             layers.Dense(embed_dim),
+         ])
+         self.layernorm_1 = layers.LayerNormalization()
+         self.layernorm_2 = layers.LayerNormalization()
+ 
+     def call(self, inputs, mask=None):
+         if mask is not None:
+             mask = mask[:, tf.newaxis, :]
+         attention_output = self.attention(inputs, inputs, attention_mask=mask)
+         proj_input = self.layernorm_1(inputs + attention_output)
+         proj_output = self.dense_proj(proj_input)
+         return self.layernorm_2(proj_input + proj_output)
+ 
+     def get_config(self):
+         config = super().get_config()
+         config.update({
+             "embed_dim": self.embed_dim,
+             "dense_dim": self.dense_dim,
+             "num_heads": self.num_heads,
+         })
+         return config
+ 
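+ # Unlike the decoder, the encoder applies only the padding mask here: every source
+ # position may attend to every other non-padding position in the sentence.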
+ 
+ import sentencepiece as spm
+ sp_te = spm.SentencePieceProcessor(model_file="spm_te.model")
+ 
+ def decode_ids(ids):
+     return sp_te.decode(ids)
+ 
+ 
+ import tensorflow as tf
+ from tensorflow import keras
+ 
+ loss_object = keras.losses.SparseCategoricalCrossentropy(
+     from_logits=True, reduction="none"
+ )
+ 
+ def masked_loss(y_true, y_pred):
+     # Normal sparse CE (batch, seq_len)
+     loss_ = loss_object(y_true, y_pred)
+ 
+     # Create mask (ignore pad = 0)
+     mask = tf.cast(tf.not_equal(y_true, 0), loss_.dtype)
+ 
+     # Apply mask
+     loss_ = loss_ * mask
+ 
+     # Return mean only over non-masked tokens
+     return tf.reduce_sum(loss_) / tf.reduce_sum(mask)
+ 
+ def masked_accuracy(y_true, y_pred):
+     y_pred = tf.argmax(y_pred, axis=-1, output_type=y_true.dtype)
+ 
+     matches = tf.cast(tf.equal(y_true, y_pred), tf.float32)
+     mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
+ 
+     return tf.reduce_sum(matches * mask) / tf.reduce_sum(mask)
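+ # Example: for y_true = [12, 7, 0, 0] the mask is [1, 1, 0, 0], so only the two real
+ # tokens contribute to the loss and accuracy; padded positions are averaged out.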
+ 
+ # Load the saved Transformer models with their custom layers and metrics
+ transformer = keras.models.load_model(
+     "full_transformer.keras",
+     custom_objects={
+         "TransformerEncoder": TransformerEncoder,
+         "PositionalEmbedding": PositionalEmbedding,
+         "TransformerDecoder": TransformerDecoder,
+         "masked_loss": masked_loss,
+         "masked_accuracy": masked_accuracy
+     }
+ )
+ # Note: this second load reassigns `transformer`, so the weights from
+ # "full_transformer (2).keras" are the ones used under that name below.
+ transformer = keras.models.load_model(
+     "full_transformer (2).keras",
+     custom_objects={
+         "TransformerEncoder": TransformerEncoder,
+         "PositionalEmbedding": PositionalEmbedding,
+         "TransformerDecoder": TransformerDecoder,
+         "masked_loss": masked_loss,
+         "masked_accuracy": masked_accuracy
+     }
+ )
+ transformer2 = keras.models.load_model(
+     "full_transformer (1).keras",
+     custom_objects={
+         "TransformerEncoder": TransformerEncoder,
+         "PositionalEmbedding": PositionalEmbedding,
+         "TransformerDecoder": TransformerDecoder,
+         "masked_loss": masked_loss,
+         "masked_accuracy": masked_accuracy
+     }
+ )
+ transformer3 = keras.models.load_model(
+     "full_transformer.keras",
+     custom_objects={
+         "TransformerEncoder": TransformerEncoder,
+         "PositionalEmbedding": PositionalEmbedding,
+         "TransformerDecoder": TransformerDecoder,
+         "masked_loss": masked_loss,
+         "masked_accuracy": masked_accuracy
+     }
+ )
+ 
+ def decode_tokens(token_ids):
+     # token_ids: tf.Tensor of shape (seq_len,)
+     token_ids = tf.expand_dims(token_ids, 0)      # add batch dim
+     decoded = tokenizer_te.detokenize(token_ids)  # returns tf.Tensor of shape (1,)
+     return decoded[0].numpy().decode("utf-8")
+ 
+ import tensorflow as tf
+ import numpy as np
+ 
+ def encode_source(texts):
+     return tokenizer_en.tokenize(texts).to_tensor(shape=(None, sequence_length))
+ 
+ # Modified decode_sequence to return tokens and text
+ def decode_sequence(input_sentence, t=transformer, max_len=50):
+     tokenized_input = encode_source([input_sentence])
+ 
+     # Initialize sequence with start token
+     start_id = tokenizer_te.string_to_id('[start]').numpy()
+     end_id = tokenizer_te.string_to_id('[end]').numpy()
+     seq = [start_id]
+ 
+     for _ in range(max_len):
+         if seq[-1] == end_id:
+             break
+ 
+         tgt = tf.expand_dims(seq, 0)
+         predictions = t([tokenized_input, tgt])
+ 
+         # Get probabilities for the last predicted token
+         probs = tf.nn.softmax(predictions[0, len(seq) - 1, :]).numpy()
+         next_id = np.argmax(probs)  # Select most probable token (greedy decoding)
+         seq.append(int(next_id))
+ 
+     # Decode sequence to text
+     decoded = tokenizer_te.detokenize(tf.constant([seq])).numpy()[0]
+     decoded_text = decoded.decode("utf-8").replace("[start]", "").replace("[end]", "").strip()
+ 
+     return decoded_text, seq
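+ # decode_sequence therefore returns a (telugu_text, token_ids) pair: token_ids starts
+ # with the [start] id and grows by one greedily chosen token per step until [end] is
+ # produced or max_len tokens have been generated.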
+ 
+ max_decoded_sentence_length = 50
+ 
+ # Evaluate some random samples
+ test_eng_texts = [pair[0] for pair in text_pairs]
+ final_pairs = [pair[1] for pair in text_pairs]
+ 
+ for _ in range(5):
+     idx = random.randint(0, len(test_eng_texts) - 1)
+     input_sentence = test_eng_texts[idx]
+     decoded, decoded_ids = decode_sequence(input_sentence, transformer)
+     original = final_pairs[idx].replace("[start]", "").replace("[end]", "").strip()
+ 
+ idx = random.randint(0, len(test_eng_texts) - 1)
+ input_sentence = test_eng_texts[idx]
+ decoded, decoded_ids = decode_sequence(input_sentence, transformer3)
+ original = final_pairs[idx].replace("[start]", "").replace("[end]", "").strip()
+ 
+ # BLEU expects tokenized sentences
+ original_tokens = tokenizer_te.tokenize([original]).numpy()[0]
+ decoded_tokens = tokenizer_te.tokenize([decoded]).numpy()[0]
+ print("original tokens:", original_tokens)
+ print("decoded_tokens", decoded_tokens)
+ print(original)
+ print(decoded)
+ 
+ # Example decoding
+ print(decode_sequence("your response to the question is not good you need to improve and this is order not request", transformer3))
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ tensorflow>=2.10.0
+ tensorflow-text>=2.10.0
+ sentencepiece>=0.1.99
+ gradio>=4.0.0
+ numpy>=1.21.0
spm_en.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5dcafc10d182ac22a23d958199332fa83744c20debeb46b0bfcb3c7187401dd
+ size 779337
spm_en.vocab ADDED
The diff for this file is too large to render. See raw diff
 
spm_te.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dec2e4d652d0ca75217f5d31212267ff042048a8a97dbb462acd07983a15bbfd
+ size 1207994
spm_te.vocab ADDED
The diff for this file is too large to render. See raw diff