alifalhasan committed
Commit: b1c38c2
Parent(s): 9a4dd2c

[Task] Minor Update

[Description] Added code comments and fixed the dependency bug in requirements.txt (the spaCy English model en_core_web_sm is now installed from a pinned wheel)
[Author] @alifalhasan

app.py CHANGED
@@ -2,23 +2,23 @@ import gradio as gr
 
 from src.translation.translate import translate
 
-LANGS = ["arabic", "english"]
+LANGS = ["arabic", "english"]  # Define a list of supported languages
 
 if __name__ == "__main__":
     # Create the Gradio interface
     iface = gr.Interface(
-        fn=translate,
+        fn=translate,  # Specify the translation function as the main function
         inputs=[
-            gr.components.Textbox(label="Text"),
-            gr.components.Dropdown(label="Source Language", choices=LANGS),
-            gr.components.Dropdown(label="Target Language", choices=LANGS),
+            gr.components.Textbox(label="Text"),  # Add a textbox input for entering text
+            gr.components.Dropdown(label="Source Language", choices=LANGS),  # Add a dropdown for selecting source language
+            gr.components.Dropdown(label="Target Language", choices=LANGS),  # Add a dropdown for selecting target language
         ],
-        outputs=["text"],
-        examples=[["I'm ready", "english", "arabic"]],
-        cache_examples=False,
-        title="arabic2english",
-        description="This is a translator app for arabic and english. Currently supports only english to arabic."
+        outputs=["text"],  # Define the output type as text
+        examples=[["I'm ready", "english", "arabic"]],  # Provide an example input for demonstration
+        cache_examples=False,  # Disable caching of examples
+        title="arabic2english",  # Set the title of the interface
+        description="This is a translator app for arabic and english. Currently supports only english to arabic."  # Add a description of the interface
     )
 
     # Launch the interface
-    iface.launch(share=True)
+    iface.launch(share=True)  # Launch the interface and enable sharing
requirements.txt CHANGED
@@ -1,7 +1,8 @@
 gradio
 torch>=1.6
 torchtext==0.6
-spacy
 transformers
 nltk
-pandas
+pandas
+spacy
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
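
Note: pinning the en_core_web_sm wheel in requirements.txt means pip install -r requirements.txt also installs the spaCy English model, so the spacy.load("en_core_web_sm") call in src/data_processing/data_processing.py no longer requires a separate "python -m spacy download en_core_web_sm" step. A minimal sanity check after installing (the sample sentence is illustrative only):

import spacy

# The pinned wheel from requirements.txt provides this model package;
# this mirrors the load performed in src/data_processing/data_processing.py.
spacy_eng = spacy.load("en_core_web_sm")

# Tokenize a sample sentence the same way engTokenizer does.
print([word.text for word in spacy_eng.tokenizer("I'm ready")])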
src/data_processing/data_processing.py CHANGED
@@ -7,23 +7,46 @@ from torchtext import data
 from spacy.lang.ar import Arabic
 from spacy.tokenizer import Tokenizer
 
+# Reading data into a pandas DataFrame
 df = pd.read_csv(
     "data/arabic2english.txt",
     delimiter="\t",
     names=["eng", "ar"],
 )
 
+# Loading English language model from spaCy
 spacy_eng = spacy.load("en_core_web_sm")
 
+# Creating an instance of Arabic language model from spaCy
 arab = Arabic()
+
+# Creating a tokenizer for Arabic text using the Arabic language model
 ar_Tokenizer = Tokenizer(arab.vocab)
 
 
 def engTokenizer(text):
+    """
+    Tokenizes English text using spaCy tokenizer.
+
+    Args:
+        text (str): The input English text.
+
+    Returns:
+        list: List of tokens.
+    """
     return [word.text for word in spacy_eng.tokenizer(text)]
 
 
 def arTokenizer(sentence):
+    """
+    Tokenizes an Arabic sentence using spaCy tokenizer.
+
+    Args:
+        sentence (str): The input Arabic sentence.
+
+    Returns:
+        list: List of tokens.
+    """
     return [
         word.text
         for word in ar_Tokenizer(
@@ -32,6 +55,7 @@ def arTokenizer(sentence):
     ]
 
 
+# Defining fields for source and target languages using torchtext
 SRC = data.Field(
     tokenize=engTokenizer, batch_first=False, init_token="<sos>", eos_token="<eos>"
 )
@@ -45,6 +69,20 @@ TRG = data.Field(
 
 
 class TextDataset(data.Dataset):
+    """
+    Custom dataset class for text data.
+
+    Args:
+        df (pandas.DataFrame): DataFrame containing source and target language data.
+        src_field (torchtext.data.Field): Field for source language.
+        target_field (torchtext.data.Field): Field for target language.
+        is_test (bool): Flag indicating if the dataset is for testing.
+
+    Attributes:
+        fields (list): List of tuples containing field names and corresponding Field objects.
+        samples (list): List of data examples.
+
+    """
 
     def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
         fields = [("eng", src_field), ("ar", target_field)]
@@ -57,17 +95,35 @@ class TextDataset(data.Dataset):
         super().__init__(samples, fields, **kwargs)
 
     def __len__(self):
+        """
+        Get the number of samples in the dataset.
+
+        Returns:
+            int: Number of samples.
+        """
         return len(self.samples)
 
     def __getitem__(self, idx):
+        """
+        Get a sample from the dataset.
+
+        Args:
+            idx (int): Index of the sample.
+
+        Returns:
+            torchtext.data.Example: Sample at the specified index.
+        """
         return self.samples[idx]
 
 
+# Creating a TextDataset instance
 torchdataset = TextDataset(df, SRC, TRG)
 
+# Splitting the dataset into training and validation sets
 train_data, valid_data = torchdataset.split(
     split_ratio=0.8, random_state=random.seed(32)
 )
 
+# Building vocabularies for source and target languages
 SRC.build_vocab(train_data, min_freq=2)
 TRG.build_vocab(train_data, min_freq=2)
src/train/train.py CHANGED
@@ -7,7 +7,6 @@ from torchtext import data
 from transformer import Transformer
 
 import sys
-
 sys.path.append(os.path.abspath("src/data_processing/"))
 from data_processing import (
     SRC,
@@ -16,11 +15,13 @@ from data_processing import (
     valid_data,
 )
 
+# Setting the device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 """Hyperparameters"""
 BATCH_SIZE = 16
 
+# Creating data iterators for training and validation sets
 train_iter, valid_iter = data.BucketIterator.splits(
     (train_data, valid_data),
     batch_size=BATCH_SIZE,
@@ -30,12 +31,12 @@ train_iter, valid_iter = data.BucketIterator.splits(
     device=device,
     shuffle=True,
 )
-load_model = False
-save_model = True
 
+# Training parameters
 num_epochs = 30
 learning_rate = 0.0001
 
+# Transformer model hyperparameters
 num_heads = 8
 num_encoder_layers = 3
 num_decoder_layers = 3
@@ -45,14 +46,14 @@ dropout = 0.4
 embedding_size = 256
 src_pad_idx = SRC.vocab.stoi["<pad>"]
 
-
+# Vocabulary sizes
 src_vocab_size = len(SRC.vocab)
-print("Size of english vocabulary:", src_vocab_size)
+print("Size of English vocabulary:", src_vocab_size)
 
 trg_vocab_size = len(TRG.vocab)
-print("Size of arabic vocabulary:", trg_vocab_size)
-
+print("Size of Arabic vocabulary:", trg_vocab_size)
 
+# Creating the Transformer model
 model = Transformer(
     embedding_size,
     src_vocab_size,
@@ -66,38 +67,43 @@ model = Transformer(
     device=device,
 ).to(device)
 
-loss_track = []
-loss_validation_track = []
-
+# Lists to track training and validation losses
+train_loss = []
+validation_loss = []
 
+# Optimizer definition
 optimizer = optim.Adam(model.parameters(), lr=learning_rate)
 
+# Criterion for loss calculation
 pad_idx = SRC.vocab.stoi["<pad>"]
 criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
+
+# Main training loop
 for epoch in range(num_epochs):
     stepLoss = []
-    model.train()
+    model.train()  # Set the model to training mode
     for batch in train_iter:
         input_data = batch.eng.to(device)
         target = batch.ar.to(device)
 
-        output = model(input_data, target[:-1])
-        optimizer.zero_grad()
-
+        output = model(input_data, target[:-1])  # Forward pass
+        optimizer.zero_grad()  # Zero the gradients
         output = output.reshape(-1, trg_vocab_size)
         target = target[1:].reshape(-1)
 
-        loss = criterion(output, target)
-        loss.backward()
+        loss = criterion(output, target)  # Calculate the loss
+        loss.backward()  # Backpropagation
+        optimizer.step()  # Update the parameters
 
-        optimizer.step()
         stepLoss.append(loss.item())
 
-    loss_track.append(np.mean(stepLoss))
+    train_loss.append(np.mean(stepLoss))
     print(" Epoch {} | Train Cross Entropy Loss: ".format(epoch), np.mean(stepLoss))
+
+    # Validation loop
     with torch.inference_mode():
        stepValidLoss = []
-        model.eval()
+        model.eval()  # Set the model to evaluation mode
        for i, batch in enumerate(valid_iter):
            input_sentence = batch.eng.to(device)
            target = batch.ar.to(device)
@@ -109,7 +115,7 @@ for epoch in range(num_epochs):
 
            stepValidLoss.append(loss.item())
 
-        loss_validation_track.append(np.mean(stepValidLoss))
+        validation_loss.append(np.mean(stepValidLoss))
        print(
            " Epoch {} | Validation Cross Entropy Loss: ".format(epoch),
            np.mean(stepValidLoss),
src/train/transformer.py CHANGED
@@ -33,67 +33,86 @@ class Transformer(nn.Module):
             num_decoder_layers: Number of decoder layers.
             dropout: Dropout probability.
             max_len: Maximum sequence length.
+            device: Device to place tensors on.
         """
 
         super(Transformer, self).__init__()
+        # Embeddings for source and target sequences
         self.src_embeddings = nn.Embedding(src_vocab_size, embedding_size)
         self.src_positional_embeddings = nn.Embedding(max_len, embedding_size)
         self.trg_embeddings = nn.Embedding(trg_vocab_size, embedding_size)
         self.trg_positional_embeddings = nn.Embedding(max_len, embedding_size)
         self.device = device
+        # Transformer layer
         self.transformer = nn.Transformer(
             embedding_size,
             num_heads,
             num_encoder_layers,
             num_decoder_layers,
         )
-
+        # Final fully connected layer
         self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
         self.dropout = nn.Dropout(dropout)
         self.src_pad_idx = src_pad_idx
 
     def make_src_mask(self, src):
-        src_mask = src.transpose(0, 1) == self.src_pad_idx
+        """
+        Creates a mask to ignore padding tokens in the source sequence.
+
+        Args:
+            src: Source sequence tensor.
 
+        Returns:
+            src_mask: Mask tensor.
+        """
+        src_mask = src.transpose(0, 1) == self.src_pad_idx
         return src_mask.to(self.device)
 
     def forward(self, src, trg):
+        """
+        Forward pass of the Transformer model.
+
+        Args:
+            src: Source sequence tensor.
+            trg: Target sequence tensor.
+
+        Returns:
+            out: Output tensor.
+        """
         src_seq_length, S = src.shape
         trg_seq_length, S = trg.shape
-        # adding zeros is an easy way
+        # Generate position indices for source and target sequences
         src_positions = (
             torch.arange(0, src_seq_length)
             .unsqueeze(1)
             .expand(src_seq_length, S)
             .to(self.device)
         )
-
         trg_positions = (
             torch.arange(0, trg_seq_length)
             .unsqueeze(1)
             .expand(trg_seq_length, S)
             .to(self.device)
         )
-
+        # Apply embeddings and dropout for source and target sequences
         embed_src = self.dropout(
             (self.src_embeddings(src) + self.src_positional_embeddings(src_positions))
         )
-
         embed_trg = self.dropout(
             (self.trg_embeddings(trg) + self.trg_positional_embeddings(trg_positions))
         )
-
+        # Generate masks for source padding and target sequences
         src_padding_mask = self.make_src_mask(src)
         trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
             self.device
         )
-
+        # Forward pass through Transformer
         out = self.transformer(
             embed_src,
             embed_trg,
             src_key_padding_mask=src_padding_mask,
             tgt_mask=trg_mask,
         )
+        # Apply final fully connected layer
         out = self.fc_out(out)
-
         return out
src/translation/translate.py CHANGED
@@ -1,6 +1,6 @@
-import torch
 import os
 import sys
+import torch
 
 sys.path.append(os.path.abspath("src/train/"))
 sys.path.append(os.path.abspath("src/data_processing/"))
@@ -10,6 +10,7 @@ from data_processing import SRC, TRG, arTokenizer, engTokenizer
 
 device = "cpu"
 
+# Define model hyperparameters
 num_heads = 8
 num_encoder_layers = 3
 num_decoder_layers = 3
@@ -17,11 +18,12 @@ max_len = 230
 dropout = 0.4
 embedding_size = 256
 
+# Define vocabulary sizes and padding index
 src_pad_idx = SRC.vocab.stoi["<pad>"]
 src_vocab_size = len(SRC.vocab)
 trg_vocab_size = len(TRG.vocab)
 
-# Initialize model with hyperparameters
+# Initialize model with specified hyperparameters
 model = Transformer(
     embedding_size,
     src_vocab_size,
@@ -35,31 +37,47 @@ model = Transformer(
     device=device,
 ).to(device)
 
-# Load the saved model
+# Load the saved model parameters
 model.load_state_dict(torch.load("models/arabic2english.pt", map_location=device))
 
 
 def translate(sentence, srcField, targetField):
-    """Translates an Arabic sentence to English using the model."""
-    model.eval()
-    srcTokenizer = engTokenizer
-    srcField = SRC
-    targetField = TRG
-    processed_sentence = srcField.process([srcTokenizer(sentence)]).to(device)
-    trg = ["بداية"]
+    """
+    Translates an English sentence to Arabic using the Transformer model.
+
+    Args:
+        sentence (str): Input English sentence to be translated.
+        srcField: Source language field.
+        targetField: Target language field.
+
+    Returns:
+        str: Translated Arabic sentence.
+    """
+    model.eval()  # Set model to evaluation mode
+    srcTokenizer = engTokenizer  # Initialize source tokenizer
+    srcField = SRC  # Set source language field to English
+    targetField = TRG  # Set target language field to Arabic
+    processed_sentence = srcField.process([srcTokenizer(sentence)]).to(
+        device
+    )  # Process input sentence
+    trg = ["بداية"]  # Initialize target sentence with start token
 
+    # Generate translation
     for _ in range(max_len):
         trg_tensor = (
             torch.tensor([targetField.vocab.stoi[word] for word in trg])
             .unsqueeze(1)
             .to(device)
         )
-        outputs = model(processed_sentence, trg_tensor)
+        outputs = model(processed_sentence, trg_tensor)  # Generate output predictions
 
+        # Determine predicted token
         pred_token = targetField.vocab.itos[outputs.argmax(2)[-1:].item()]
-        if pred_token != "<unk>":
+        if pred_token != "<unk>":  # Exclude unknown tokens
             trg.append(pred_token)
-        if pred_token == "نهاية":
+        if pred_token == "نهاية":  # Stop translation at end token
             break
 
-    return " ".join([word for word in trg if word != "<unk>"][1:-1])
+    return " ".join(
+        [word for word in trg if word != "<unk>"][1:-1]
+    )  # Return translated sentence
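
For reference, a minimal sketch of calling the updated translate() directly, outside Gradio, assuming the repository layout above and trained weights at models/arabic2english.pt. The language arguments mirror the example wired into app.py; since the field arguments are overridden inside the function, they act as placeholders here.

from src.translation.translate import translate

# Same example as in app.py: ["I'm ready", "english", "arabic"]
print(translate("I'm ready", "english", "arabic"))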