BhuiyanMasum committed on
Commit
ccfa333
1 Parent(s): e13f31a

Updated data.py

Files changed (4)
  1. data/dataset.txt +0 -0
  2. src/pipes/const.py +2 -2
  3. src/pipes/data.py +122 -58
  4. src/pipes/utils.py +0 -27
data/dataset.txt CHANGED
The diff for this file is too large to render. See raw diff
 
src/pipes/const.py CHANGED
@@ -1,3 +1,3 @@
 data_dir: str = "E:/bn_multi_tribe_mt/data/"
-example_count: int = 1270
-langs: list[str] = ['bn', 'en', 'gr']
+langs: list[str] = ['bn', 'en', 'gr']
+MAX_SEQ_LEN = 30
src/pipes/data.py CHANGED
@@ -1,78 +1,137 @@
 import random
-from pipes import utils, const
+import const
+import utils
+import string
 
-
-class Sentence:
+class SequenceLoader:
     def __init__(self):
-        self.sentence_dict = None
-        self.shuffled_sentences = None
+        self.sequence_dict = None
+        self.shuffled_sequences = None
         self.shuffled_indices = None
-        self.sentences = None
+        self.sequences = None
         self.max_seq_length = None
         self.vocab = None
+        self.lang = None
 
-    def pack(self, lang):
-        self.sentences = utils.read_file("{}/raw/{}.txt".format(const.data_dir, lang))
+    def pack(self):
+        self.sequences = utils.read_file("{}/raw/{}.txt".format(const.data_dir, self.lang))
 
-        example_count = len(self.sentences)
-        split_index = int(example_count * 0.80)
+        examples_count = len(self.sequences)
+        split_index = int(examples_count * 0.80)
 
         if self.shuffled_indices is None:
-            self.shuffled_indices = list(range(example_count))
+            self.shuffled_indices = list(range(examples_count))
             random.shuffle(self.shuffled_indices)
 
-        self.shuffled_sentences = [self.sentences[i] for i in self.shuffled_indices]
-
-        for i in range(example_count):
-            self.shuffled_sentences[i] = utils.remove_punctuation(self.shuffled_sentences[i])
-            self.sentences[i] = utils.add_start_end_tags(self.sentences[i])
-
-        self.vocab = utils.build_vocab(self.shuffled_sentences)
-        self.max_seq_length = max(len(sentence.split()) for sentence in self.shuffled_sentences)
-        self.sentence_dict = dict(
-            max_seq_len=self.max_seq_length,
-            vocab_size=len(self.vocab),
-            vocab=self.vocab,
-            train=self.shuffled_sentences[:split_index],
-            val=self.shuffled_sentences[split_index:],
-            count=example_count
+        self.shuffled_sequences = [self.sequences[i] for i in self.shuffled_indices]
+
+        self.sequence_dict = dict(
+            train=self.shuffled_sequences[:split_index],
+            val=self.shuffled_sequences[split_index:],
+            count=examples_count,
         )
 
+    def get_dict(self):
+        return self.sequence_dict
+
+    def set_lang(self, lang):
+        self.lang = lang
+
+
+def remove_punctuation_from_seq(seq):
+    english_punctuations = string.punctuation
+    bangla_punctuations = "৷-–—’‘৳…।"
+    all_punctuations = english_punctuations + bangla_punctuations
+    cleaned_seq = ''.join([char for char in seq if char not in all_punctuations])
+    cleaned_seq = cleaned_seq.strip()
+    cleaned_seq = ' '.join(cleaned_seq.split())
+    return cleaned_seq
+
+
+def add_start_end_tags_seq(sequence):
+    return '<SOS> ' + sequence + ' <EOS>'
+
+
+def pad_sequence(sequence, max_seq_len, padding_token=0):
+    padded_sequence = sequence[:max_seq_len] + [padding_token] * (max_seq_len - len(sequence))
+    return padded_sequence
+
+
+class SequenceProcessor:
+    def __init__(self, _dataset_dict):
+        self.max_seq_len = 0
+        self.lang = None
+        self.dataset_dict = _dataset_dict
+        self.vocab = None
+
+    def remove_punctuation(self):
+        for i in range(len(self.dataset_dict[self.lang]["train"])):
+            self.dataset_dict[self.lang]["train"][i] = remove_punctuation_from_seq(
+                self.dataset_dict[self.lang]["train"][i])
+
+        for i in range(len(self.dataset_dict[self.lang]["val"])):
+            self.dataset_dict[self.lang]["val"][i] = remove_punctuation_from_seq(
+                self.dataset_dict[self.lang]["val"][i])
+
+    def build_vocab(self):
+        vocab = set()
+
+        for i in range(len(self.dataset_dict[self.lang]["train"])):
+            seq = self.dataset_dict[self.lang]["train"][i]
+            vocab.update(seq.split())
+
+        for i in range(len(self.dataset_dict[self.lang]["val"])):
+            seq = self.dataset_dict[self.lang]["val"][i]
+            vocab.update(seq.split())
+
+        self.vocab = sorted(list(vocab))
+        self.dataset_dict[self.lang]["vocab"] = self.vocab
+        self.dataset_dict[self.lang]["vocab_size"] = len(self.vocab)
+
+    def add_start_end_tags(self):
+        for i in range(len(self.dataset_dict[self.lang]["train"])):
+            self.dataset_dict[self.lang]["train"][i] = add_start_end_tags_seq(
+                self.dataset_dict[self.lang]["train"][i])
+            self.max_seq_len = max(len(self.dataset_dict[self.lang]["train"][i].split()), self.max_seq_len)
+
+        for i in range(len(self.dataset_dict[self.lang]["val"])):
+            self.dataset_dict[self.lang]["val"][i] = add_start_end_tags_seq(
+                self.dataset_dict[self.lang]["val"][i])
+            self.max_seq_len = max(len(self.dataset_dict[self.lang]["val"][i].split()), self.max_seq_len)
+
+        self.dataset_dict[self.lang]["max_seq_len"] = self.max_seq_len
+
     def tokenize(self):
-        tokenized_train_sentences = []
-        for sentence in self.sentence_dict["train"]:
+        for i in range(len(self.dataset_dict[self.lang]["train"])):
+            seq = self.dataset_dict[self.lang]["train"][i]
             tokens = []
-            for word in sentence.split():
+            for word in seq.split():
                 tokens.append(self.vocab.index(word))
-            tokenized_train_sentences.append(tokens)
+            self.dataset_dict[self.lang]["train"][i] = tokens
 
-        tokenized_val_sentences = []
-        for sentence in self.sentence_dict["train"]:
+        for i in range(len(self.dataset_dict[self.lang]["val"])):
+            seq = self.dataset_dict[self.lang]["val"][i]
             tokens = []
-            for word in sentence.split():
+            for word in seq.split():
                 tokens.append(self.vocab.index(word))
-            tokenized_val_sentences.append(tokens)
-
-        self.sentence_dict["train"] = tokenized_train_sentences
-        self.sentence_dict["val"] = tokenized_val_sentences
+            self.dataset_dict[self.lang]["val"][i] = tokens
 
-    def pad(self, max_seq_len=None):
-        if max_seq_len is None:
-            max_seq_len = self.sentence_dict["max_seq_len"]
+    def pad(self, max_seq_len=const.MAX_SEQ_LEN):
 
-        padded_train_sentences = []
-        for sentence in self.sentence_dict["train"]:
-            padded_train_sentences.append(utils.pad_sequence(sentence, max_seq_len))
+        for i in range(len(self.dataset_dict[self.lang]["train"])):
+            self.dataset_dict[self.lang]["train"][i] = pad_sequence(
+                sequence=self.dataset_dict[self.lang]["train"][i], max_seq_len=max_seq_len)
 
-        padded_val_sentences = []
-        for sentence in self.sentence_dict["val"]:
-            padded_val_sentences.append(utils.pad_sequence(sentence, max_seq_len))
+        for i in range(len(self.dataset_dict[self.lang]["val"])):
+            self.dataset_dict[self.lang]["val"][i] = pad_sequence(sequence=self.dataset_dict[self.lang]["val"][i],
+                                                                  max_seq_len=self.max_seq_len)
 
-        self.sentence_dict["train"] = padded_train_sentences
-        self.sentence_dict["val"] = padded_val_sentences
+    def set_lang(self, lang):
+        self.lang = lang
+        self.max_seq_len = 0
 
     def get_dict(self):
-        return self.sentence_dict
+        return self.dataset_dict
 
 
 class Dataset:
@@ -81,18 +140,22 @@ class Dataset:
         self.dataset_dict = {}
 
     def pack(self):
-        sentence_object = Sentence()
+        seq_loader = SequenceLoader()
         for lang in self.langs:
-            sentence_object.pack(lang)
-            self.dataset_dict[lang] = sentence_object.get_dict()
+            seq_loader.set_lang(lang)
+            seq_loader.pack()
+            self.dataset_dict[lang] = seq_loader.get_dict()
 
     def process(self):
-        sentence_object = Sentence()
+        seq_processor = SequenceProcessor(self.dataset_dict)
         for lang in self.langs:
-            sentence_object.pack(lang)
-            sentence_object.tokenize()
-            sentence_object.pad()
-            self.dataset_dict[lang] = sentence_object.get_dict()
+            seq_processor.set_lang(lang)
+            seq_processor.remove_punctuation()
+            seq_processor.add_start_end_tags()
+            seq_processor.build_vocab()
+            seq_processor.tokenize()
+            seq_processor.pad()
+        self.dataset_dict = seq_processor.get_dict()
 
     def get_dict(self):
         return self.dataset_dict
@@ -101,6 +164,7 @@ class Dataset:
 if __name__ == "__main__":
     dataset_object = Dataset(const.langs)
     dataset_object.pack()
-    dataset_object.process()
     dataset_dict = dataset_object.get_dict()
     utils.save_dict("{}/dataset.txt".format(const.data_dir), dataset_dict)
+    dataset_object.process()
+    print(utils.load_dict("{}/dataset.txt".format(const.data_dir)))
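For reference, the rewritten data.py splits the pipeline into a SequenceLoader (shuffle and 80/20 split of raw lines) and a SequenceProcessor (strip punctuation, wrap each sequence in <SOS>/<EOS> tags, build a per-language vocabulary, tokenize, pad to const.MAX_SEQ_LEN). The standalone sketch below traces a single raw line through those steps; the sample sentence and the inlined logic are illustrative assumptions mirroring remove_punctuation_from_seq, add_start_end_tags_seq and pad_sequence, not code from the commit.

# Illustrative sketch, not part of data.py: one raw line through the new steps.
import string

PUNCT = string.punctuation + "৷-–—’‘৳…।"   # same punctuation set as remove_punctuation_from_seq
MAX_SEQ_LEN = 30                            # mirrors const.MAX_SEQ_LEN

raw = "I eat rice."                                                   # assumed sample line
clean = ' '.join(''.join(c for c in raw if c not in PUNCT).split())   # -> "I eat rice"
tagged = '<SOS> ' + clean + ' <EOS>'                                  # add_start_end_tags_seq

vocab = sorted(set(tagged.split()))                                   # toy per-language vocabulary
tokens = [vocab.index(w) for w in tagged.split()]                     # word -> index, as in tokenize()
padded = tokens[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(tokens))     # pad_sequence with padding_token=0

print(padded)  # [1, 2, 3, 4, 0, 0, ...] padded out to 30 entries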
src/pipes/utils.py CHANGED
@@ -2,15 +2,6 @@ import json
 import string
 
 
-def pad_sequence(sequence, max_length, padding_token=0):
-    padded_sequence = sequence[:max_length] + [padding_token] * (max_length - len(sequence))
-    return padded_sequence
-
-
-def add_start_end_tags(sentence):
-    return '<START> ' + sentence + ' <END>'
-
-
 def save_dict(file_path, data_dict, encoding='utf-8'):
     with open(file_path, "w", encoding=encoding) as f:
         json.dump(data_dict, f, ensure_ascii=False)
@@ -26,21 +17,3 @@ def read_file(file_path):
     with open(file_path, "r", encoding="utf-8") as f:
         sentences = f.readlines()
         return sentences
-
-
-def build_vocab(sentences):
-    vocab = set()
-    for sentence in sentences:
-        vocab.update(sentence.split())
-    return sorted(list(vocab))
-
-
-def remove_punctuation(sentence):
-    english_punctuations = string.punctuation
-    bangla_punctuations = "৷-–—’‘৳…।"
-    all_punctuations = english_punctuations + bangla_punctuations
-    cleaned_sentence = ''.join([char for char in sentence if char not in all_punctuations])
-    cleaned_sentence = cleaned_sentence.strip()
-    cleaned_sentence = ' '.join(cleaned_sentence.split())
-    return cleaned_sentence
-
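The helpers left in utils.py handle persistence: save_dict writes the processed dictionary as JSON and read_file loads raw sentence files. data.py's new __main__ also calls utils.load_dict, which is not shown in this diff; the sketch below assumes a json.load-based counterpart and uses a hypothetical path.

# Sketch (assumption, not from this diff): round-tripping the dataset dict.
import json

def save_dict(file_path, data_dict, encoding='utf-8'):   # as defined in utils.py
    with open(file_path, "w", encoding=encoding) as f:
        json.dump(data_dict, f, ensure_ascii=False)

def load_dict(file_path, encoding='utf-8'):              # assumed mirror of save_dict
    with open(file_path, "r", encoding=encoding) as f:
        return json.load(f)

demo = {"bn": {"train": [[1, 2, 3, 0]], "val": [[4, 5, 0, 0]], "count": 2}}
save_dict("dataset_demo.txt", demo)                      # hypothetical path
print(load_dict("dataset_demo.txt"))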