cdactvm committed on
Commit
9fc1cf9
·
verified ·
1 Parent(s): b4d2c26

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +234 -1
app.py CHANGED
@@ -15,6 +15,7 @@ def transcribe_odiya_eng(speech):
15
  trn = Transliterator(source='ori', target='eng', build_lookup=True)
16
  text = p1(speech)["text"]
17
  text=trn.transform(text)
 
18
  return text
19
 
20
  def sel_lng(lng,mic=None, file=None):
@@ -28,9 +29,241 @@ def sel_lng(lng,mic=None, file=None):
28
  return transcribe_odiya(audio)
29
  elif (lng=="Odiya-trans"):
30
  return transcribe_odiya_eng(audio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  demo=gr.Interface(
35
  fn=sel_lng,
36
 
 
15
  trn = Transliterator(source='ori', target='eng', build_lookup=True)
16
  text = p1(speech)["text"]
17
  text=trn.transform(text)
18
+ text = master_function(text)
19
  return text
20
 
21
  def sel_lng(lng,mic=None, file=None):
 
29
  return transcribe_odiya(audio)
30
  elif (lng=="Odiya-trans"):
31
  return transcribe_odiya_eng(audio)
32
+
33
+ #####################################################
34
+
35
+
36
def soundex(word):
    """Return a 4-character Soundex-style code for *word*.

    Non-alphabetic characters are dropped after upper-casing. The first
    remaining letter is kept verbatim; each following letter except H and W
    is mapped to a digit (unmapped letters become '0'), consecutive repeated
    digits are collapsed, zeros are removed, and the result is padded with
    zeros / truncated to four characters.

    Returns None when *word* contains no alphabetic characters.
    """
    letters = [ch for ch in word.upper() if ch.isalpha()]
    if not letters:
        return None

    # Consonant groups sharing one digit; letters absent here code as '0'.
    groups = {
        '1': 'BFPV',
        '2': 'CGJKQSXZ',
        '3': 'DT',
        '4': 'L',
        '5': 'MN',
        '6': 'R',
    }
    digit_for = {
        letter: digit
        for digit, members in groups.items()
        for letter in members
    }

    head = letters[0]
    digits = [digit_for.get(ch, '0') for ch in letters[1:] if ch not in ('H', 'W')]

    # Keep a digit only when it differs from the symbol immediately before it.
    # The first digit is compared with the leading letter itself, so it is
    # always kept.
    collapsed = [cur for prev, cur in zip([head] + digits, digits) if cur != prev]

    significant = ''.join(d for d in collapsed if d != '0')
    return (head + significant + '000')[:4]
57
+
58
+ # Convert number words (given as Soundex codes) to digits.
59
+
60
def is_number(x):
    """Return True if *x* can be converted to a float.

    For strings, commas are removed first so values such as "1,000" are
    accepted. Anything that ``float()`` rejects yields False.
    """
    if isinstance(x, str):
        x = x.replace(',', '')
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number"; a bare except would
        # also swallow KeyboardInterrupt / SystemExit.
        return False
    return True
68
+
69
def text2int(textnum, numwords=None):
    """Replace number words in *textnum* with digits and return the new string.

    Number words are recognised by the Soundex-style codes listed in the
    tables below (e.g. 'O500' for "one"), by a few literal ordinal words
    ("first", "second", ...) and by integer literals such as "12" or "1,000".
    Consecutive unit words are emitted as adjacent digits ("O500 T000" ->
    "12"); scale codes multiply the running value ("T000 H536" -> "200").
    Tokens that are not numbers are passed through unchanged, each followed
    by a single space.

    Args:
        textnum: Whitespace-separated tokens; hyphens are treated as spaces.
        numwords: Optional mapping ``token -> (scale, increment)``. When it is
            omitted or empty, the built-in table is used (an empty mapping
            passed by the caller is filled in place).

    Returns:
        The converted string.
    """
    # Soundex-style codes for 0..19, indexed by value.
    units = ['Z600', 'O500', 'T000', 'T600', 'F600', 'F100', 'S220', 'S150', 'E300', 'N500',
             'T500', 'E415', 'T410', 'T635', 'F635', 'F135', 'S235', 'S153', 'E235', 'N535']
    # Codes for the multiples of ten, indexed by value // 10.
    tens = ['', '', 'T537', 'T637', 'F637', 'F137', 'S230', 'S153', 'E230', 'N530']
    # Scale codes: index 0 is 100, then 10 ** (3 * index).
    # NOTE(review): 'C600' therefore scales by 10**9 - confirm that is intended.
    scales = ['H536', 'T253', 'M450', 'C600']
    # Ordinals map straight to their integer value; they are added to the
    # running total, so they must be numbers rather than Soundex strings.
    ordinal_words = {'oh': 0, 'first': 1, 'second': 2, 'third': 3, 'fourth': 4,
                     'fifth': 5, 'sixth': 6, 'seventh': 7, 'eighth': 8,
                     'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    # A fresh dict per call avoids state shared through a mutable default.
    if numwords is None:
        numwords = {}
    if not numwords:
        numwords['and'] = (1, 0)
        for idx, code in enumerate(units):
            numwords[code] = (1, idx)
        # Tens are written after units, so a code present in both lists
        # ('S153') resolves to the tens value, as before.
        for idx, code in enumerate(tens):
            if code:
                numwords[code] = (1, idx * 10)
        for idx, code in enumerate(scales):
            numwords[code] = (10 ** (idx * 3 or 2), 0)

    def parse_int(token):
        """Return *token* as an int (commas ignored), or None if it is not one."""
        try:
            return int(token.replace(',', ''))
        except ValueError:
            return None

    def is_numword(token):
        return parse_int(token) is not None or token in numwords

    def from_numword(token):
        literal = parse_int(token)
        if literal is not None:
            # Scale 0 makes a literal replace the running value.
            return 0, literal
        return numwords[token]

    textnum = textnum.replace('-', ' ')

    current = result = 0
    curstring = ''
    onnumber = False
    lastunit = False
    lastscale = False

    for word in textnum.split():
        if word in ordinal_words:
            current += ordinal_words[word]
            onnumber = True
            lastunit = False
            lastscale = False
            continue

        # Try the word with its ordinal suffix removed ("5th" -> "5"), but
        # only use that form when it is actually a number; otherwise keep the
        # original so ordinary words such as "with" are not truncated.
        stripped = word
        for ending, replacement in ordinal_endings:
            if stripped.endswith(ending):
                stripped = "%s%s" % (stripped[:-len(ending)], replacement)
        if is_numword(stripped):
            word = stripped

        if (not is_numword(word)) or (word == 'and' and not lastscale):
            if onnumber:
                # Flush the number being built before the plain word.
                curstring += repr(result + current) + " "
            curstring += word + " "
            result = current = 0
            onnumber = False
            lastunit = False
            lastscale = False
        else:
            scale, increment = from_numword(word)
            onnumber = True

            if lastunit and (word not in scales):
                # A unit directly after a unit is a digit sequence (e.g. a
                # phone number): emit the previous digit with no separator.
                curstring += repr(result + current)
                result = current = 0

            if scale > 1:
                current = max(1, current)

            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0

            lastscale = word in scales
            lastunit = (not lastscale) and word in units

    if onnumber:
        curstring += repr(result + current)

    return curstring
163
+
164
+ # Replace misspelled number words with their correct spellings.
165
def replace_words(sentence):
    """Return *sentence* with known misspelled number words corrected.

    Each entry below is a whole-word regular expression and its replacement;
    the substitutions are applied one after another in the order listed.
    """
    corrections = {
        r'\bjiro\b': 'zero',
        r'\bjero\b': 'zero',
        r'\bnn\b': 'one',
        r'\bn\b': 'one',
        r'\bna\b': 'one',
        r'\btu\b': 'two',
        r'\btoo\b': 'two',
        r'\bthiri\b': 'three',
        r'\bfor\b': 'four',
        r'\bfore\b': 'four',
        r'\bfib\b': 'five',
        r'\bdublseven\b': 'double seven',
        r'\bdubalathri\b': 'double three',
        r'\bnineeit\b': 'nine eight',
        r'\bfipeit\b': 'five eight',
        r'\bdubal\b': 'double',
        r'\bsevenatu\b': 'seven two',
    }
    fixed = sentence
    for wrong, right in corrections.items():
        fixed = re.sub(wrong, right, fixed)
    return fixed
190
+
191
+ # Split a sentence into the text before the numbers, the number words, and the remaining text.
192
def split_sentence(sentence):
    """Split *sentence* into (text before numbers, number words, other text).

    A token counts as a number word when its lower-cased form is in the
    vocabulary below. Every number word is collected into the middle part,
    wherever it occurs. Other tokens go to the first part until the first
    number word has been seen, and to the last part afterwards.

    Returns:
        A 3-tuple of space-joined strings.
    """
    number_vocab = frozenset((
        "zero", "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty",
        "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
        "hundred", "thousand", "million",
        # Spellings that also appear in the transcribed text.
        "on", "na", "n", "tu", "to", "too", "tree", "for", "fib", "eit",
        "sics", "thri", "eittu", "dubal", "sikas", "tri", "double",
    ))

    before, number_tokens, after = [], [], []
    seen_number = False
    for tok in sentence.split():
        if tok.lower() in number_vocab:
            seen_number = True
            number_tokens.append(tok)
        elif seen_number:
            after.append(tok)
        else:
            before.append(tok)

    return ' '.join(before), ' '.join(number_tokens), ' '.join(after)
224
+
225
+ # Expand "double" followed by a word into that word repeated twice.
226
def process_doubles(sentence):
    """Expand "double X" (or "dubal X") in *sentence* into "X X".

    The marker word is dropped and the token after it is emitted twice. A
    marker that is the last token has nothing to repeat and is kept as is.
    """
    words = iter(sentence.split())
    expanded = []
    for current in words:
        if current not in ("double", "dubal"):
            expanded.append(current)
            continue
        # Consume the token after the marker so it is not processed again.
        following = next(words, None)
        if following is None:
            expanded.append(current)
        else:
            expanded.extend((following, following))
    return ' '.join(expanded)
247
+
248
+ # Concatenate text and numbers and form a single sentence.
249
def concatenate_sentences(sentence1, numbers, sentence3):
    """Join the three parts into one sentence separated by single spaces.

    Each part is converted to text and stripped of surrounding whitespace;
    parts that end up empty are skipped, so a missing part does not leave
    leading, trailing or doubled spaces in the result.
    """
    pieces = (str(part).strip() for part in (sentence1, numbers, sentence3))
    return ' '.join(piece for piece in pieces if piece)
252
+
253
+ # define a master function to run all the above functions.
254
def master_function(initial_input):
    """Run the full number-normalisation pipeline on *initial_input*.

    Steps, in order: replace_words, split_sentence, process_doubles on the
    number words, soundex on each resulting token, text2int on the joined
    codes, and finally concatenate_sentences to reassemble the sentence.
    """
    corrected = replace_words(initial_input)
    leading, number_words, trailing = split_sentence(corrected)
    expanded = process_doubles(number_words)
    codes = " ".join([soundex(token) for token in expanded.strip().split()])
    digits = text2int(codes)
    return concatenate_sentences(leading, digits, trailing)
265
+
266
+ ######################################################
267
  demo=gr.Interface(
268
  fn=sel_lng,
269