patharanor committed on
Commit
79be08a
1 Parent(s): 80c704e

feat: pretty number in thai word to numeric

Browse files
Files changed (3) hide show
  1. app.py +58 -10
  2. tests/test_thai_word.py +71 -0
  3. utils/thai_word.py +88 -0
app.py CHANGED
@@ -1,11 +1,17 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import pipeline
4
  import numpy as np
 
 
 
5
 
6
  MODEL_NAME = "biodatlab/whisper-th-medium-combined"
7
  DEVICE = 0 if torch.cuda.is_available() else "cpu"
 
8
 
 
 
 
9
  transcriber = pipeline(
10
  "automatic-speech-recognition",
11
  model=MODEL_NAME,
@@ -14,16 +20,30 @@ transcriber = pipeline(
14
  )
15
 
16
  def transcribe(audio):
17
- sr, y = audio
18
- y = y.astype(np.float32)
19
- y /= np.max(np.abs(y))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- return transcriber(
22
- {"sampling_rate": sr, "raw": y},
23
- generate_kwargs={"language":"<|th|>", "task":"transcribe"},
24
- return_timestamps=False,
25
- batch_size=16
26
- )["text"]
27
 
28
 
29
  demo = gr.Interface(
@@ -32,4 +52,32 @@ demo = gr.Interface(
32
  "text",
33
  )
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  demo.launch()
 
1
  import gradio as gr
2
  import torch
 
3
  import numpy as np
4
+ from transformers import pipeline
5
+ from utils.thai_word import ThaiWord
6
+ from pythainlp.tokenize import word_tokenize
7
 
8
  MODEL_NAME = "biodatlab/whisper-th-medium-combined"
9
  DEVICE = 0 if torch.cuda.is_available() else "cpu"
10
+ thw = ThaiWord()
11
 
12
+ # stride_length_s is a tuple of the left and right stride length.
13
+ # With only 1 number, both sides get the same stride, by default
14
+ # the stride_length on one side is 1/6th of the chunk_length_s
15
  transcriber = pipeline(
16
  "automatic-speech-recognition",
17
  model=MODEL_NAME,
 
20
  )
21
 
def transcribe(audio):
    """Transcribe a recording and render Thai number words as digits.

    Args:
        audio: ``(sampling_rate, samples)`` pair, unpacked below and handed
            to the ASR pipeline, or ``None`` when no recording was supplied.

    Returns:
        ``'pretty: ...\\n\\n original: ...'`` on success, a "please try
        again" prompt when there is nothing to transcribe, or an error
        message (including the exception text) if transcription fails.
    """
    retry_message = 'โปรดลองพูดอีกครั้ง'

    # Nothing was recorded: ask the user to retry instead of surfacing a
    # raw tuple-unpacking error.
    if audio is None:
        return retry_message

    try:
        sr, y = audio
        y = y.astype(np.float32)

        # The ASR pipeline expects single-channel input; down-mix when the
        # samples arrive as a 2-D array.
        # NOTE(review): assumes a (samples, channels) layout -- confirm.
        if y.ndim > 1:
            y = y.mean(axis=1)

        # Empty or completely silent input cannot be peak-normalised
        # (np.max raises on empty arrays; a zero peak would yield NaNs).
        if y.size == 0:
            return retry_message
        peak = np.max(np.abs(y))
        if peak == 0:
            return retry_message
        y /= peak

        text = transcriber(
            {"sampling_rate": sr, "raw": y},
            generate_kwargs={"language":"<|th|>", "task":"transcribe"},
            return_timestamps=False,
            batch_size=16
        )["text"]

        # An empty transcript is treated like a missing one.
        if not text:
            return retry_message

        # Tokenise so that number words can be grouped and converted.
        tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
        return f'pretty: {thw.pretty(tokens)}\n\n original: {text}'
    except Exception as e:
        # UI boundary: report the failure to the user rather than crash.
        return f'ไม่สามารถแปลงข้อความเสียงได้ โปรดลองอีกครั้ง\n\nพบข้อผิดพลาด: {str(e)}'
 
 
 
 
 
47
 
48
 
49
  demo = gr.Interface(
 
52
  "text",
53
  )
54
 
55
+ # def transcribe(stream, new_chunk):
56
+ # sr, y = new_chunk
57
+ # y = y.astype(np.float32)
58
+ # y /= np.max(np.abs(y))
59
+
60
+ # if stream is not None:
61
+ # stream = np.concatenate([stream, y])
62
+ # else:
63
+ # stream = y
64
+
65
+ # text = transcriber({"sampling_rate": sr, "raw": stream})["text"]
66
+ # if text is not None:
67
+ # # pretty text
68
+ # tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
69
+ # result = f'pretty: {thw.pretty(tokens)}\n\n original: {text}'
70
+ # else:
71
+ # result = 'โปรดลองพูดอีกครั้ง'
72
+
73
+ # return stream, result
74
+
75
+
76
+ # demo = gr.Interface(
77
+ # transcribe,
78
+ # ["state", gr.Audio(sources=["microphone"], streaming=True)],
79
+ # ["state", "text"],
80
+ # live=True,
81
+ # )
82
+
83
  demo.launch()
tests/test_thai_word.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from utils.thai_word import ThaiWord
3
+
4
class TestThaiWord(unittest.TestCase):
    """Checks that ThaiWord.pretty turns Thai number words into digits.

    NOTE(review): the expected strings here are bare digits (e.g. '1234'),
    whereas ThaiWord.words_to_number formats its result as ' 1,234 '
    (padded, with thousands separators) -- confirm which is intended.
    """

    def setUp(self) -> None:
        # A fresh converter for every test case.
        self.thw = ThaiWord()

    def tearDown(self) -> None:
        # Drop the reference once the test case has finished.
        self.thw = None

    def _assert_pretty(self, tokens, expected, message):
        """Assert that pretty(tokens) equals expected, reporting message."""
        self.assertEqual(self.thw.pretty(tokens), expected, message)

    def test_pretty_text_to_numeric(self):
        self._assert_pretty(
            ['ฮา', 'โหล', 'หนึ่ง', 'สอง', 'สาม', 'สี่'],
            'ฮาโหล1234',
            'should convert single word number in thai to numeric',
        )

    def test_pretty_long_words_to_numeric(self):
        # Two separate numbers inside one sentence.
        case1 = [
            'ปี', 'นี้', 'สอง', 'พัน', 'ห้า', 'ร้อย', 'หก', 'สิบ', 'เจ็ด', 'นะ',
            ' ',
            'ปี', 'หน้า', 'ก็', 'สอง', 'พัน', 'ห้า', 'ร้อย', 'หก', 'สิบ', 'แปด',
        ]
        self._assert_pretty(
            case1,
            'ปีนี้2567นะ ปีหน้าก็2568',
            'should convert full-words number in thai to numeric in long words (case1)',
        )

        # The tokenizer glued the last number word to the unit word.
        case2 = [
            'อืม', ' ', 'อยาก', 'ได้', 'ราย', 'ได้', 'ยี่', 'สิบ',
            'เอ็ดล้าน', 'แบบ', 'เข้า', 'บ้าง', ' ', 'ทำ', 'ยัง', 'ไง', 'ดี',
        ]
        self._assert_pretty(
            case2,
            'อืม อยากได้รายได้21000000แบบเข้าบ้าง ทำยังไงดี',
            'should convert full-words number in thai to numeric in long words (case2)',
        )

        # The tokenizer glued the first two number words together.
        case3 = [
            'อืม', ' ', 'อยาก', 'ได้', 'ราย', 'ได้', 'ยี่สิบ', 'เอ็ด', 'ล้าน',
            'แบบ', 'ร้าน', 'พร้อม', 'ทำ', 'ยัง', 'ไง', 'ดี',
        ]
        self._assert_pretty(
            case3,
            'อืม อยากได้รายได้21000000แบบร้านพร้อมทำยังไงดี',
            'should convert full-words number in thai to numeric in long words (case3)',
        )

    def test_pretty_word11_to_numeric(self):
        self._assert_pretty(
            ['ซื้อ', 'มา', 'สิบ', 'เอ็ด', 'บาท'],
            'ซื้อมา11บาท',
            'should correct specific numeric "สิบ" and "เอ็ด"',
        )
        self._assert_pretty(
            ['ซื้อ', 'มา', 'สิบเอ็ด', 'บาท'],
            'ซื้อมา11บาท',
            'should correct specific numeric "สิบเอ็ด"',
        )

    def test_pretty_word2x_to_numeric(self):
        self._assert_pretty(
            ['ซื้อ', 'มา', 'ยี่', 'สิบ', 'ห้า', 'บาท'],
            'ซื้อมา25บาท',
            'should correct specific numeric "ยี่" and "สิบ"',
        )
        self._assert_pretty(
            ['ซื้อ', 'มา', 'ยี่สิบ', 'ห้า', 'บาท'],
            'ซื้อมา25บาท',
            'should correct specific numeric "ยี่สิบ"',
        )
utils/thai_word.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pythainlp.util import text_to_num, text_to_arabic_digit
2
+
3
class ThaiWord:
    """Replace runs of Thai number words in a token list with digits.

    ``pretty`` walks a list of tokens, groups consecutive number words and
    hands each group to ``words_to_number``; all other tokens pass through
    unchanged.
    """

    def __init__(self) -> None:
        # Digit words one..nine.
        self.word_number = ['หนึ่ง','สอง','สาม','สี่','ห้า','หก','เจ็ด','แปด','เก้า']
        # Place-value ("unit") words: ten .. million.
        self.word_digit = ['สิบ','ร้อย','พัน','หมื่น','แสน','ล้าน']
        # Special forms: 'เอ็ด' (the "one" in e.g. สิบเอ็ด = 11) and
        # 'ยี่' (the "two" in ยี่สิบ = 20).
        self.word_number_specific = ['เอ็ด', 'ยี่']
        self.word_digit_specific = ['สิบ']

    def iscontains11(self, word) -> bool:
        """Return True if *word* starts or ends with 'เอ็ด'."""
        marker = self.word_number_specific[0]
        return word.endswith(marker) or word.startswith(marker)

    def iscontains2x(self, word) -> bool:
        """Return True if *word* starts with 'ยี่'."""
        return word.startswith(self.word_number_specific[1])

    def words_to_number(self, words) -> str:
        """Convert a group of Thai number words to a formatted number.

        Args:
            words: list of tokens collected by ``pretty``.

        Returns:
            ``' N '`` where N is the integer value with thousands
            separators. A lone place-value word, an empty group, or a
            group that cannot be converted is returned as its original
            text so that callers never see a conversion error.

        NOTE(review): tests/test_thai_word.py expects bare digits such as
        '1234' while this returns ' 1,234 ' -- confirm which is intended.
        """
        if not words:
            return ''

        # A place-value word on its own is kept as text.
        if len(words) == 1 and words[0] in self.word_digit:
            return words[0]

        joined = "".join(words)
        num = ''
        try:
            num = text_to_num(joined)
            if len(num) > 0:
                num = num[0]
        except Exception:
            # Fall back to converting the group word by word.
            for word in words:
                num = f'{num}{text_to_arabic_digit(word)}'

        try:
            return f' {int(num):,} '
        except (TypeError, ValueError):
            # Not actually numeric (e.g. a word that merely starts with
            # 'ยี่'): leave the original text untouched.
            return joined

    def pretty(self, words) -> str:
        """Join *words* into one string, converting number-word runs.

        Args:
            words: iterable of tokens.

        Returns:
            The concatenated tokens, with each run of number words
            replaced by the result of ``words_to_number``.
        """
        text = []
        number = []  # non-empty while inside a run of number words

        for word in words:
            if number:
                if self.is_number(word) or self.is_digit(word):
                    number.append(word)
                    continue
                # The run ended: emit it, then handle this word normally.
                text.append(self.words_to_number(number))
                number = []

            if self.is_start_number(word):
                number.append(word)
            else:
                text.append(word)

        # Emit a run that reaches the end of the input.
        if number:
            text.append(self.words_to_number(number))

        return ''.join(text)

    def is_start_number(self, word) -> bool:
        """Return True if *word* can begin a run of number words."""
        return (
            word in self.word_number
            or word in self.word_digit
            or self.iscontains2x(word)
            or self.iscontains11(word)
        )

    def is_digit(self, word) -> bool:
        """Return True if *word* is a place-value word."""
        return word in self.word_digit

    def is_number(self, word) -> bool:
        """Return True if *word* can continue a run as a digit word."""
        return (
            word in self.word_number
            or word in self.word_number_specific
            or self.iscontains11(word)
        )