Spaces:
Runtime error
Runtime error
patharanor
committed on
Commit
•
79be08a
1
Parent(s):
80c704e
feat: pretty number in thai word to numeric
Browse files- app.py +58 -10
- tests/test_thai_word.py +71 -0
- utils/thai_word.py +88 -0
app.py
CHANGED
@@ -1,11 +1,17 @@
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
-
from transformers import pipeline
|
4 |
import numpy as np
|
|
|
|
|
|
|
5 |
|
6 |
MODEL_NAME = "biodatlab/whisper-th-medium-combined"
|
7 |
DEVICE = 0 if torch.cuda.is_available() else "cpu"
|
|
|
8 |
|
|
|
|
|
|
|
9 |
transcriber = pipeline(
|
10 |
"automatic-speech-recognition",
|
11 |
model=MODEL_NAME,
|
@@ -14,16 +20,30 @@ transcriber = pipeline(
|
|
14 |
)
|
15 |
|
16 |
def transcribe(audio):
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
return
|
22 |
-
{"sampling_rate": sr, "raw": y},
|
23 |
-
generate_kwargs={"language":"<|th|>", "task":"transcribe"},
|
24 |
-
return_timestamps=False,
|
25 |
-
batch_size=16
|
26 |
-
)["text"]
|
27 |
|
28 |
|
29 |
demo = gr.Interface(
|
@@ -32,4 +52,32 @@ demo = gr.Interface(
|
|
32 |
"text",
|
33 |
)
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
demo.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import torch
|
|
|
3 |
import numpy as np
|
4 |
+
from transformers import pipeline
|
5 |
+
from utils.thai_word import ThaiWord
|
6 |
+
from pythainlp.tokenize import word_tokenize
|
7 |
|
8 |
MODEL_NAME = "biodatlab/whisper-th-medium-combined"
|
9 |
DEVICE = 0 if torch.cuda.is_available() else "cpu"
|
10 |
+
thw = ThaiWord()
|
11 |
|
12 |
+
# stride_length_s is a tuple of the left and right stride length.
|
13 |
+
# With only 1 number, both sides get the same stride, by default
|
14 |
+
# the stride_length on one side is 1/6th of the chunk_length_s
|
15 |
transcriber = pipeline(
|
16 |
"automatic-speech-recognition",
|
17 |
model=MODEL_NAME,
|
|
|
20 |
)
|
21 |
|
22 |
def transcribe(audio):
    """Transcribe a recorded clip to Thai text and pretty-print its numbers.

    Args:
        audio: ``(sampling_rate, samples)`` tuple, or ``None`` when nothing
            was recorded.  ``samples`` is a NumPy array.
            NOTE(review): assumes multi-channel input is shaped
            (samples, channels) -- confirm against the Gradio Audio docs.

    Returns:
        A string for the text output: the prettified and original
        transcription, a prompt to speak again, or an error message.
    """
    # Nothing was recorded: ask the user to try again instead of surfacing
    # an opaque tuple-unpacking error.
    if audio is None:
        return 'โปรดลองพูดอีกครั้ง'

    result = ''
    try:
        sr, y = audio
        y = y.astype(np.float32)

        # The ASR pipeline expects single-channel audio; down-mix if needed.
        if y.ndim > 1:
            y = y.mean(axis=1)

        # Peak-normalise, but skip pure silence: dividing by a zero peak
        # would fill the buffer with NaN.
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak

        text = transcriber(
            {"sampling_rate": sr, "raw": y},
            generate_kwargs={"language":"<|th|>", "task":"transcribe"},
            return_timestamps=False,
            batch_size=16
        )["text"]

        if text is not None:
            # pretty text
            tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
            print(tokens)
            result = f'pretty: {thw.pretty(tokens)}\n\n original: {text}'
        else:
            result = 'โปรดลองพูดอีกครั้ง'
    except Exception as e:
        # UI boundary: report the failure to the user rather than crashing.
        result = f'ไม่สามารถแปลงข้อความเสียงได้ โปรดลองอีกครั้ง\n\nพบข้อผิดพลาด: {str(e)}'

    return result
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
|
49 |
demo = gr.Interface(
|
|
|
52 |
"text",
|
53 |
)
|
54 |
|
55 |
+
# def transcribe(stream, new_chunk):
|
56 |
+
# sr, y = new_chunk
|
57 |
+
# y = y.astype(np.float32)
|
58 |
+
# y /= np.max(np.abs(y))
|
59 |
+
|
60 |
+
# if stream is not None:
|
61 |
+
# stream = np.concatenate([stream, y])
|
62 |
+
# else:
|
63 |
+
# stream = y
|
64 |
+
|
65 |
+
# text = transcriber({"sampling_rate": sr, "raw": stream})["text"]
|
66 |
+
# if text is not None:
|
67 |
+
# # pretty text
|
68 |
+
# tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
|
69 |
+
# result = f'pretty: {thw.pretty(tokens)}\n\n original: {text}'
|
70 |
+
# else:
|
71 |
+
# result = 'โปรดลองพูดอีกครั้ง'
|
72 |
+
|
73 |
+
# return stream, result
|
74 |
+
|
75 |
+
|
76 |
+
# demo = gr.Interface(
|
77 |
+
# transcribe,
|
78 |
+
# ["state", gr.Audio(sources=["microphone"], streaming=True)],
|
79 |
+
# ["state", "text"],
|
80 |
+
# live=True,
|
81 |
+
# )
|
82 |
+
|
83 |
demo.launch()
|
tests/test_thai_word.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import unittest
|
2 |
+
from utils.thai_word import ThaiWord
|
3 |
+
|
4 |
+
class TestThaiWord(unittest.TestCase):
    """Checks that ThaiWord.pretty rewrites Thai number words as digits.

    NOTE(review): every expected string below contains bare digits with no
    surrounding spaces or thousands separators -- confirm this matches the
    formatting that ThaiWord.pretty actually produces.
    """

    def setUp(self) -> None:
        self.thw = ThaiWord()

    def tearDown(self) -> None:
        self.thw = None

    def _check(self, cases) -> None:
        """Assert each (tokens, expected, message) case in order."""
        for tokens, expected, message in cases:
            self.assertEqual(self.thw.pretty(tokens), expected, message)

    def test_pretty_text_to_numeric(self):
        self._check([
            (
                ['ฮา','โหล','หนึ่ง','สอง','สาม','สี่'],
                'ฮาโหล1234',
                'should convert single word number in thai to numeric',
            ),
        ])

    def test_pretty_long_words_to_numeric(self):
        self._check([
            (
                [
                    'ปี','นี้','สอง','พัน','ห้า','ร้อย','หก','สิบ','เจ็ด','นะ',
                    ' ',
                    'ปี','หน้า','ก็','สอง','พัน','ห้า','ร้อย','หก','สิบ','แปด'
                ],
                'ปีนี้2567นะ ปีหน้าก็2568',
                'should convert full-words number in thai to numeric in long words (case1)',
            ),
            (
                [
                    'อืม', ' ', 'อยาก', 'ได้', 'ราย', 'ได้', 'ยี่', 'สิบ',
                    'เอ็ดล้าน', 'แบบ', 'เข้า', 'บ้าง', ' ', 'ทำ', 'ยัง', 'ไง', 'ดี'
                ],
                'อืม อยากได้รายได้21000000แบบเข้าบ้าง ทำยังไงดี',
                'should convert full-words number in thai to numeric in long words (case2)',
            ),
            (
                [
                    'อืม',' ','อยาก','ได้','ราย','ได้','ยี่สิบ','เอ็ด','ล้าน',
                    'แบบ', 'ร้าน','พร้อม','ทำ','ยัง','ไง','ดี'
                ],
                'อืม อยากได้รายได้21000000แบบร้านพร้อมทำยังไงดี',
                'should convert full-words number in thai to numeric in long words (case3)',
            ),
        ])

    def test_pretty_word11_to_numeric(self):
        self._check([
            (
                ['ซื้อ','มา','สิบ','เอ็ด','บาท'],
                'ซื้อมา11บาท',
                'should correct specific numeric "สิบ" and "เอ็ด"',
            ),
            (
                ['ซื้อ','มา','สิบเอ็ด','บาท'],
                'ซื้อมา11บาท',
                'should correct specific numeric "สิบเอ็ด"',
            ),
        ])

    def test_pretty_word2x_to_numeric(self):
        self._check([
            (
                ['ซื้อ','มา','ยี่','สิบ','ห้า','บาท'],
                'ซื้อมา25บาท',
                'should correct specific numeric "ยี่" and "สิบ"',
            ),
            (
                ['ซื้อ','มา','ยี่สิบ','ห้า','บาท'],
                'ซื้อมา25บาท',
                'should correct specific numeric "ยี่สิบ"',
            ),
        ])
|
utils/thai_word.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pythainlp.util import text_to_num, text_to_arabic_digit
|
2 |
+
|
3 |
+
class ThaiWord:
    """Rewrite runs of Thai number words in a token list as Arabic digits.

    ``pretty`` scans a list of word tokens, groups consecutive number and
    unit words into runs, and replaces each run with its numeric form.
    """

    def __init__(self) -> None:
        # Words for the values one to nine.
        self.word_number = ['หนึ่ง','สอง','สาม','สี่','ห้า','หก','เจ็ด','แปด','เก้า']
        # Place-value (unit) words: ten, hundred, thousand, ... million.
        self.word_digit = ['สิบ','ร้อย','พัน','หมื่น','แสน','ล้าน']
        # Irregular forms: the "-one" suffix and the "twenty" prefix.
        self.word_number_specific = ['เอ็ด', 'ยี่']
        self.word_digit_specific = ['สิบ']

    def iscontains11(self, word) -> bool:
        """Return True if ``word`` starts or ends with the "-one" form."""
        marker = self.word_number_specific[0]
        return word.endswith(marker) or word.startswith(marker)

    def iscontains2x(self, word) -> bool:
        """Return True if ``word`` starts with the "twenty" prefix."""
        return word.startswith(self.word_number_specific[1])

    def words_to_number(self, words) -> str:
        """Convert one run of number words to a formatted number string.

        Args:
            words: list of consecutive number/unit word tokens.

        Returns:
            The value with thousands separators, padded by one space on
            each side.  A lone unit word, or a run that cannot be read as
            an integer, is returned as its original text.
        """
        # A unit word on its own is left as text.  Returning it directly
        # avoids passing the token list to int().
        if len(words) == 1 and words[0] in self.word_digit:
            return words[0]

        joined = "".join(words)
        try:
            parsed = text_to_num(joined)
            candidate = parsed[0] if len(parsed) > 0 else ''
        except Exception:
            # Fall back to word-by-word conversion.
            candidate = "".join(text_to_arabic_digit(word) for word in words)

        try:
            return f' {int(candidate):,} '
        except (TypeError, ValueError):
            # Not an integer after all: keep the original text rather than
            # failing the whole sentence.
            return joined

    def pretty(self, words) -> str:
        """Join ``words`` into one string with number runs converted.

        Args:
            words: iterable of word tokens (strings).

        Returns:
            The concatenated text, each run of number words replaced by
            the output of ``words_to_number``.
        """
        number = []   # tokens of the run being collected; empty = no run
        text = []     # output pieces, in order

        for word in words:
            if number:
                if self.is_number(word) or self.is_digit(word):
                    number.append(word)
                    continue
                # The run ended at this word: emit it, then handle the
                # word below like any other.
                text.append(self.words_to_number(number))
                number = []

            if self.is_start_number(word):
                number.append(word)
            else:
                text.append(word)

        # Emit a run that reaches the end of the input.
        if number:
            text.append(self.words_to_number(number))

        return ''.join(text)

    def is_start_number(self, word) -> bool:
        """Return True if ``word`` may begin a run of number words."""
        return (
            word in self.word_number
            or word in self.word_digit
            or self.iscontains2x(word)
            or self.iscontains11(word)
        )

    def is_digit(self, word) -> bool:
        """Return True if ``word`` is a place-value (unit) word."""
        return word in self.word_digit

    def is_number(self, word) -> bool:
        """Return True if ``word`` may continue a run of number words.

        NOTE(review): a compound token that merely starts with the
        "twenty" prefix is not accepted here, so it ends the current run
        and starts a new one -- confirm that is intended.
        """
        return (
            word in self.word_number
            or word in self.word_number_specific
            or self.iscontains11(word)
        )
|