Spaces:

patharanor
/

asr-th

Runtime error

App Files Files Community

patharanor committed on Feb 11

Commit

79be08a

•

1 Parent(s): 80c704e

feat: pretty number in thai word to numeric

Browse files

Files changed (3) hide show

app.py +58 -10
tests/test_thai_word.py +71 -0
utils/thai_word.py +88 -0

app.py CHANGED Viewed

@@ -1,11 +1,17 @@
 import gradio as gr
 import torch
-from transformers import pipeline
 import numpy as np
 MODEL_NAME = "biodatlab/whisper-th-medium-combined"
 DEVICE = 0 if torch.cuda.is_available() else "cpu"
 transcriber = pipeline(
     "automatic-speech-recognition",
     model=MODEL_NAME,
@@ -14,16 +20,30 @@ transcriber = pipeline(
 )
 def transcribe(audio):
-    sr, y = audio
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-    return transcriber(
-        {"sampling_rate": sr, "raw": y},
-        generate_kwargs={"language":"<|th|>", "task":"transcribe"},
-        return_timestamps=False,
-        batch_size=16
-    )["text"]
 demo = gr.Interface(
@@ -32,4 +52,32 @@ demo = gr.Interface(
     "text",
 )
 demo.launch()

 import gradio as gr
 import torch
 import numpy as np
+from transformers import pipeline
+from utils.thai_word import ThaiWord
+from pythainlp.tokenize import word_tokenize
 MODEL_NAME = "biodatlab/whisper-th-medium-combined"
 DEVICE = 0 if torch.cuda.is_available() else "cpu"
+thw = ThaiWord()
+# stride_length_s is a tuple of the left and right stride length.
+# With only 1 number, both sides get the same stride, by default
+# the stride_length on one side is 1/6th of the chunk_length_s
 transcriber = pipeline(
     "automatic-speech-recognition",
     model=MODEL_NAME,
 )
 def transcribe(audio):
     """Transcribe a recorded clip to Thai text and prettify spoken numbers.

     Args:
         audio: ``(sampling_rate, samples)`` pair as delivered by the audio
             input component, or ``None`` when nothing was recorded.

     Returns:
         A string holding the prettified and the original transcription,
         a "please try again" prompt when there is nothing to transcribe,
         or an error message (including the exception text) on failure.
     """
     retry_message = 'โปรดลองพูดอีกครั้ง'
     result = ''
     try:
         if audio is None:
             # Nothing was recorded/uploaded; ask the user to try again
             # instead of surfacing an unpacking error.
             return retry_message
         sr, y = audio
         y = y.astype(np.float32)
         if y.ndim > 1:
             # Multi-channel input: average the channels down to mono.
             y = y.mean(axis=1)
         if y.size == 0:
             return retry_message
         peak = np.max(np.abs(y))
         if peak > 0:
             # Peak-normalise; skipped for pure silence, where dividing by
             # zero would fill the buffer with NaN.
             y /= peak
         text = transcriber(
             {"sampling_rate": sr, "raw": y},
             generate_kwargs={"language":"<|th|>", "task":"transcribe"},
             return_timestamps=False,
             batch_size=16
         )["text"]
         if text is not None and text.strip():
             # pretty text
             tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
             print(tokens)
             result = f'pretty: {thw.pretty(tokens)}\n\n original: {text}'
         else:
             result = retry_message
     except Exception as e:
         # UI boundary: report the failure to the user rather than crash.
         result = f'ไม่สามารถแปลงข้อความเสียงได้ โปรดลองอีกครั้ง\n\nพบข้อผิดพลาด: {str(e)}'
     return result
 demo = gr.Interface(
     "text",
 )
+# def transcribe(stream, new_chunk):
+#     sr, y = new_chunk
+#     y = y.astype(np.float32)
+#     y /= np.max(np.abs(y))
+#     if stream is not None:
+#         stream = np.concatenate([stream, y])
+#     else:
+#         stream = y
+#     text = transcriber({"sampling_rate": sr, "raw": stream})["text"]
+#     if text is not None:
+#         # pretty text
+#         tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
+#         result = f'pretty: {thw.pretty(tokens)}\n\n original: {text}'
+#     else:
+#         result = 'โปรดลองพูดอีกครั้ง'
+#     return stream, result
+# demo = gr.Interface(
+#     transcribe,
+#     ["state", gr.Audio(sources=["microphone"], streaming=True)],
+#     ["state", "text"],
+#     live=True,
+# )
 demo.launch()

tests/test_thai_word.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import unittest
+from utils.thai_word import ThaiWord
class TestThaiWord(unittest.TestCase):
    """Checks that ``ThaiWord.pretty`` turns Thai number words into digits.

    NOTE(review): every expected string here has the digits glued to the
    surrounding text with no padding or thousands separators, whereas
    ``ThaiWord.words_to_number`` as committed formats with
    ``f' {int(num):,} '`` -- confirm which output format is intended.
    """

    def setUp(self) -> None:
        # A fresh converter for every test.
        self.thw = ThaiWord()

    def tearDown(self) -> None:
        self.thw = None

    def test_pretty_text_to_numeric(self):
        """Digit-by-digit number words collapse into one numeral."""
        tokens = ['ฮา','โหล','หนึ่ง','สอง','สาม','สี่']
        actual = self.thw.pretty(tokens)
        self.assertEqual(
            actual,
            'ฮาโหล1234',
            'should convert single word number in thai to numeric'
        )

    def test_pretty_long_words_to_numeric(self):
        """Full number phrases inside longer sentences are converted."""
        cases = (
            (
                [
                    'ปี','นี้','สอง','พัน','ห้า','ร้อย','หก','สิบ','เจ็ด','นะ',
                    ' ',
                    'ปี','หน้า','ก็','สอง','พัน','ห้า','ร้อย','หก','สิบ','แปด'
                ],
                'ปีนี้2567นะ ปีหน้าก็2568',
                'should convert full-words number in thai to numeric in long words (case1)',
            ),
            (
                [
                    'อืม', ' ', 'อยาก', 'ได้', 'ราย', 'ได้', 'ยี่', 'สิบ',
                    'เอ็ดล้าน', 'แบบ', 'เข้า', 'บ้าง', ' ', 'ทำ', 'ยัง', 'ไง', 'ดี'
                ],
                'อืม อยากได้รายได้21000000แบบเข้าบ้าง ทำยังไงดี',
                'should convert full-words number in thai to numeric in long words (case2)',
            ),
            (
                [
                    'อืม',' ','อยาก','ได้','ราย','ได้','ยี่สิบ','เอ็ด','ล้าน',
                    'แบบ', 'ร้าน','พร้อม','ทำ','ยัง','ไง','ดี'
                ],
                'อืม อยากได้รายได้21000000แบบร้านพร้อมทำยังไงดี',
                'should convert full-words number in thai to numeric in long words (case3)',
            ),
        )
        # Checked in order; the first failing case aborts the test.
        for tokens, expected, message in cases:
            self.assertEqual(self.thw.pretty(tokens), expected, message)

    def test_pretty_word11_to_numeric(self):
        """Eleven is recognised whether tokenised as one word or two."""
        cases = (
            (
                ['ซื้อ','มา','สิบ','เอ็ด','บาท'],
                'should correct specific numeric "สิบ" and "เอ็ด"',
            ),
            (
                ['ซื้อ','มา','สิบเอ็ด','บาท'],
                'should correct specific numeric "สิบเอ็ด"',
            ),
        )
        for tokens, message in cases:
            self.assertEqual(self.thw.pretty(tokens), 'ซื้อมา11บาท', message)

    def test_pretty_word2x_to_numeric(self):
        """Twenty-something is recognised for both tokenisations."""
        cases = (
            (
                ['ซื้อ','มา','ยี่','สิบ','ห้า','บาท'],
                'should correct specific numeric "ยี่" and "สิบ"',
            ),
            (
                ['ซื้อ','มา','ยี่สิบ','ห้า','บาท'],
                'should correct specific numeric "ยี่สิบ"',
            ),
        )
        for tokens, message in cases:
            self.assertEqual(self.thw.pretty(tokens), 'ซื้อมา25บาท', message)

utils/thai_word.py ADDED Viewed

	@@ -0,0 +1,88 @@

+from pythainlp.util import text_to_num, text_to_arabic_digit
class ThaiWord:
    """Rewrite runs of Thai number words in a token list as Arabic numerals."""

    def __init__(self) -> None:
        # Words for the digits one through nine.
        self.word_number = ['หนึ่ง','สอง','สาม','สี่','ห้า','หก','เจ็ด','แปด','เก้า']
        # Place-value unit words (ten, hundred, thousand, ... million).
        self.word_digit = ['สิบ','ร้อย','พัน','หมื่น','แสน','ล้าน']
        # Irregular forms: the "one" used in 11/21/... and the "two" of 2x.
        self.word_number_specific = ['เอ็ด', 'ยี่']
        self.word_digit_specific = ['สิบ']

    def iscontains11(self, word) -> bool:
        """Return True if *word* starts or ends with the irregular "one"."""
        marker = self.word_number_specific[0]
        return word.endswith(marker) or word.startswith(marker)

    def iscontains2x(self, word) -> bool:
        """Return True if *word* starts with the irregular "two" of twenty."""
        return word.startswith(self.word_number_specific[1])

    def words_to_number(self, words) -> str:
        """Convert a run of number words into a formatted numeral.

        Args:
            words: consecutive tokens that were classified as number words.

        Returns:
            The value formatted with thousands separators and padded with
            one space on each side. A lone unit word (e.g. just "ten"), an
            empty run, or a run that cannot be parsed as an integer is
            returned as its original text instead of raising.
        """
        if not words:
            return ''
        original = ''.join(words)
        if len(words) == 1 and words[0] in self.word_digit:
            # return text if the word is unit
            return original
        try:
            parsed = text_to_num(original)
            digits = parsed[0] if len(parsed) > 0 else ''
        except Exception:
            # Fall back to translating the run one digit word at a time.
            digits = ''.join(text_to_arabic_digit(word) for word in words)
        try:
            value = int(digits)
        except (TypeError, ValueError):
            # Not a number after all (e.g. an ordinary word that merely
            # starts like one); keep the text untouched.
            return original
        return f' {value:,} '

    def pretty(self, words) -> str:
        """Join *words* into one string, replacing number-word runs.

        Args:
            words: iterable of string tokens.

        Returns:
            The concatenated text with each run of number words replaced by
            the result of :meth:`words_to_number`.
        """
        pieces = []
        pending = []  # number words collected for the current run
        for word in words:
            if pending and (self.is_number(word) or self.is_digit(word)):
                pending.append(word)
                continue
            if pending:
                # The run ended on the previous token; flush it first.
                pieces.append(self.words_to_number(pending))
                pending = []
            if self.is_start_number(word):
                pending.append(word)
            else:
                pieces.append(word)
        if pending:
            pieces.append(self.words_to_number(pending))
        return ''.join(pieces)

    def is_start_number(self, word) -> bool:
        """Return True if *word* can open a run of number words."""
        return (
            word in self.word_number
            or word in self.word_digit
            or self.iscontains2x(word)
            or self.iscontains11(word)
        )

    def is_digit(self, word) -> bool:
        """Return True if *word* is a place-value unit word."""
        return word in self.word_digit

    def is_number(self, word) -> bool:
        """Return True if *word* is a digit word or an irregular form."""
        return (
            word in self.word_number
            or word in self.word_number_specific
            or self.iscontains11(word)
        )