"""Gradio demo: Thai speech-to-text using a Whisper ASR pipeline.

The microphone recording is transcribed with the Hugging Face
``automatic-speech-recognition`` pipeline, tokenized with PyThaiNLP, and the
tokens are passed through ``ThaiWord.pretty`` for display next to the raw
transcription.
"""

from collections import deque
from copy import deepcopy

import gradio as gr
import numpy as np
import torch
from pythainlp.tokenize import word_tokenize
from transformers import pipeline

from utils.thai_word import ThaiWord

MODEL_NAME = "biodatlab/whisper-th-medium-combined"
DEVICE = 0 if torch.cuda.is_available() else "cpu"

# Message shown when there is nothing usable to transcribe.
_RETRY_MESSAGE = 'โปรดลองพูดอีกครั้ง'

thw = ThaiWord()

# stride_length_s is a tuple of the left and right stride length.
# With only 1 number, both sides get the same stride, by default
# the stride_length on one side is 1/6th of the chunk_length_s
transcriber = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=DEVICE,
)


def _prepare_waveform(samples):
    """Return ``samples`` as a mono float32 waveform with peak amplitude 1.0.

    Returns ``None`` when the recording is empty or completely silent, since
    peak normalization would otherwise divide by zero and produce NaNs.
    """
    # Assumes multi-channel input is laid out as (samples, channels), as in
    # the Gradio ASR guide; down-mix because the pipeline takes one waveform.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    samples = samples.astype(np.float32)
    if samples.size == 0:
        return None
    peak = np.max(np.abs(samples))
    # ``not peak > 0`` also rejects a NaN peak.
    if not peak > 0:
        return None
    return samples / peak


def transcribe(audio) -> str:
    """Transcribe a Gradio microphone recording into Thai text.

    Args:
        audio: ``(sampling_rate, samples)`` tuple as delivered by
            ``gr.Audio``, or ``None`` when nothing was recorded.

    Returns:
        A string containing the prettified and the original transcription,
        a prompt to speak again when there is nothing to transcribe, or an
        error message if transcription fails.
    """
    if audio is None:
        return _RETRY_MESSAGE

    # This is the UI boundary: any failure is reported to the user as text
    # rather than crashing the Gradio request.
    try:
        sr, y = audio
        waveform = _prepare_waveform(y)
        if waveform is None:
            return _RETRY_MESSAGE

        text = transcriber(
            {"sampling_rate": sr, "raw": waveform},
            generate_kwargs={"language": "<|th|>", "task": "transcribe"},
            return_timestamps=False,
            batch_size=16,
        )["text"]

        # An empty or whitespace-only transcription carries nothing to format.
        if text is None or not text.strip():
            return _RETRY_MESSAGE

        # pretty text
        tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
        print(tokens)
        # Hand ``pretty`` its own copy so ``tokens`` is left untouched.
        return f'pretty: {thw.pretty(deque(deepcopy(tokens)))}\n\n original: {text}'
    except Exception as e:
        return f'ไม่สามารถแปลงข้อความเสียงได้ โปรดลองอีกครั้ง\n\nพบข้อผิดพลาด: {str(e)}'


demo = gr.Interface(
    transcribe,
    gr.Audio(sources=["microphone"]),
    "text",
)

demo.launch()