from math import log2
import os

import gradio as gr
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Reference pitch: A4 = 440 Hz; C0 lies 4.75 octaves (57 semitones) below it.
A4 = 440
C0 = A4 * 2 ** -4.75
NOTE_NAMES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

|
|
def get_pitch(freq): |
|
h = round(12 * log2(freq / C0)) |
|
n = h % 12 |
|
return name[n] |
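
# Sanity check (hypothetical usage; values follow from the formula above):
#   get_pitch(440.0)   -> "A"   (A4 by definition)
#   get_pitch(261.63)  -> "C"   (middle C)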

# Build the ASR pipeline: Paraformer-large recognizer plus an FSMN VAD model
# and a CT-Transformer punctuation model, all loaded from local snapshots.
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='./models_from_modelscope/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
    vad_model='./models_from_modelscope/speech_fsmn_vad_zh-cn-16k-common-pytorch',
    punc_model='./models_from_modelscope/punc_ct-transformer_cn-en-common-vocab471067-large',
)
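
# If the local snapshots are missing, they can be fetched first (a sketch; the
# ModelScope repo ids are assumed to mirror the directory names above):
#   from modelscope import snapshot_download
#   snapshot_download(
#       'iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
#       cache_dir='./models_from_modelscope')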

def ASR(audio):
    """Transcribe the audio file at the given path and return its text."""
    # Ask the pipeline to omit word-level timestamps from the result.
    param_dict = {'use_timestamp': False}
    rec_result = inference_pipeline(input=audio, params=param_dict)
    return rec_result[0]['text']
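
# Example (assumes test1.mp3 sits next to this script):
#   print(ASR("test1.mp3"))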

demo = gr.Interface(
    fn=ASR,
    inputs=gr.Audio(type="filepath"),
    outputs='text',
    allow_flagging='auto',
    examples=[
        [os.path.join(os.path.dirname(__file__), "test1.mp3")],
    ],
)

if __name__ == "__main__":
    # share=True also serves the demo through a temporary public Gradio link.
    demo.launch(share=True)