"""Gradio demo that transcribes speech with the ``aiface/vietnamese_s2t`` checkpoint.

Audio comes from the microphone or from an uploaded file, is loaded at 16 kHz,
passed through a Wav2Vec2 CTC model and decoded by ``Wav2Vec2ProcessorWithLM``.

Set the ``HF_TOKEN`` environment variable if the checkpoint requires
authentication; no access token is stored in this file.
"""

import os
import subprocess
import sys

# Dependencies are installed at start-up, before the third-party imports
# below.  Using ``sys.executable -m pip`` guarantees the packages land in the
# interpreter that is running this script, and avoids going through a shell.
_REQUIREMENTS = (
    "transformers",
    "https://github.com/kpu/kenlm/archive/master.zip",
    "pyctcdecode",
    "gradio",
    "librosa",
    "torch",
)
for _requirement in _REQUIREMENTS:
    # check=False: installation stays best-effort; a failed install surfaces
    # as an ImportError on the corresponding import below.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", _requirement],
        check=False,
    )

import gradio as gr  # noqa: E402
import librosa  # noqa: E402
import torch  # noqa: E402
from transformers import Wav2Vec2CTCTokenizer  # noqa: E402
from transformers import Wav2Vec2FeatureExtractor  # noqa: E402
from transformers import Wav2Vec2Processor  # noqa: E402,F401
from transformers import Wav2Vec2ForCTC  # noqa: E402
from transformers import Wav2Vec2ProcessorWithLM  # noqa: E402

# Sampling rate used both when loading audio and when building model inputs.
SAMPLE_RATE = 16_000

repo_name = "aiface/vietnamese_s2t"

# Never commit access tokens: read the token from the environment instead.
# ``None`` makes the Hugging Face loaders fall back to locally cached
# credentials (or anonymous access for public repositories).
HF_TOKEN = os.environ.get("HF_TOKEN") or None

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

processor = Wav2Vec2ProcessorWithLM.from_pretrained(repo_name, token=HF_TOKEN)
model = Wav2Vec2ForCTC.from_pretrained(repo_name, token=HF_TOKEN).to(device)
# Not used by the functions in this file; kept as module-level names for
# backward compatibility with anything that imports them.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(repo_name, token=HF_TOKEN)
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(repo_name, token=HF_TOKEN)


def process_audio_file(file):
    """Load the audio at path *file* with librosa at ``SAMPLE_RATE`` Hz.

    Args:
        file: Path of the audio file to load.

    Returns:
        The audio samples returned by ``librosa.load``.
    """
    data, _ = librosa.load(file, sr=SAMPLE_RATE)
    return data


def transcribe(file_mic, file_upload):
    """Transcribe the microphone recording or, failing that, the uploaded file.

    Args:
        file_mic: Path of the microphone recording, or ``None``.
        file_upload: Path of the uploaded audio file, or ``None``.

    Returns:
        The decoded text, prefixed with a warning when both inputs were
        supplied, or an error message when neither was.
    """
    if file_mic is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warn_output = ""
    if file_mic is not None and file_upload is not None:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
    # The microphone recording takes precedence over the upload.
    file = file_mic if file_mic is not None else file_upload

    input_values = process_audio_file(file)
    input_dict = processor(
        input_values,
        sampling_rate=SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = model(input_dict.input_values.to(device)).logits

    # The LM-backed processor decodes from the raw logits as a NumPy array.
    pres = processor.batch_decode(logits.cpu().numpy()).text
    return warn_output + str(pres[0])


# NOTE(review): ``gr.inputs.Audio``, ``layout`` and ``theme="huggingface"``
# belong to the legacy Gradio API, while the install step above is unpinned.
# Confirm the installed Gradio version still provides them, or pin it.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
        gr.inputs.Audio(source="upload", type='filepath', optional=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Speech to text MMS With Language Model",
    description="Demo đơn giản speech to text",
)
iface.launch(share=True)