Spaces:

aiface
/

vietnamese_s2t

Runtime error

App Files Files Community

vietnamese_s2t / app.py

aiface

Update app.py

c220a06 8 months ago

raw history blame contribute delete

No virus

2.55 kB

	import os

	import subprocess
	import sys

	# Packages this script installs at start-up, before the third-party
	# imports that follow. Order is preserved from the original script.
	_RUNTIME_REQUIREMENTS = (
	    "transformers",
	    "https://github.com/kpu/kenlm/archive/master.zip",
	    "pyctcdecode",
	    "gradio",
	    "librosa",
	    "torch",
	)

	for _requirement in _RUNTIME_REQUIREMENTS:
	    # Invoke pip through the running interpreter so the packages land in
	    # the environment this script imports from; a bare `pip` found on PATH
	    # may belong to a different Python installation.
	    _completed = subprocess.run(
	        [sys.executable, "-m", "pip", "install", _requirement],
	        check=False,  # stay best-effort: one failed install must not abort the rest
	    )
	    if _completed.returncode != 0:
	        # Surface the failure instead of silently discarding the exit code.
	        print(
	            f"WARNING: pip install failed for {_requirement!r} "
	            f"(exit code {_completed.returncode})",
	            file=sys.stderr,
	        )

	import gradio as gr
	import librosa
	import torch

	from transformers import Wav2Vec2CTCTokenizer
	from transformers import Wav2Vec2FeatureExtractor
	from transformers import Wav2Vec2Processor
	from transformers import Wav2Vec2ForCTC
	from transformers import Wav2Vec2ProcessorWithLM

	repo_name = "aiface/vietnamese_s2t"

	# Hugging Face access token, read from the environment so that no credential
	# is stored in the source. When HF_TOKEN is unset this is None and the
	# `from_pretrained` calls below are made without an explicit token.
	# NOTE(review): the token that used to be hard-coded here was published with
	# the source and should be revoked.
	_hf_token = os.environ.get("HF_TOKEN")

	# Run on the GPU when one is available, otherwise on the CPU.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# All four objects are loaded from the same repository.
	processor = Wav2Vec2ProcessorWithLM.from_pretrained(repo_name, token=_hf_token)
	model = Wav2Vec2ForCTC.from_pretrained(repo_name, token=_hf_token).to(device)
	feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(repo_name, token=_hf_token)
	tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(repo_name, token=_hf_token)

	def process_audio_file(file):
	    """Load an audio file with ``librosa`` at a 16000 Hz target rate.

	    Args:
	        file: Audio source handed straight to ``librosa.load``.

	    Returns:
	        The sample data returned by ``librosa.load``; the sampling rate
	        it also returns is discarded.
	    """
	    waveform, _ = librosa.load(file, sr=16000)
	    return waveform

	def transcribe(file_mic, file_upload):
	    """Transcribe one audio input and return the decoded text.

	    The microphone recording takes precedence when both inputs are given.

	    Args:
	        file_mic: Audio recorded from the microphone, or None.
	        file_upload: Uploaded audio, or None.

	    Returns:
	        The first decoded transcription as a string, prefixed with a
	        warning line when both inputs were supplied; or an error message
	        string when neither input was supplied.
	    """
	    # Guard clause: nothing to transcribe.
	    if file_mic is None and file_upload is None:
	        return "ERROR: You have to either use the microphone or upload an audio file"

	    warn_output = ""
	    if file_mic is not None and file_upload is not None:
	        warn_output = "WARNING: You've uploaded an audio file and used the microphone. The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"

	    # Prefer the microphone recording; fall back to the upload.
	    file = file_mic if file_mic is not None else file_upload

	    input_values = process_audio_file(file)
	    input_dict = processor(input_values, sampling_rate=16_000, return_tensors="pt", padding=True)

	    # Inference only: disable gradient tracking so no autograd graph is
	    # built for the forward pass.
	    with torch.no_grad():
	        logits = model(input_dict.input_values.to(device)).logits

	    # The processor decodes from a NumPy array, so move the logits to host
	    # memory first.
	    pres = processor.batch_decode(logits.cpu().numpy()).text

	    return warn_output + str(pres[0])

	# Build the web UI: two audio inputs (microphone recording and file upload),
	# both delivered to `transcribe` as file paths, and a single text output.
	#
	# `gr.Audio(sources=[...])` replaces the `gr.inputs.Audio(source=..., optional=True)`
	# form, which was removed from Gradio in 4.0; this script installs gradio
	# unpinned, so the current API is used. An input left empty reaches
	# `transcribe` as None.
	# NOTE(review): the former `layout="horizontal"` argument is dropped on the
	# assumption that current Gradio no longer accepts it -- confirm against the
	# installed version.
	iface = gr.Interface(
	    fn=transcribe,
	    inputs=[
	        gr.Audio(sources=["microphone"], type="filepath"),
	        gr.Audio(sources=["upload"], type="filepath"),
	    ],
	    outputs="text",
	    theme="huggingface",
	    title="Speech to text MMS With Language Model",
	    description="Demo đơn giản speech to text",
	)
	iface.launch(share=True)