Spaces:

cotxetj
/

swedish-to-speech-or-text

Runtime error

App Files Files Community

swedish-to-speech-or-text / app.py

cotxetj

Update app.py

5e0755e 8 months ago

raw

history blame

No virus

3.27 kB

	import torch
	import os
	from transformers import pipeline, VitsModel, VitsTokenizer
	import numpy as np
	# Install OpenAI Whisper from GitHub when the app starts. This has to run
	# before `import whisper` below.
	# NOTE(review): the exit status of the command is ignored, so a failed
	# install may only surface later as an ImportError — consider declaring the
	# dependency in the environment's requirements instead.
	os.system("pip install git+https://github.com/openai/whisper.git")
	import gradio as gr
	import whisper

	# Whisper "small" checkpoint used for speech recognition.
	# NOTE(review): the name `model` is reassigned further down in this file to
	# the VITS text-to-speech model.
	model = whisper.load_model("small")
	# Use the first CUDA GPU when one is available, otherwise the CPU.
	device = "cuda:0" if torch.cuda.is_available() else "cpu"

	# Keep a dedicated reference to the Whisper model. The module-level name
	# `model` is rebound to the VITS text-to-speech model further down, so
	# looking `model` up when a request arrives would no longer yield Whisper.
	_whisper_model = model


	def inference(audio):
	    """Transcribe an audio file with the Whisper model loaded at start-up.

	    Args:
	        audio: Location of the audio file to decode; it is handed straight
	            to ``whisper.load_audio``.

	    Returns:
	        The decoded text. The audio is padded or trimmed to Whisper's fixed
	        input window first, so longer recordings are truncated.
	    """
	    samples = whisper.pad_or_trim(whisper.load_audio(audio))
	    mel = whisper.log_mel_spectrogram(samples).to(_whisper_model.device)

	    # No language is set in the options, so whisper.decode picks it itself;
	    # the former separate detect_language() call only produced a value that
	    # was thrown away.
	    # fp16 is switched off — presumably because the model may run on CPU.
	    options = whisper.DecodingOptions(fp16=False)
	    result = whisper.decode(_whisper_model, mel, options)

	    print(result.text)
	    return result.text


	# Load Whisper-small as a Hugging Face speech-recognition pipeline on the
	# device selected above.
	# NOTE(review): no reachable code in the visible part of this file calls
	# `pipe`, so this looks like a second, unused copy of the speech model —
	# confirm before removing.
	pipe = pipeline("automatic-speech-recognition",
	model="openai/whisper-small",
	device=device
	)

	# Load the text-to-speech model checkpoint and its tokenizer.
	# NOTE(review): this rebinds the module-level name `model`, which was
	# assigned the Whisper model earlier in the file; any code that looks up
	# `model` after this point gets the VITS model.
	#model = VitsModel.from_pretrained("Matthijs/mms-tts-fra")
	#tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-fra")
	model = VitsModel.from_pretrained("facebook/mms-tts-fra")
	tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-fra")


	# Speech-to-text step used by both output modes
	def translate(audio):
	    """Return the text recognised in ``audio``.

	    This is a thin wrapper around ``inference``. The statements that used to
	    follow the ``return`` (a call to the ``pipe`` ASR pipeline) could never
	    execute and have been removed; the live behaviour is unchanged.

	    Args:
	        audio: Passed through to ``inference`` unchanged.

	    Returns:
	        Whatever ``inference`` returns for ``audio``.
	    """
	    return inference(audio)


	# Text -> waveform step of the speech pipeline
	def synthesise(text):
	    """Turn ``text`` into a speech waveform with the module-level TTS model.

	    The text is tokenised with the module-level ``tokenizer`` and the
	    resulting ids are run through ``model`` with gradient tracking disabled.

	    Args:
	        text: The text to speak.

	    Returns:
	        The first entry of the model output's ``audio`` field.
	    """
	    token_ids = tokenizer(text, return_tensors="pt")["input_ids"]
	    with torch.no_grad():
	        generated = model(token_ids)
	    return generated.audio[0]


	# Define the pipeline
	def speech_to_speech_translation(audio):
	    """Recognise the speech in ``audio`` and speak the resulting text.

	    Args:
	        audio: Passed to ``translate`` unchanged.

	    Returns:
	        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000
	        and ``samples`` is a NumPy ``int16`` array.
	    """
	    translated_text = translate(audio)
	    waveform = synthesise(translated_text).numpy()
	    # Clip to [-1, 1] before scaling: without it, a sample outside that
	    # range would exceed the int16 limits and wrap around in the cast
	    # instead of saturating.
	    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)
	    # NOTE(review): the rate is hard-coded; presumably it matches the TTS
	    # model's output rate — confirm against the model configuration.
	    return (16000, pcm)

	def predict(transType, language, audio, audio_mic = None):
	    """Gradio callback: convert the supplied audio to text or to speech.

	    Args:
	        transType: Requested output format, "Text" or "Audio".
	        language: Selected source language. Currently not used.
	        audio: File path of the uploaded clip, or None.
	        audio_mic: File path of the microphone recording, or None.

	    Returns:
	        A ``(text, audio)`` pair matching the two interface outputs: the
	        recognised text and None for "Text", or an empty string and the
	        synthesised ``(sample_rate, samples)`` tuple for "Audio".

	    Raises:
	        gr.Error: If no audio was supplied or no valid output format was
	            chosen, so the user sees a message instead of a bare failure.
	    """
	    print("debug1:", audio,"debug2", audio_mic)
	    # The uploaded file takes precedence; fall back to the recording.
	    source = audio or audio_mic
	    if not source:
	        raise gr.Error("Please upload or record an audio clip first.")
	    if transType == "Text":
	        return translate(source), None
	    if transType == "Audio":
	        return "", speech_to_speech_translation(source)
	    # Nothing (or something unexpected) selected: returning None here would
	    # not match the two declared outputs.
	    raise gr.Error("Please choose an output format (Text or Audio).")


	# Define the title etc
	title = "Swedish STSOT (Speech To Speech Or Text)"
	description="Use Whisper pretrained model to convert swedish audio to english (text or audio)"


	supportLangs = ["Swedish", "French (in training)"]
	transTypes = ["Text", "Audio"]

	#examples = [
	# ["Text", "Swedish", "./ex1.wav", None],
	# ["Audio", "Swedish", "./ex2.wav", None]
	#]

	examples =[]
	demo = gr.Interface(
	    fn=predict,
	    inputs=[
	        gr.Radio(label="Choose your output format", choices=transTypes),
	        gr.Radio(label="Choose a source language", choices=supportLangs, value="Swedish"),
	        # type="filepath": the recognition code ends in whisper.load_audio,
	        # which takes a file path, not an array of samples.
	        gr.Audio(label="Import an audio", sources=["upload"], type="filepath"),
	        gr.Audio(label="Record an audio", sources=["microphone"], type="filepath"),
	    ],
	    outputs=[
	        gr.Text(label="Text translation"),
	        gr.Audio(label="Audio translation", type="numpy"),
	    ],
	    title=title,
	    description=description,
	    article="",
	    examples=examples,
	)


	demo.launch()