Spaces:

cotxetj
/

swedish-to-speech-or-text

Runtime error

App Files Files Community

swedish-to-speech-or-text / app.py

cotxetj

Update app.py

4c84112 11 months ago

raw

history blame

No virus

3.42 kB

	import importlib
	import os
	import subprocess
	import sys

	import numpy as np
	import torch
	from transformers import pipeline, VitsModel, VitsTokenizer, SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

	# openai-whisper is installed on the fly only when it is missing.
	# The install goes through the running interpreter's own pip (so the package
	# lands in the environment that will import it) and a failed install raises
	# immediately instead of surfacing later as an unrelated ImportError.
	try:
	    import whisper
	except ImportError:
	    subprocess.check_call(
	        [
	            sys.executable,
	            "-m",
	            "pip",
	            "install",
	            "git+https://github.com/openai/whisper.git",
	        ]
	    )
	    # Make the freshly installed package visible to the import system.
	    importlib.invalidate_caches()
	    import whisper

	import gradio as gr

	# Whisper "small" checkpoint used by inference().
	model = whisper.load_model("small")
	# Torch device string: the first CUDA GPU when available, otherwise the CPU.
	device = "cuda:0" if torch.cuda.is_available() else "cpu"

	def inference(audio):
	    """Transcribe an audio input with the module-level Whisper ``model``.

	    Progress messages are printed after each stage, and the decoded text is
	    printed before being returned.

	    Args:
	        audio: Input forwarded to ``whisper.load_audio`` (presumably a file
	            path -- confirm against the caller).

	    Returns:
	        The ``text`` attribute of the Whisper decoding result.
	    """
	    samples = whisper.load_audio(audio)
	    print("loading finished")

	    fitted = whisper.pad_or_trim(samples)
	    print("audio trimed")

	    # The spectrogram is moved to the device the model lives on.
	    mel = whisper.log_mel_spectrogram(fitted).to(model.device)
	    print("spectro finished")

	    # Language detection is executed but its result is not used below.
	    _lang_tokens, _lang_probs = model.detect_language(mel)
	    print("lang detected")

	    decoding_options = whisper.DecodingOptions(fp16=False)
	    print("options decoded")

	    transcript = whisper.decode(model, mel, decoding_options).text
	    print(transcript)
	    return transcript


	# Speech-recognition pipeline used by translate(). It is placed on `device`
	# so that a CUDA GPU is actually used when one was detected; without the
	# argument the pipeline stays on the CPU.
	pipe = pipeline(model="Sleepyp00/whisper-small-Swedish", device=device)

	# VITS text-to-speech model and its tokenizer, used by synthesise().
	model2 = VitsModel.from_pretrained("facebook/mms-tts-eng")
	tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")


	# Translate an audio input into English text
	def translate(audio):
	    """Run the module-level ``pipe`` on ``audio`` with the "translate" task.

	    Args:
	        audio: Input forwarded unchanged to ``pipe``.

	    Returns:
	        The ``"text"`` entry of the pipeline output.
	    """
	    generation_options = {"task": "translate"}
	    result = pipe(
	        audio,
	        max_new_tokens=256,
	        generate_kwargs=generation_options,
	    )
	    return result["text"]


	# Turn text into a waveform with the text-to-speech model
	def synthesise(text):
	    """Generate speech for ``text`` with ``model2`` and ``tokenizer``.

	    Args:
	        text: Text to tokenize and feed to the model.

	    Returns:
	        The first element of the model output's ``audio`` attribute.
	    """
	    token_ids = tokenizer(text, return_tensors="pt")["input_ids"]

	    # Gradient tracking is disabled for the forward pass.
	    with torch.no_grad():
	        generated = model2(token_ids)

	    first_waveform = generated.audio[0]
	    return first_waveform


	# Full pipeline: audio -> translated text -> synthesised speech
	def speech_to_speech_translation(audio):
	    """Translate ``audio`` to text and synthesise that text as speech.

	    Args:
	        audio: Input forwarded to ``translate``.

	    Returns:
	        A two-item list ``[translated_text, (sampling_rate, samples)]`` where
	        ``samples`` is a NumPy ``int16`` array built from the synthesised
	        waveform and ``sampling_rate`` is read from the TTS model's config.
	    """
	    translated_text = translate(audio)
	    waveform = synthesise(translated_text).cpu().numpy()

	    # Clip before scaling: a sample outside [-1, 1] would exceed the int16
	    # range after multiplication and wrap around in the cast, which is
	    # audible as a loud click.
	    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

	    # Report the rate the model actually generates at instead of a
	    # hard-coded 16000, so a different TTS checkpoint is not played back at
	    # the wrong speed.
	    return [translated_text, (model2.config.sampling_rate, pcm)]

	def predict(transType, language, audio, audio_mic = None):
	    """Gradio callback: translate the supplied audio to text or to speech.

	    Args:
	        transType: Requested output format, ``"Text"`` or ``"Audio"``.
	        language: Selected source language; currently not used.
	        audio: Uploaded audio; takes precedence when set.
	        audio_mic: Recorded audio, used only when ``audio`` is empty.

	    Returns:
	        ``(text, None)`` for ``"Text"``; the result of
	        ``speech_to_speech_translation`` for ``"Audio"``.

	    Raises:
	        gr.Error: If neither audio input is provided, or if ``transType`` is
	            not one of the two supported formats.
	    """
	    print("debug1:", audio,"debug2", audio_mic)
	    if not audio and audio_mic:
	        audio = audio_mic

	    # Fail with a readable message instead of handing None to the model.
	    if not audio:
	        raise gr.Error("Please upload or record an audio clip first.")

	    if transType == "Text":
	        return translate(audio), None
	    if transType == "Audio":
	        return speech_to_speech_translation(audio)

	    # Falling through would return None, which does not match the two
	    # outputs the interface expects.
	    raise gr.Error("Please choose an output format (Text or Audio).")

	# Static texts shown at the top of the interface
	title = "Swedish STSOT (Speech To Speech Or Text)"
	description="Use Whisper pretrained model to convert swedish audio to english (text or audio)"


	# Choices offered by the two radio inputs
	supportLangs = ["Swedish", "French (in training)"]
	transTypes = ["Text", "Audio"]

	# No examples are shipped. Rows follow the input order, e.g.:
	# ["Text", "Swedish", "./ex1.wav", None],
	# ["Audio", "Swedish", "./ex2.wav", None]
	examples =[]

	# Interface wiring: the four inputs map positionally onto
	# predict(transType, language, audio, audio_mic), the two outputs onto its
	# (text, audio) return value.
	demo = gr.Interface(
	    fn=predict,
	    inputs=[
	        # A default is preselected so the callback never receives None for
	        # the output format on a first submit.
	        gr.Radio(label="Choose your output format", choices=transTypes, value=transTypes[0]),
	        gr.Radio(label="Choose a source language", choices=supportLangs, value="Swedish"),
	        gr.Audio(label="Import an audio", sources="upload", type="filepath"),
	        gr.Audio(label="Record an audio", sources="microphone", type="filepath"),
	    ],
	    outputs=[
	        gr.Text(label="Text translation"),
	        gr.Audio(label="Audio translation", type="numpy"),
	    ],
	    title=title,
	    description=description,
	    article="",
	    examples=examples,
	)


	demo.launch()