|
from nemo.collections.asr.models import EncDecMultiTaskModel |
|
import gradio as gr |
|
import torch |
|
import json |
|
import numpy as np |
|
import soundfile as sf |
|
import tempfile |
|
from transformers import VitsTokenizer, VitsModel, set_seed |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load NVIDIA's Canary-1B multi-task speech model for ASR
# (downloads the checkpoint from the NGC/HF hub on first run).
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

# Switch the decoder to greedy search: beam size 1 trades a little accuracy
# for faster transcription in this interactive demo.
decode_cfg = canary_model.cfg.decoding

decode_cfg.beam.beam_size = 1

canary_model.change_decoding_strategy(decode_cfg)
|
|
|
|
|
import torch |
|
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline |
|
|
|
|
|
|
|
|
|
# Fix the torch RNG seed so LLM generation is reproducible across runs.
torch.random.manual_seed(0)

# Load the Phi-3-mini chat model on CPU.
# NOTE(review): trust_remote_code=True executes Python shipped in the model
# repo — acceptable for this demo, but worth flagging.
model = AutoModelForCausalLM.from_pretrained(

    "microsoft/Phi-3-mini-128k-instruct",

    device_map="cpu",

    torch_dtype="auto",

    trust_remote_code=True,

)

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

# NOTE(review): this module-level list is never used — transcribe_audio()
# builds its own local `messages`, so no chat history persists between calls.
messages = []

# Hugging Face text-generation pipeline wrapping the Phi-3 model/tokenizer.
pipe = pipeline(

    "text-generation",

    model=model,

    tokenizer=tokenizer,

)

# Shared generation settings for every LLM call.
generation_args = {

    "max_new_tokens": 500,

    "return_full_text": False,  # return only the newly generated reply, not the prompt

    "temperature": 0.0,  # effectively ignored: do_sample=False forces greedy decoding

    "do_sample": False,

}

# English text-to-speech: Meta's MMS VITS tokenizer and model.
tokenizer_vits = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")

model_vits = VitsModel.from_pretrained("facebook/mms-tts-eng")
|
|
|
|
|
def transcribe_audio(audio):
    """Run the full speech-to-speech pipeline: ASR -> LLM reply -> TTS.

    Transcribes the input recording with Canary, sends the transcript to the
    Phi-3 chat pipeline as a single-turn message, synthesizes the reply with
    the MMS VITS model, and returns the path of a WAV file containing the
    spoken response (which Gradio then plays back).

    Args:
        audio: Filesystem path to the uploaded/recorded audio file
            (the Gradio input component uses type="filepath").

    Returns:
        str: Path to a temporary WAV file holding the synthesized reply.
            The file is deliberately left on disk so Gradio can serve it.
    """
    import os

    audio_data, sample_rate = sf.read(audio)

    # Downmix multi-channel audio to mono by averaging channels; the ASR
    # model works on a single channel.
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)

    # Canary's transcribe() takes file paths, so write the (possibly
    # downmixed) signal to a scratch WAV file first.
    # NOTE(review): no resampling is performed — presumably the model expects
    # 16 kHz input; other sample rates may degrade accuracy. TODO: confirm.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
        temp_audio_path = temp_audio_file.name
    try:
        sf.write(temp_audio_path, audio_data, sample_rate)
        predicted_text = canary_model.transcribe(
            paths2audio_files=[temp_audio_path], batch_size=16
        )
    finally:
        # Bug fix: this intermediate file was previously never deleted,
        # leaking one temp WAV per request.
        os.unlink(temp_audio_path)

    # Single-turn chat: the transcript becomes the one user message.
    messages = [{"role": "user", "content": predicted_text[0]}]
    output_text = pipe(messages, **generation_args)

    # Synthesize speech for the generated reply; the fixed seed makes the
    # stochastic VITS vocoder output deterministic.
    inputs_vits = tokenizer_vits(text=output_text[0]["generated_text"], return_tensors="pt")
    set_seed(555)
    with torch.no_grad():
        outputs_vits = model_vits(**inputs_vits)
    waveform = outputs_vits.waveform[0]

    # The output file must outlive this function so Gradio can stream it to
    # the client — hence delete=False and no unlink here.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file_2:
        temp_audio_path_2 = temp_audio_file_2.name
    sf.write(temp_audio_path_2, waveform.numpy(), model_vits.config.sampling_rate)

    return temp_audio_path_2
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr |
|
|
|
|
|
|
|
|
|
|
|
|
|
# Wire up the Gradio UI: audio in (upload or microphone), synthesized audio out.
audio_input = gr.components.Audio(
    sources=["upload", "microphone"],
    type="filepath",
    label="Record Audio",
)
audio_output = gr.components.Audio(label="Audio Output")

interface = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs=audio_output,
)

# Start the web server (blocks until the app is shut down).
interface.launch()
|
|