# marvin / app.py
# Hugging Face file-page header (scraped): uploaded by StephaneBah,
# commit "init2" (dc7109c, verified), 2.23 kB, no virus detected.
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from diffusers import DiffusionPipeline
import torch
import accelerate
# Hugging Face Hub identifiers of the three models the app relies on.
translation_model_name = "google/madlad400-3b-mt"
transcription_model = "chrisjay/fonxlsr"
diffusion_model_name = "stabilityai/stable-diffusion-xl-base-1.0"


@st.cache_resource
def _load_models():
    """Load every model and pipeline once and share them across reruns.

    Streamlit re-executes the whole script on each user interaction, so
    loading at plain module level would reload all weights every time.
    ``st.cache_resource`` keeps the returned objects alive for the life of
    the server process.

    Returns:
        A tuple ``(translation_model, translation_tokenizer,
        translation_pipeline, transcription_pipeline, diffusion_pipeline)``.
    """
    # device_map must be given while the weights are loaded; passing it to
    # pipeline() next to an already-instantiated model does not place it.
    model = AutoModelForSeq2SeqLM.from_pretrained(
        translation_model_name, device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(translation_model_name)
    translator = pipeline("translation", model=model, tokenizer=tokenizer)

    # Here the model is given by name, so pipeline() loads it and can apply
    # device_map itself.
    transcriber = pipeline(
        "automatic-speech-recognition",
        model=transcription_model,
        device_map="auto",
    )

    # Fall back to CPU (and full precision, since half precision is meant
    # for GPU inference) when no CUDA device is available.
    use_cuda = torch.cuda.is_available()
    diffuser = DiffusionPipeline.from_pretrained(
        diffusion_model_name,
        torch_dtype=torch.float16 if use_cuda else torch.float32,
    )
    diffuser = diffuser.to("cuda" if use_cuda else "cpu")

    return model, tokenizer, translator, transcriber, diffuser


# Module-level names kept for the rest of the script.
(
    translation_model,
    translation_tokenizer,
    translation_pipeline,
    transcription_pipeline,
    diffusion_pipeline,
) = _load_models()
# Define the function for transcribing and translating audio in Fon
def transcribe_and_translate_audio_fon(audio_path, num_images=1):
    """Generate images from a spoken Fon audio clip.

    The clip is transcribed to Fon text, the text is translated to French,
    and the French sentence is used as the prompt for the diffusion model.

    Args:
        audio_path: Path to an audio file, the raw bytes of one, or a
            file-like object exposing ``getvalue()`` or ``read()`` (such as
            the object returned by ``st.file_uploader``).
        num_images: Number of images to generate for the prompt.

    Returns:
        The generated images, taken from the ``"images"`` entry of the
        diffusion pipeline output.
    """
    # The speech-recognition pipeline takes a path, bytes, an array or a
    # dict, but not a file handle, so file-like inputs are reduced to bytes.
    if hasattr(audio_path, "getvalue"):
        audio_path = audio_path.getvalue()
    elif hasattr(audio_path, "read"):
        audio_path = audio_path.read()

    # Transcribe the audio to Fon using the transcription pipeline
    transcription_fon = transcription_pipeline(audio_path)["text"]

    # MADLAD-400 selects the output language from a "<2xx>" token placed at
    # the start of the input text; "source_lang"/"target_lang" are not
    # parameters of the translation pipeline.
    translation_result = translation_pipeline(f"<2fr> {transcription_fon}")
    translation_fr = translation_result[0]["translation_text"]

    # The French sentence is the text prompt for image generation.
    images = diffusion_pipeline(
        translation_fr, num_images_per_prompt=num_images
    )["images"]
    return images
# Create a Streamlit app
st.title("Fon Audio to Image Translation")

# Upload audio file
audio_file = st.file_uploader("Upload an audio file", type=["wav"])

# Transcribe, translate and generate images
if audio_file is not None:
    # Pass the raw WAV bytes: the speech-recognition pipeline accepts a
    # path, bytes, an array or a dict, not the upload wrapper object itself.
    images = transcribe_and_translate_audio_fon(audio_file.getvalue())
    st.image(images[0])