fawadrashid's picture
Update app.py
4eb34af verified
import gradio as gr
from transformers import pipeline
from transformers import BlipForQuestionAnswering
from transformers.utils import logging
logging.set_verbosity_error()
from transformers import AutoProcessor
# Checkpoint identifiers, named once so the VQA model and its processor are
# guaranteed to be loaded from the same repository.
_TTS_CHECKPOINT = "kakao-enterprise/vits-ljs"
_VQA_CHECKPOINT = "Salesforce/blip-vqa-base"

# Everything below is loaded once at import time and reused by every request.

# Text-to-speech pipeline used to narrate the answer text.
tts_pipe = pipeline("text-to-speech", model=_TTS_CHECKPOINT)

# Visual-question-answering model plus the processor that prepares its
# image/text inputs and decodes its generated token ids.
model = BlipForQuestionAnswering.from_pretrained(_VQA_CHECKPOINT)
processor = AutoProcessor.from_pretrained(_VQA_CHECKPOINT)
def get_pipeline_prediction(pil_image, question):
    """Answer a question about an image and return the answer as speech.

    The image and question are run through the module-level BLIP
    ``processor``/``model`` pair to generate a short text answer, which is
    then synthesized to audio with the module-level ``tts_pipe``.

    Args:
        pil_image: The image to ask about. ``None`` is rejected with a
            user-facing error instead of being forwarded to the processor.
        question: The question text to ask about the image.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, the form consumed by a
        ``gr.Audio`` output configured with ``type="numpy"``.

    Raises:
        gr.Error: If ``pil_image`` is ``None`` (no image was provided).
    """
    # Fail early with a readable message; otherwise the processor raises an
    # opaque exception when it is handed None instead of an image.
    if pil_image is None:
        raise gr.Error("Please upload an image before asking a question.")

    # Encode image + question, generate answer token ids, decode to text.
    inputs = processor(pil_image, question, return_tensors="pt")
    out = model.generate(**inputs)
    text = processor.decode(out[0], skip_special_tokens=True)

    narrated_text = tts_pipe(text)
    audio = narrated_text["audio"]
    # The waveform normally carries a leading batch axis, so take the first
    # (only) entry. NOTE(review): if the pipeline ever returns an already
    # 1-D array, indexing [0] would yield a single sample, so keep it whole.
    if getattr(audio, "ndim", None) != 1:
        audio = audio[0]
    return (narrated_text["sampling_rate"], audio)
# Input widgets, listed in the positional order of
# get_pipeline_prediction(pil_image, question).
image_input = gr.Image(label="Input image", type="pil")
question_input = gr.Textbox(label="Ask your question")

# Output widget: receives the (sampling_rate, waveform) tuple and starts
# playback automatically.
narration_output = gr.Audio(label="Narration", type="numpy", autoplay=True)

demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=[image_input, question_input],
    outputs=narration_output,
)

# Bind to all network interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)