import gradio as gr

from transformers import pipeline
from transformers import AutoProcessor, BlipForQuestionAnswering
from transformers.utils import logging

# Show only errors from the transformers library.
logging.set_verbosity_error()

# Text-to-speech pipeline used to narrate the answer aloud.
tts_pipe = pipeline("text-to-speech",
                    model="kakao-enterprise/vits-ljs")

# BLIP visual-question-answering model and its matching processor.
model = BlipForQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base")
processor = AutoProcessor.from_pretrained(
    "Salesforce/blip-vqa-base")


def get_pipeline_prediction(pil_image, question):
    # Encode the image/question pair for the BLIP VQA model.
    inputs = processor(pil_image, question, return_tensors="pt")

    # Generate the answer tokens and decode them to plain text.
    out = model.generate(**inputs)
    text = processor.decode(out[0], skip_special_tokens=True)

    # Narrate the answer. The TTS pipeline returns a dict with an
    # "audio" array of shape (1, n_samples) and its "sampling_rate".
    narration = tts_pipe(text)

    # Gradio's numpy audio format expects (sample_rate, 1-D waveform).
    return narration["sampling_rate"], narration["audio"][0]

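# Optional smoke test (not part of the app): assumes a local file
# "sample.jpg", a hypothetical path used only for illustration.
# Uncomment to check the VQA + TTS chain without launching the UI.
#
# from PIL import Image
# sampling_rate, waveform = get_pipeline_prediction(
#     Image.open("sample.jpg"), "What is in the picture?")
# print(sampling_rate, waveform.shape)
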
# Gradio app: an image and a question in, a spoken answer out.
demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=[
        gr.Image(label="Input image", type="pil"),
        gr.Textbox(label="Ask your question"),
    ],
    outputs=gr.Audio(label="Narration", type="numpy", autoplay=True),
)

demo.launch(server_name="0.0.0.0", server_port=7860)