import gradio as gr
from transformers import AutoProcessor, BlipForQuestionAnswering, pipeline
from transformers.utils import logging

# Silence transformers' progress/info chatter; only errors are shown.
logging.set_verbosity_error()

# NOTE(review): od_pipe is created but never used anywhere in this file.
# It downloads and holds a full DETR model, adding startup time and memory —
# confirm it is needed before keeping it.
od_pipe = pipeline("object-detection", "facebook/detr-resnet-50")

# Text-to-speech pipeline used to narrate the VQA answer.
tts_pipe = pipeline("text-to-speech",
                    model="kakao-enterprise/vits-ljs")

# BLIP visual-question-answering model and its matching processor.
model = BlipForQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base")

processor = AutoProcessor.from_pretrained(
    "Salesforce/blip-vqa-base")

def get_pipeline_prediction(pil_image, question):
    """Answer a question about an image and narrate the answer aloud.

    Args:
        pil_image: the input image (PIL image, as produced by ``gr.Image``).
        question: natural-language question about the image.

    Returns:
        A ``(sampling_rate, waveform)`` tuple in the shape ``gr.Audio``
        expects for ``type="numpy"`` output.
    """
    # Encode image + question for the BLIP VQA model.
    encoded = processor(pil_image, question, return_tensors="pt")

    # Generate the answer token ids and decode them to plain text.
    generated_ids = model.generate(**encoded)
    answer = processor.decode(generated_ids[0], skip_special_tokens=True)

    # Synthesize speech for the decoded answer.
    speech = tts_pipe(answer)

    return speech["sampling_rate"], speech["audio"][0]
    

# Gradio UI: an image and a question go in, a narrated audio answer comes out.
demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=[
        gr.Image(label="Input image", type="pil"),
        gr.Textbox(label="Ask your question"),
    ],
    outputs=gr.Audio(label="Narration", type="numpy", autoplay=True),
)

# Bind on all interfaces so the app is reachable from outside the container.
demo.launch(server_name="0.0.0.0", server_port=7860)