fawadrashid committed on
Commit
e7db0a6
1 Parent(s): 2389ef4

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +27 -0
  2. app2.py +43 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Slim Python image that runs the Gradio VQA/TTS app as a non-root user.
FROM python:3.9-slim

# --- System packages (as root) -------------------------------------------
# espeak-ng is required by the phonemizer/TTS stack; wget/zip/unzip/uvicorn
# were installed by the original image and are kept for compatibility.
# One RUN layer with `-y` (apt is non-interactive in builds) and a trailing
# cleanup of the apt lists keeps the layer small. The original ran
# `rm /var/lib/apt/lists/*` BEFORE `apt-get update`, which re-created the
# lists and kept them in the image; cleanup must come last.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y --no-install-recommends wget zip unzip uvicorn espeak-ng && \
    rm -rf /var/lib/apt/lists/*

# --- Unprivileged runtime user -------------------------------------------
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# --- Python dependencies --------------------------------------------------
# Copy requirements first so this layer is cached across code-only edits.
# NOTE: the original `COPY ./requirements.txt ~/app/requirements.txt` did
# NOT expand `~` (Dockerfile COPY performs no tilde expansion) and created a
# literal `~` directory; $HOME is expanded correctly.
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

# --- Application code -----------------------------------------------------
# --chown=user makes the files owned by the runtime user, removing the need
# for the original root-owned `COPY . .` followed by `chmod 777`.
COPY --chown=user . $HOME/app

# NOTE(review): the file uploaded alongside this Dockerfile is app2.py, but
# the original CMD ran app.py — this only works if an app.py exists in the
# build context. TODO confirm the intended entry point.
CMD ["python", "app.py"]
app2.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Gradio app: visual question answering (BLIP) with spoken answers (VITS TTS).

The user uploads an image and types a question; the app generates a text
answer with Salesforce/blip-vqa-base and narrates it with
kakao-enterprise/vits-ljs, returning audio playable in the browser.
"""

import gradio as gr
from transformers import AutoProcessor
from transformers import BlipForQuestionAnswering
from transformers import pipeline
from transformers.utils import logging

# Silence transformers' informational/warning chatter.
logging.set_verbosity_error()

# NOTE(review): the original also loaded an object-detection pipeline
# ("facebook/detr-resnet-50") here that was never used anywhere in the file;
# it has been removed to cut startup time and memory.

# Text-to-speech pipeline used to narrate the VQA answer.
tts_pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

# BLIP visual-question-answering model and its matching processor.
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")


def get_pipeline_prediction(pil_image, question):
    """Answer *question* about *pil_image* and narrate the answer.

    Args:
        pil_image: PIL image supplied by the Gradio Image component.
        question: free-text question about the image.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, the format expected by
        ``gr.Audio(type="numpy")``.
    """
    inputs = processor(pil_image, question, return_tensors="pt")
    out = model.generate(**inputs)
    text = processor.decode(out[0], skip_special_tokens=True)
    narrated = tts_pipe(text)
    # The TTS pipeline returns audio with a leading batch/channel axis;
    # index [0] yields the mono waveform Gradio expects.
    return (narrated["sampling_rate"], narrated["audio"][0])


demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=[
        gr.Image(label="Input image", type="pil"),
        gr.Textbox(label="Ask your question"),
    ],
    outputs=gr.Audio(label="Narration", type="numpy", autoplay=True),
)

# Bind to all interfaces on 7860 so the app is reachable from outside the
# container (matches the Docker deployment).
demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ opencv-python-headless<4.3
2
+ gradio
3
+ transformers
4
+ phonemizer
5
+ py-espeak-ng
6
+ inflect
7
+ timm