chethu committed on
Commit
d2e8024
1 Parent(s): 3603e2d

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +27 -0
  2. app (1).py +49 -0
  3. requirements (1).txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for the app started with `python app.py`.
#
# NOTE(review): the build expects files named `requirements.txt` and `app.py`
# in the build context. This upload lists them as "requirements (1).txt" and
# "app (1).py"; rename them, otherwise the COPY/pip/CMD steps cannot find them.
FROM python:3.9-slim

# System packages are installed as root (the base image's default user).
# update/upgrade/install share one layer so a cached `apt-get update` can never
# be stale relative to the install step, and `-y` keeps `apt-get upgrade` from
# aborting at its confirmation prompt during the non-interactive build.
RUN rm -rf /var/lib/apt/lists/* \
    && apt-get clean \
    && apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y wget zip unzip uvicorn espeak-ng \
    && rm -rf /var/lib/apt/lists/*

# Everything below runs as an unprivileged user with uid 1000.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Copy the dependency list on its own first so the pip layer is reused when
# only application code changes. COPY does not expand `~`, so the destination
# is spelled with $HOME instead.
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
RUN pip3 install -r requirements.txt

# Application sources, owned by the runtime user; this replaces the former
# root-owned `COPY . .` followed by `chmod 777`.
COPY --chown=user . $HOME/app

CMD ["python", "app.py"]
app (1).py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import gradio as gr
from PIL import Image, ImageDraw, ImageFont
from transformers import pipeline
from transformers.utils import logging

from helper import (
    ignore_warnings,
    load_image_from_url,
    render_results_in_image,
    summarize_predictions_natural_language,
)

# Quieten transformers logging and helper-managed warnings before any model
# is loaded.
logging.set_verbosity_error()
ignore_warnings()

# Both pipelines are created once at import time and shared by all requests.
od_pipe = pipeline("object-detection", "facebook/detr-resnet-50")
tts_pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")


def get_pipeline_prediction(pil_image):
    """Detect objects in an image and narrate the detections as speech.

    Args:
        pil_image: Image supplied by the Gradio input component (configured
            with ``type="pil"``), or ``None`` when nothing was uploaded.

    Returns:
        A tuple ``(processed_image, (sampling_rate, audio))`` where
        ``processed_image`` is the result of ``render_results_in_image`` and
        the inner pair is taken from the text-to-speech pipeline output, in
        the order expected by the ``gr.Audio(type="numpy")`` output.

    Raises:
        gr.Error: If no image was supplied, so the UI shows a readable
            message instead of a failure from inside the detector.
    """
    if pil_image is None:
        raise gr.Error("Please provide an input image.")

    detections = od_pipe(pil_image)
    processed_image = render_results_in_image(pil_image, detections)

    text = summarize_predictions_natural_language(detections)
    print(text)  # keep the spoken summary visible in the server log

    narration = tts_pipe(text)
    # NOTE(review): index 0 presumably selects the single waveform from a
    # batched audio array -- confirm against the TTS pipeline output shape.
    waveform = narration["audio"][0]
    return processed_image, (narration["sampling_rate"], waveform)


# Input component: the picture is passed to the callback as a PIL image.
image_input = gr.Image(label="Input image", type="pil")

# Output components: the rendered image plus the spoken summary, which starts
# playing automatically.
annotated_output = gr.Image(
    label="Output image with predicted instances",
    type="pil",
)
narration_output = gr.Audio(label="Narration", type="numpy", autoplay=True)

demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=image_input,
    outputs=[annotated_output, narration_output],
)

# Bind to all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)
requirements (1).txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ opencv-python-headless<4.3
2
+ gradio
3
+ transformers
4
+ phonemizer
5
+ py-espeak-ng
6
+ inflect
7
+ timm