RemyVF committed on
Commit
19d14e7
1 Parent(s): 15be723

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -0
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ import cv2
4
+ from PIL import Image
5
+ import io
6
+ import scipy
7
+ import torch
8
+ import time
9
+
10
+
11
+
12
def video_to_descriptions(video, target_language="en"):
    """Turn a video into a short spoken description and return the WAV path.

    Roughly one frame every two seconds is captioned, the captions are joined
    and summarized, the summary is translated from English into
    ``target_language`` (skipped when the target is English), and the result
    is synthesized to speech and written to ``./bark_out.wav``.

    Args:
        video: Path of the video file, as accepted by ``cv2.VideoCapture``.
        target_language: Language code used to pick the
            ``Helsinki-NLP/opus-mt-en-<code>`` translation model. ``None`` or
            an empty string falls back to ``"en"``.

    Returns:
        The path of the written WAV file (``"./bark_out.wav"``).

    Raises:
        ValueError: If no video is given, the video cannot be opened, or no
            frame could be read from it.
    """
    # Fail fast on missing input, before any model is loaded.
    if not video:
        raise ValueError("No video was provided.")

    # The language may arrive as None when the caller supplies no choice.
    language = target_language or "en"

    start_time = time.time()
    print("START TIME = ", start_time)

    cap = cv2.VideoCapture(video)
    descriptions = []
    try:
        if not cap.isOpened():
            raise ValueError(f"Could not open video: {video!r}")

        captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")

        # Some containers report an FPS of 0 (or NaN); the original modulo by
        # ``fps * 2`` would then divide by zero. Assume 30 fps in that case.
        raw_fps = cap.get(cv2.CAP_PROP_FPS)
        fps = int(raw_fps) if raw_fps and raw_fps >= 1 else 30
        frame_interval = fps * 2  # caption one frame per ~2 seconds of video

        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                # OpenCV decodes frames as BGR; PIL expects RGB.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                pil_img = Image.fromarray(frame_rgb)

                description = captioner(pil_img)[0]['generated_text']
                descriptions.append(description)
                print(str(frame_count) + " : " + description)

            frame_count += 1
    finally:
        # Release the capture handle even if opening or captioning fails.
        cap.release()

    if not descriptions:
        raise ValueError(f"No frames could be read from video: {video!r}")

    summarizer = pipeline("summarization", model="tuner007/pegasus_summarizer")
    concatenated_description = " ".join(descriptions)
    # truncation=True keeps long caption lists within the model's input limit.
    summarized_description = summarizer(
        concatenated_description, max_length=31, truncation=True
    )[0]["summary_text"]
    print("SUMMARIZATION : " + summarized_description)

    if language == "en":
        # The captions are already English, and no en->en model name is valid
        # for the opus-mt naming scheme used below, so skip translation.
        translated_text = summarized_description
    else:
        translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{language}")
        translated_text = translator(summarized_description)[0]["translation_text"]
    print("TRANSLATION : " + translated_text)

    # NOTE(review): the original built a Bark voice preset string
    # (f"v2/{language}_speaker_1") but never passed it to the pipeline; it was
    # dropped as an unused local. Wire it in once the intended API is confirmed.
    tts = pipeline("text-to-speech", model="suno/bark-small")
    audio_file = tts(translated_text)

    # Local imports: a bare ``import scipy`` does not reliably expose
    # ``scipy.io.wavfile`` as an attribute.
    import numpy as np
    from scipy.io import wavfile

    # The waveform may or may not carry a leading batch axis depending on the
    # library version (assumption — confirm); squeeze handles both layouts.
    waveform = np.squeeze(np.asarray(audio_file["audio"]))

    output_path = "./bark_out.wav"
    wavfile.write(output_path, rate=audio_file["sampling_rate"], data=waveform)

    stop_time = time.time()

    print("EXECUTION TIME = ", stop_time - start_time)
    return output_path
67
+
68
# Selector for the language of the generated voice; its value is passed to
# video_to_descriptions as the second argument.
language_dropdown = gr.Dropdown(
    ["en", "fr", "de", "es"], label="[MANDATORY] Language", info="The Voice's Language"
)

# NOTE(review): ``info`` is a parameter of form components such as Dropdown;
# gr.Video does not appear to accept it (unexpected keyword), so the video
# input is described by its label only — confirm against the Gradio version
# in use.
iface = gr.Interface(
    fn=video_to_descriptions,
    inputs=[gr.Video(label="Video to Upload"), language_dropdown],
    outputs="audio",
    live=False,
)

# Start the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()