# seminaire1 / app.py
# RemyVF's picture
# Create app.py
# 19d14e7
# raw
# history blame
# 2.29 kB
import gradio as gr
from transformers import pipeline
import cv2
from PIL import Image
import io
import scipy
import torch
import time
def _caption_sampled_frames(video, captioner, seconds_between_samples=2, fallback_fps=30):
    """Caption one frame out of every ``seconds_between_samples`` seconds of video.

    Args:
        video: Value handed to ``cv2.VideoCapture`` (a file path when it comes
            from the Gradio video input).
        captioner: Image-to-text pipeline; called with a PIL image and expected
            to return ``[{"generated_text": ...}]``.
        seconds_between_samples: Spacing, in seconds, between captioned frames.
        fallback_fps: Frame rate assumed when the container reports none.

    Returns:
        The list of generated captions, in frame order (empty if no frame
        could be decoded).

    Raises:
        gr.Error: If the video cannot be opened.
    """
    cap = cv2.VideoCapture(video)
    try:
        if not cap.isOpened():
            raise gr.Error("The uploaded video could not be opened.")

        # CAP_PROP_FPS can come back as 0 / NaN when the backend cannot
        # determine it; a zero step would make the modulo below divide by zero.
        try:
            fps = int(cap.get(cv2.CAP_PROP_FPS))
        except (ValueError, OverflowError):
            fps = 0
        if fps <= 0:
            fps = fallback_fps
        frame_step = fps * seconds_between_samples

        descriptions = []
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_step == 0:
                # OpenCV decodes to BGR; PIL and the captioning model expect RGB.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                pil_img = Image.fromarray(frame_rgb)
                description = captioner(pil_img)[0]['generated_text']
                descriptions.append(description)
                print(str(frame_count) + " : " + description)
            frame_count += 1
        return descriptions
    finally:
        # Always free the decoder, even if captioning raised mid-stream.
        cap.release()


def video_to_descriptions(video, target_language="en"):
    """Describe a video out loud in the requested language.

    Frames sampled every two seconds are captioned, the captions are
    summarized, the summary is translated from English when needed, and the
    result is synthesized to speech and written to ``./bark_out.wav``.

    Args:
        video: Video to process, as accepted by ``cv2.VideoCapture``.
        target_language: Language code used to pick the
            ``Helsinki-NLP/opus-mt-en-<code>`` translation model. ``None`` or
            an empty value is treated as ``"en"``.

    Returns:
        Path of the generated WAV file.

    Raises:
        gr.Error: If no video was supplied, it cannot be opened, or no frame
            could be read from it.
    """
    # Imported here so the ``scipy.io.wavfile`` submodule is guaranteed to be
    # loaded; a bare ``import scipy`` does not promise that on every version.
    from scipy.io import wavfile

    start_time = time.time()
    print("START TIME = ", start_time)

    if not video:
        raise gr.Error("Please upload a video first.")
    # An unselected dropdown yields None, which would otherwise be
    # interpolated into the translation model name.
    target_language = target_language or "en"

    captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    descriptions = _caption_sampled_frames(video, captioner)
    if not descriptions:
        raise gr.Error("No frames could be read from the video.")

    concatenated_description = " ".join(descriptions)
    summarizer = pipeline("summarization", model="tuner007/pegasus_summarizer")
    summarized_description = summarizer(concatenated_description, max_length=31)[0]["summary_text"]
    print("SUMMARIZATION : " + summarized_description)

    if target_language == "en":
        # The summary is already English: nothing to translate, and no
        # English-to-English model needs to be loaded.
        translated_text = summarized_description
    else:
        translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language}")
        translated_text = translator(summarized_description)[0]["translation_text"]
    print("TRANSLATION : " + translated_text)

    # NOTE: the previous version built a "v2/<lang>_speaker_1" voice preset but
    # never passed it to the pipeline, so it had no effect and was dropped.
    speech = pipeline("text-to-speech", model="suno/bark-small")
    audio_file = speech(translated_text)

    output_path = "./bark_out.wav"
    wavfile.write(output_path, data=audio_file["audio"][0], rate=audio_file["sampling_rate"])

    stop_time = time.time()
    print("EXECUTION TIME = ", stop_time - start_time)
    return output_path
# Language selector for the spoken output. It starts on "en" so the callback
# never receives None when the user submits without touching the dropdown.
language_dropdown = gr.Dropdown(
    ["en", "fr", "de", "es"],
    value="en",
    label="[MANDATORY] Language",
    info="The Voice's Language",
)

# Web UI: a video upload plus the language selector in, one audio clip out.
# NOTE(review): `info=` was removed from gr.Video because that component does
# not appear to declare it (unlike gr.Dropdown) — confirm against the
# installed Gradio version.
iface = gr.Interface(
    fn=video_to_descriptions,
    inputs=[gr.Video(label="Video to Upload"), language_dropdown],
    outputs="audio",
    live=False,
)

if __name__ == "__main__":
    iface.launch()