import gradio as gr
from transformers import pipeline
import cv2
from PIL import Image
import scipy.io.wavfile  # explicit submodule import needed for scipy.io.wavfile.write below
import torch
import time
import numpy as np
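
# Gradio app: turn an uploaded video into a spoken description. Frames are sampled
# every 3 seconds and at detected scene changes, captioned with BLIP, summarized,
# translated to the requested language, and finally converted to speech with Bark.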


def detect_scene_changes(video_path, threshold):
    """
    Detect scene changes (shot boundaries) in a video.

    Parameters:
    - video_path: path to the video file
    - threshold: mean frame-difference threshold above which a scene change is reported

    Returns:
    A list of frame indices where a scene change is detected.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening the video.")
        return []
    ret, prev_frame = cap.read()
    if not ret:
        print("Error reading the video.")
        return []
    prev_frame_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    scene_changes = []
    frame_number = 1  # the first comparison involves the second frame of the video
    while True:
        ret, current_frame = cap.read()
        if not ret:
            break
        current_frame_gray = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
        # Compute the mean absolute difference between consecutive grayscale frames
        diff = cv2.absdiff(prev_frame_gray, current_frame_gray)
        mean_diff = np.mean(diff)
        if mean_diff > threshold:
            scene_changes.append(frame_number)
        prev_frame_gray = current_frame_gray
        frame_number += 1
    cap.release()
    return scene_changes
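

# End-to-end processing for the Gradio interface: caption sampled frames, summarize
# each scene, translate the summary, and synthesize the narration as audio.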
def video_to_descriptions(video, target_language="en"):
    threshold = 25.0  # mean-difference threshold for scene-change detection
    scene_changes = detect_scene_changes(video, threshold)
    start_time = time.time()
    print("START TIME = ", start_time)
    ImgToText = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    Summarize = pipeline("summarization", model="tuner007/pegasus_summarizer")
    translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language}")
    audio = pipeline("text-to-speech", model="suno/bark-small")
    voice_preset = f"v2/{target_language}_speaker_1"  # currently unused by the TTS call
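    # Walk through the video, captioning one frame every 3 seconds as well as every
    # frame flagged as a scene change.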
    cap = cv2.VideoCapture(video)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    descriptions = []
    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if (frame_count % (fps * 3) == 0) or (frame_count in scene_changes):
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_img = Image.fromarray(frame_rgb)
            outputs = ImgToText(pil_img)
            description = outputs[0]["generated_text"]
            if frame_count in scene_changes:
                descriptions.append(" There has been a scene change, now we can observe " + description)
                print(str(frame_count) + " | SCENE CHANGE | " + description)
            else:
                descriptions.append(" we can see that " + description)
                print(str(frame_count) + " | " + description)
        frame_count += 1
    cap.release()
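    # Rebuild one block of text per scene, then summarize, translate, and voice it.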
    concatenated_description = " ".join(descriptions).split(" There has been a scene change, now we can observe")
    plan_number = 1
    summarized_description = f"We can see the Scene number {plan_number}, where "
    for plan in concatenated_description:
        if summarized_description != "We can see the Scene number 1, where ":
            # Every scene after the first gets its own introductory sentence
            summarized_description += f"There has been a scene change, now we can observe the Scene number {plan_number}, where "
        summarized_description += Summarize(plan, max_length=20)[0]["summary_text"]
        plan_number += 1
    print("SUMMARIZATION : " + summarized_description)
    translated_text = translator(summarized_description, max_length=2560)[0]["translation_text"]
    print("TRANSLATION : " + translated_text)
    audio_file = audio(translated_text)
    output_path = "./bark_out.wav"
    scipy.io.wavfile.write(output_path, rate=audio_file["sampling_rate"], data=audio_file["audio"][0])
    stop_time = time.time()
    print("EXECUTION TIME = ", stop_time - start_time)
    return output_path
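

# Gradio UI: a video upload and a target-language dropdown in, the synthesized
# audio description out.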
language_dropdown = gr.Dropdown(
    ["en", "fr", "de", "es"], label="[MANDATORY] Language", info="The Voice's Language"
)

iface = gr.Interface(
    fn=video_to_descriptions,
    inputs=[gr.Video(label="Video to Upload", info="The Video"), language_dropdown],
    outputs="audio",
    live=False,
)

if __name__ == "__main__":
    iface.launch()