import gradio as gr
import torch
import torchaudio
import whisper
import cv2
import numpy as np
from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
from transformers import pipeline, AutoTokenizer, AutoModel
import tempfile
import os
import json
from datetime import timedelta
import librosa
from scipy.signal import find_peaks
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import nltk
from googletrans import Translator
import warnings

warnings.filterwarnings("ignore")

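# NOTE: assumed dependency set (not pinned in the original source): openai-whisper,
# moviepy<2.0 (for moviepy.editor), transformers, torch, librosa, googletrans,
# spacy (+ en_core_web_sm), opencv-python, scikit-learn, nltk, tensorflow, gradio.
# Exact versions are a guess; adjust to your environment.
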
class ZenVisionModel:
    """
    ZenVision - Advanced AI Subtitle Generation Model
    Developed by the ZenVision team
    3GB+ model combining multiple AI technologies
    """

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🚀 Initializing ZenVision on {self.device}")

        self.load_models()

    def load_models(self):
        """Loads all required AI models."""
        print("📦 Loading AI models...")

        # Speech-to-text (large-v2 alone is roughly 3 GB of weights,
        # downloaded on first use)
        self.whisper_model = whisper.load_model("large-v2")

        # Machine translation pipeline (loaded here; the translation step
        # below currently relies on googletrans instead)
        self.translator = pipeline("translation",
                                   model="Helsinki-NLP/opus-mt-en-mul",
                                   device=0 if self.device == "cuda" else -1)

        # Sentiment analysis
        self.sentiment_analyzer = pipeline("sentiment-analysis",
                                           model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                                           device=0 if self.device == "cuda" else -1)

        # Emotion detection
        self.emotion_detector = pipeline("text-classification",
                                         model="j-hartmann/emotion-english-distilroberta-base",
                                         device=0 if self.device == "cuda" else -1)

        # Multilingual BERT embeddings
        self.bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
        self.bert_model = AutoModel.from_pretrained("bert-base-multilingual-cased")

        # Fallback translator
        self.google_translator = Translator()

        # Named-entity recognition (optional)
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("⚠️ spaCy model not found, falling back to basic functionality")
            self.nlp = None

        print("✅ All models loaded successfully")

    def extract_audio_features(self, video_path):
        """Extracts advanced audio features from the video."""
        print("🎵 Extracting audio features...")

        # Pull the audio track out of the video into a temporary WAV file
        video = VideoFileClip(video_path)
        audio_path = tempfile.mktemp(suffix=".wav")
        video.audio.write_audiofile(audio_path, verbose=False, logger=None)

        # Load the audio at 16 kHz (the sample rate Whisper expects)
        y, sr = librosa.load(audio_path, sr=16000)

        # Spectral descriptors
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)

        # Non-silent intervals (returned as pairs of sample indices)
        intervals = librosa.effects.split(y, top_db=20)

        video.close()
        os.remove(audio_path)

        return {
            'audio_data': y,
            'sample_rate': sr,
            'mfccs': mfccs,
            'spectral_centroids': spectral_centroids,
            'chroma': chroma,
            'intervals': intervals,
            'duration': len(y) / sr
        }

    def advanced_transcription(self, audio_features):
        """Advanced transcription with Whisper plus contextual analysis."""
        print("🎤 Running advanced transcription...")

        # language=None lets Whisper auto-detect the language
        # ("auto" is not a valid Whisper language code)
        result = self.whisper_model.transcribe(
            audio_features['audio_data'],
            language=None,
            word_timestamps=True,
            verbose=False
        )

        # Enrich each segment with sentiment, emotion, and named entities
        segments = []
        for segment in result['segments']:
            sentiment = self.sentiment_analyzer(segment['text'])[0]
            emotion = self.emotion_detector(segment['text'])[0]

            entities = []
            if self.nlp:
                doc = self.nlp(segment['text'])
                entities = [(ent.text, ent.label_) for ent in doc.ents]

            segments.append({
                'start': segment['start'],
                'end': segment['end'],
                'text': segment['text'],
                'confidence': segment.get('avg_logprob', 0),
                'sentiment': sentiment,
                'emotion': emotion,
                'entities': entities,
                'words': segment.get('words', [])
            })

        return {
            'language': result['language'],
            'segments': segments,
            'full_text': result['text']
        }

    def intelligent_translation(self, transcription, target_language):
        """Intelligent translation combining multiple models."""
        print(f"🌍 Translating to {target_language}...")

        translated_segments = []

        for segment in transcription['segments']:
            original_text = segment['text']

            # Translate with googletrans; fall back to the original text on failure
            try:
                google_translation = self.google_translator.translate(
                    original_text,
                    dest=target_language
                ).text
            except Exception:
                google_translation = original_text

            # Try to preserve the original casing of named entities
            final_translation = google_translation
            if segment['entities']:
                for entity_text, entity_type in segment['entities']:
                    if entity_type in ['PERSON', 'ORG', 'GPE']:
                        final_translation = final_translation.replace(
                            entity_text.lower(), entity_text
                        )

            translated_segments.append({
                **segment,
                'translated_text': final_translation,
                'original_text': original_text
            })

        return translated_segments

    def generate_smart_subtitles(self, segments, video_duration):
        """Generates smart subtitles with optimized formatting."""
        print("📝 Generating smart subtitles...")

        subtitles = []

        for i, segment in enumerate(segments):
            duration = segment['end'] - segment['start']
            text = segment.get('translated_text', segment['text'])

            # Readability limits: up to 42 characters per line, 2 lines
            max_chars = 42
            max_lines = 2

            # Greedy word wrap; text beyond max_lines is dropped
            words = text.split()
            lines = []
            current_line = ""

            for word in words:
                if len(current_line + " " + word) <= max_chars:
                    current_line += (" " + word) if current_line else word
                else:
                    if current_line:
                        lines.append(current_line)
                    current_line = word

                if len(lines) >= max_lines:
                    break

            if current_line:
                lines.append(current_line)

            subtitle_text = "\n".join(lines[:max_lines])

            # Color the subtitle according to the detected emotion
            emotion_label = segment['emotion']['label']
            color = self.get_emotion_color(emotion_label)

            subtitles.append({
                'start': segment['start'],
                'end': segment['end'],
                'text': subtitle_text,
                'emotion': emotion_label,
                'color': color,
                'confidence': segment['confidence']
            })

        return subtitles

    def get_emotion_color(self, emotion):
        """Maps emotions to subtitle colors."""
        emotion_colors = {
            'joy': 'yellow',
            'sadness': 'blue',
            'anger': 'red',
            'fear': 'purple',
            'surprise': 'orange',
            'disgust': 'green',
            'neutral': 'white'
        }
        return emotion_colors.get(emotion.lower(), 'white')

    def create_subtitle_video(self, video_path, subtitles, output_path):
        """Renders the video with burned-in subtitles."""
        print("🎬 Creating subtitled video...")

        video = VideoFileClip(video_path)
        subtitle_clips = []

        for subtitle in subtitles:
            # TextClip requires ImageMagick to be installed (moviepy 1.x),
            # and 'Arial-Bold' must be a font ImageMagick can resolve
            txt_clip = TextClip(
                subtitle['text'],
                fontsize=24,
                font='Arial-Bold',
                color=subtitle['color'],
                stroke_color='black',
                stroke_width=2
            ).set_position(('center', 'bottom')).set_duration(
                subtitle['end'] - subtitle['start']
            ).set_start(subtitle['start'])

            subtitle_clips.append(txt_clip)

        # Overlay all subtitle clips on top of the original video
        final_video = CompositeVideoClip([video] + subtitle_clips)
        final_video.write_videofile(
            output_path,
            codec='libx264',
            audio_codec='aac',
            verbose=False,
            logger=None
        )

        video.close()
        final_video.close()

        return output_path

    def export_subtitle_formats(self, subtitles, base_path):
        """Exports subtitles in multiple formats (SRT, VTT, JSON)."""
        formats = {}

        # SRT
        srt_path = f"{base_path}.srt"
        with open(srt_path, 'w', encoding='utf-8') as f:
            for i, sub in enumerate(subtitles, 1):
                start_time = self.seconds_to_srt_time(sub['start'])
                end_time = self.seconds_to_srt_time(sub['end'])
                f.write(f"{i}\n{start_time} --> {end_time}\n{sub['text']}\n\n")
        formats['srt'] = srt_path

        # WebVTT
        vtt_path = f"{base_path}.vtt"
        with open(vtt_path, 'w', encoding='utf-8') as f:
            f.write("WEBVTT\n\n")
            for sub in subtitles:
                start_time = self.seconds_to_vtt_time(sub['start'])
                end_time = self.seconds_to_vtt_time(sub['end'])
                f.write(f"{start_time} --> {end_time}\n{sub['text']}\n\n")
        formats['vtt'] = vtt_path

        # JSON with full metadata
        json_path = f"{base_path}.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(subtitles, f, indent=2, ensure_ascii=False)
        formats['json'] = json_path

        return formats

    def seconds_to_srt_time(self, seconds):
        """Converts seconds to SRT timestamp format (HH:MM:SS,mmm)."""
        td = timedelta(seconds=seconds)
        hours, remainder = divmod(td.total_seconds(), 3600)
        minutes, seconds = divmod(remainder, 60)
        milliseconds = int((seconds % 1) * 1000)
        return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{milliseconds:03d}"

    def seconds_to_vtt_time(self, seconds):
        """Converts seconds to WebVTT timestamp format (HH:MM:SS.mmm)."""
        td = timedelta(seconds=seconds)
        hours, remainder = divmod(td.total_seconds(), 3600)
        minutes, seconds = divmod(remainder, 60)
        milliseconds = int((seconds % 1) * 1000)
        return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{milliseconds:03d}"

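    # Sanity-check examples for the two time formatters above:
    #   seconds_to_srt_time(83.5) -> "00:01:23,500"
    #   seconds_to_vtt_time(83.5) -> "00:01:23.500"
    # (SRT uses a comma before the milliseconds, WebVTT uses a period.)
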
    def process_video(self, video_file, target_language="es", include_emotions=True):
        """Runs the full pipeline on a video and generates subtitles."""
        if video_file is None:
            return None, None, "Please upload a video"

        try:
            print("🎯 Starting ZenVision processing...")

            # gr.Video may hand over either a file path (str) or a file-like
            # object, depending on the Gradio version
            video_path = video_file if isinstance(video_file, str) else video_file.name

            # 1. Audio extraction and analysis
            audio_features = self.extract_audio_features(video_path)

            # 2. Transcription with Whisper
            transcription = self.advanced_transcription(audio_features)

            # 3. Translation (only if the target differs from the detected language)
            if target_language != transcription['language']:
                segments = self.intelligent_translation(transcription, target_language)
            else:
                segments = transcription['segments']

            # 4. Subtitle generation
            subtitles = self.generate_smart_subtitles(segments, audio_features['duration'])

            # 5. Render the video with burned-in subtitles
            output_video_path = tempfile.mktemp(suffix=".mp4")
            self.create_subtitle_video(video_path, subtitles, output_video_path)

            # 6. Export subtitle files
            subtitle_base_path = tempfile.mktemp()
            subtitle_formats = self.export_subtitle_formats(subtitles, subtitle_base_path)

            # 7. Summary statistics
            stats = {
                'language_detected': transcription['language'],
                'total_segments': len(subtitles),
                'duration': audio_features['duration'],
                'avg_confidence': np.mean([s['confidence'] for s in segments]),
                'emotions_detected': len(set(s['emotion']['label'] for s in segments))
            }

            status_msg = f"""✅ Processing completed with ZenVision!

📊 Statistics:
• Detected language: {stats['language_detected']}
• Segments generated: {stats['total_segments']}
• Duration: {stats['duration']:.1f}s
• Average confidence: {stats['avg_confidence']:.2f}
• Emotions detected: {stats['emotions_detected']}

🎯 Technologies used:
• Whisper Large-v2 (transcription)
• BERT Multilingual (embeddings)
• RoBERTa (sentiment analysis)
• DistilRoBERTa (emotion detection)
• Google Translate (translation)
• OpenCV + MoviePy (video processing)
• Librosa (audio analysis)
• spaCy (advanced NLP)
"""

            return output_video_path, subtitle_formats['srt'], status_msg

        except Exception as e:
            return None, None, f"❌ ZenVision error: {str(e)}"

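
# Instantiating the model at import time triggers the model downloads
# (several gigabytes on first run); this is assumed to be intentional so the
# Gradio app starts with everything already in memory.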
print("🚀 Inicializando ZenVision Model...") |
|
|
zenvision = ZenVisionModel() |
|
|
|
|
|
|
|
|
with gr.Blocks(title="ZenVision - AI Subtitle Generator", theme=gr.themes.Soft()) as demo:
    gr.HTML("""
    <div style="text-align: center; padding: 20px;">
        <h1>🎬 ZenVision AI Subtitle Generator</h1>
        <p style="font-size: 18px; color: #666;">
            Advanced AI-powered automatic subtitling model<br>
            <strong>Developed by the ZenVision team</strong>
        </p>
        <p style="font-size: 14px; color: #888;">
            3GB+ model • Whisper • BERT • RoBERTa • OpenCV • Librosa • spaCy
        </p>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Input")
            video_input = gr.Video(label="Upload Video", height=300)

            with gr.Row():
                language_dropdown = gr.Dropdown(
                    choices=[
                        ("Español", "es"),
                        ("English", "en"),
                        ("Français", "fr"),
                        ("Deutsch", "de"),
                        ("Italiano", "it"),
                        ("Português", "pt"),
                        ("中文", "zh"),
                        ("日本語", "ja"),
                        ("한국어", "ko"),
                        ("Русский", "ru")
                    ],
                    value="es",
                    label="Target language"
                )

            emotions_checkbox = gr.Checkbox(
                label="Include emotion analysis",
                value=True
            )

            process_btn = gr.Button(
                "🚀 Process with ZenVision",
                variant="primary",
                size="lg"
            )

        with gr.Column(scale=1):
            gr.Markdown("### 📥 Results")
            video_output = gr.Video(label="Video with Subtitles", height=300)
            subtitle_file = gr.File(label="Subtitle File (.srt)")

    with gr.Row():
        status_output = gr.Textbox(
            label="Processing Status",
            lines=15,
            interactive=False
        )

gr.Markdown("### 🎯 Características de ZenVision") |
|
|
gr.HTML(""" |
|
|
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 20px 0;"> |
|
|
<div style="padding: 15px; border: 1px solid #ddd; border-radius: 8px;"> |
|
|
<h4>🎤 Transcripción Avanzada</h4> |
|
|
<p>Whisper Large-v2 con timestamps precisos y detección automática de idioma</p> |
|
|
</div> |
|
|
<div style="padding: 15px; border: 1px solid #ddd; border-radius: 8px;"> |
|
|
<h4>🌍 Traducción Inteligente</h4> |
|
|
<p>Google Translate + preservación de entidades nombradas</p> |
|
|
</div> |
|
|
<div style="padding: 15px; border: 1px solid #ddd; border-radius: 8px;"> |
|
|
<h4>😊 Análisis Emocional</h4> |
|
|
<p>Detección de emociones y sentimientos con colores adaptativos</p> |
|
|
</div> |
|
|
<div style="padding: 15px; border: 1px solid #ddd; border-radius: 8px;"> |
|
|
<h4>📝 Múltiples Formatos</h4> |
|
|
<p>Exportación en SRT, VTT y JSON con metadatos completos</p> |
|
|
</div> |
|
|
</div> |
|
|
""") |
|
|
|
|
|
|
|
|
    process_btn.click(
        fn=zenvision.process_video,
        inputs=[video_input, language_dropdown, emotions_checkbox],
        outputs=[video_output, subtitle_file, status_output]
    )

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )
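
# Assumed usage: run this script directly with Python and open http://localhost:7860;
# with share=True, Gradio also prints a temporary public link.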