import gradio as gr
import whisper
import cv2
import numpy as np
import moviepy.editor as mp
from moviepy.video.fx import resize
from transformers import pipeline, AutoTokenizer, AutoModel
import torch
import re
import os
import tempfile
from typing import List, Dict, Tuple
import json
import librosa
from textblob import TextBlob
import emoji
import yt_dlp
import requests
from urllib.parse import urlparse, parse_qs
class AIVideoClipper:
    def __init__(self):
        # Initialize models
        print("Loading models...")
        self.whisper_model = whisper.load_model("base")  # Using base model for free tier
        self.sentiment_analyzer = pipeline(
            "sentiment-analysis",
            model="cardiffnlp/twitter-roberta-base-sentiment-latest"
        )
        self.emotion_analyzer = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base"
        )

        # Viral keywords and patterns
        self.viral_keywords = [
            "wow", "amazing", "incredible", "unbelievable", "shocking", "surprise",
            "secret", "trick", "hack", "tip", "mistake", "fail", "success",
            "breakthrough", "discovery", "reveal", "expose", "truth", "lie",
            "before", "after", "transformation", "change", "upgrade", "improve",
            "money", "rich", "poor", "expensive", "cheap", "free", "save",
            "love", "hate", "angry", "happy", "sad", "funny", "laugh", "cry",
            "first time", "last time", "never", "always", "everyone", "nobody",
            "finally", "suddenly", "immediately", "instantly", "quickly"
        ]

        self.hook_patterns = [
            r"you won't believe",
            r"this will change",
            r"nobody talks about",
            r"the truth about",
            r"what happens when",
            r"here's what",
            r"this is why",
            r"the secret",
            r"watch this",
            r"wait for it"
        ]
    def download_youtube_video(self, url: str, temp_dir: str) -> Tuple[str, Dict]:
        """Download YouTube video and return path + metadata"""
        print(f"Downloading YouTube video: {url}")

        # Validate YouTube URL
        if not self.is_valid_youtube_url(url):
            raise ValueError("Invalid YouTube URL. Please provide a valid YouTube video link.")

        # Configure yt-dlp options for free tier optimization
        ydl_opts = {
            'format': 'best[height<=720][ext=mp4]/best[ext=mp4]/best',  # Limit to 720p for performance
            'outtmpl': os.path.join(temp_dir, '%(title)s.%(ext)s'),
            'noplaylist': True,
            'extractaudio': False,
            'audioformat': 'mp3',
            'ignoreerrors': False,
            'no_warnings': False,
            'extract_flat': False,
        }

        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                # Extract info first
                info = ydl.extract_info(url, download=False)

                # Check video duration (limit to 60 minutes for free tier)
                duration = info.get('duration', 0)
                if duration > 3600:  # 1 hour limit
                    raise ValueError("Video too long. Please use videos shorter than 1 hour.")

                # Download the video
                ydl.download([url])

                # Find the downloaded file
                video_title = info.get('title', 'video')
                video_ext = info.get('ext', 'mp4')
                video_path = os.path.join(temp_dir, f"{video_title}.{video_ext}")

                # Sometimes yt-dlp changes the filename, so find the actual file
                downloaded_files = [f for f in os.listdir(temp_dir) if f.endswith(('.mp4', '.mkv', '.webm'))]
                if downloaded_files:
                    video_path = os.path.join(temp_dir, downloaded_files[0])

                metadata = {
                    'title': video_title,
                    'duration': duration,
                    'uploader': info.get('uploader', 'Unknown'),
                    'view_count': info.get('view_count', 0),
                    'upload_date': info.get('upload_date', 'Unknown')
                }

                print(f"Successfully downloaded: {video_title}")
                return video_path, metadata

        except Exception as e:
            raise Exception(f"Failed to download YouTube video: {str(e)}")
    def is_valid_youtube_url(self, url: str) -> bool:
        """Check if URL is a valid YouTube URL"""
        youtube_regex = re.compile(
            r'(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/'
            r'(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})'
        )
        return youtube_regex.match(url) is not None
    def extract_video_id(self, url: str) -> str:
        """Extract video ID from YouTube URL"""
        patterns = [
            r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
            r'(?:embed\/)([0-9A-Za-z_-]{11})',
            r'(?:v\/)([0-9A-Za-z_-]{11})'
        ]
        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None
"""Extract audio features for engagement analysis""" | |
y, sr = librosa.load(audio_path) | |
# Extract features | |
tempo, _ = librosa.beat.beat_track(y=y, sr=sr) | |
spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0] | |
spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0] | |
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13) | |
return { | |
'tempo': float(tempo), | |
'spectral_centroid_mean': float(np.mean(spectral_centroids)), | |
'spectral_rolloff_mean': float(np.mean(spectral_rolloff)), | |
'mfcc_mean': float(np.mean(mfccs)), | |
'energy_variance': float(np.var(librosa.feature.rms(y=y)[0])) | |
} | |
    def transcribe_video(self, video_path: str) -> List[Dict]:
        """Transcribe video and return segments with timestamps"""
        print("Transcribing video...")
        result = self.whisper_model.transcribe(video_path, word_timestamps=True)

        segments = []
        for segment in result["segments"]:
            segments.append({
                'start': segment['start'],
                'end': segment['end'],
                'text': segment['text'].strip(),
                'words': segment.get('words', [])
            })
        return segments
    def calculate_virality_score(self, text: str, audio_features: Dict,
                                 segment_duration: float) -> float:
        """Calculate virality score for a text segment"""
        score = 0.0
        text_lower = text.lower()

        # Sentiment analysis (the cardiffnlp model returns lowercase labels,
        # so compare case-insensitively)
        sentiment = self.sentiment_analyzer(text)[0]
        sentiment_label = sentiment['label'].lower()
        if sentiment_label == 'positive' and sentiment['score'] > 0.8:
            score += 2.0
        elif sentiment_label == 'negative' and sentiment['score'] > 0.8:
            score += 1.5

        # Emotion analysis
        emotion = self.emotion_analyzer(text)[0]
        high_engagement_emotions = ['surprise', 'excitement', 'anger', 'joy']
        if emotion['label'].lower() in high_engagement_emotions and emotion['score'] > 0.7:
            score += 2.0

        # Viral keywords
        for keyword in self.viral_keywords:
            if keyword in text_lower:
                score += 1.0

        # Hook patterns
        for pattern in self.hook_patterns:
            if re.search(pattern, text_lower):
                score += 3.0

        # Audio engagement features
        if audio_features['tempo'] > 120:  # Higher tempo = more engaging
            score += 1.0
        if audio_features['energy_variance'] > 0.01:  # Energy variation
            score += 1.0

        # Segment duration (30-60 seconds ideal for clips)
        if 25 <= segment_duration <= 65:
            score += 2.0
        elif 15 <= segment_duration <= 90:
            score += 1.0

        # Text length (not too short, not too long)
        word_count = len(text.split())
        if 20 <= word_count <= 100:
            score += 1.0

        return min(score, 10.0)  # Cap at 10
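
    # Rough worked example of the scoring above (sentiment, emotion, and audio
    # bonuses depend on model output and the source audio, so they are omitted):
    # a ~30 s segment containing "you won't believe this amazing trick" earns
    # +3.0 for the hook pattern, +2.0 for the 25-65 s duration window, and +3.0
    # from keywords ("amazing", "trick", and "lie" -- the keyword check is a
    # plain substring match, so "lie" inside "believe" also counts), i.e. at
    # least 8.0 before any model-based bonuses, capped at 10.0.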
    def find_best_moments(self, segments: List[Dict], audio_features: Dict,
                          clip_duration: int = 30) -> List[Dict]:
        """Find the best moments for short clips"""
        print("Analyzing segments for viral potential...")

        scored_segments = []
        for i, segment in enumerate(segments):
            # Group segments into potential clips
            clip_segments = [segment]
            current_duration = segment['end'] - segment['start']

            # Extend clip to reach desired duration
            j = i + 1
            while j < len(segments) and current_duration < clip_duration:
                next_segment = segments[j]
                if next_segment['end'] - segment['start'] <= clip_duration * 1.5:
                    clip_segments.append(next_segment)
                    current_duration = next_segment['end'] - segment['start']
                    j += 1
                else:
                    break

            # Calculate combined text and virality score
            combined_text = " ".join([s['text'] for s in clip_segments])
            virality_score = self.calculate_virality_score(
                combined_text, audio_features, current_duration
            )

            scored_segments.append({
                'start': segment['start'],
                'end': clip_segments[-1]['end'],
                'text': combined_text,
                'duration': current_duration,
                'virality_score': virality_score,
                'segments': clip_segments
            })

        # Sort by virality score and remove overlaps
        scored_segments.sort(key=lambda x: x['virality_score'], reverse=True)

        # Remove overlapping segments
        final_segments = []
        for segment in scored_segments:
            overlap = False
            for existing in final_segments:
                if (segment['start'] < existing['end'] and
                        segment['end'] > existing['start']):
                    overlap = True
                    break
            if not overlap:
                final_segments.append(segment)
                if len(final_segments) >= 5:  # Limit to top 5 clips
                    break

        return final_segments
    def add_emojis_to_text(self, text: str) -> str:
        """Add relevant emojis to text based on content"""
        emoji_map = {
            'money': '💰', 'rich': '💰', 'dollar': '💵',
            'love': '❤️', 'heart': '❤️', 'like': '👍',
            'fire': '🔥', 'hot': '🔥', 'amazing': '🔥',
            'laugh': '😂', 'funny': '😂', 'lol': '😂',
            'wow': '😱', 'omg': '😱', 'shocking': '😱',
            'cool': '😎', 'awesome': '😎', 'great': '😎',
            'think': '🤔', 'question': '❓', 'why': '🤔',
            'warning': '⚠️', 'careful': '⚠️', 'danger': '⚠️',
            'success': '✅', 'win': '🏆', 'winner': '🏆',
            'music': '🎵', 'song': '🎵', 'sound': '🔊'
        }

        words = text.lower().split()
        for word in words:
            clean_word = re.sub(r'[^\w]', '', word)
            if clean_word in emoji_map:
                text = re.sub(f"\\b{re.escape(word)}\\b",
                              f"{word} {emoji_map[clean_word]}", text, flags=re.IGNORECASE)
        return text
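
    # Illustrative behaviour of the mapping above (hypothetical input; only
    # words present in emoji_map are decorated, the rest are left untouched):
    #   add_emojis_to_text("I love this amazing hack")
    #   -> "I love ❤️ this amazing 🔥 hack"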
    def create_clip(self, video_path: str, start_time: float, end_time: float,
                    text: str, output_path: str, add_subtitles: bool = True) -> str:
        """Create a short clip from the video"""
        print(f"Creating clip: {start_time:.1f}s - {end_time:.1f}s")

        # Load video
        video = mp.VideoFileClip(video_path).subclip(start_time, end_time)

        # Resize to 9:16 aspect ratio (1080x1920)
        target_width = 1080
        target_height = 1920

        # Calculate scaling to fit the video in the frame
        scale_w = target_width / video.w
        scale_h = target_height / video.h
        scale = min(scale_w, scale_h)

        # Resize video
        video_resized = video.resize(scale)

        # Create background (blur or solid color)
        if video_resized.h < target_height or video_resized.w < target_width:
            # Create blurred background
            background = video.resize((target_width, target_height))
            background = background.fl_image(lambda frame: cv2.GaussianBlur(frame, (21, 21), 0))

            # Overlay the main video in center
            final_video = mp.CompositeVideoClip([
                background,
                video_resized.set_position('center')
            ], size=(target_width, target_height))
        else:
            final_video = video_resized

        # Add subtitles if requested
        if add_subtitles and text:
            # Add emojis to text
            text_with_emojis = self.add_emojis_to_text(text)

            # Create text clip (note: moviepy's TextClip requires ImageMagick
            # to be installed on the system)
            txt_clip = mp.TextClip(
                text_with_emojis,
                fontsize=60,
                color='white',
                stroke_color='black',
                stroke_width=3,
                size=(target_width - 100, None),
                method='caption'
            ).set_position(('center', 0.8), relative=True).set_duration(final_video.duration)

            final_video = mp.CompositeVideoClip([final_video, txt_clip])

        # Write the final video
        final_video.write_videofile(
            output_path,
            codec='libx264',
            audio_codec='aac',
            temp_audiofile='temp-audio.m4a',
            remove_temp=True,
            fps=30,
            preset='ultrafast'  # Faster encoding for free tier
        )

        # Clean up
        video.close()
        final_video.close()

        return output_path
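
# Minimal standalone sketch of the pipeline above (hypothetical file paths;
# the Gradio app below drives the same steps automatically):
#   clipper = AIVideoClipper()
#   audio_features = clipper.extract_audio_features("input.mp4")
#   segments = clipper.transcribe_video("input.mp4")
#   moments = clipper.find_best_moments(segments, audio_features, clip_duration=30)
#   clipper.create_clip("input.mp4", moments[0]['start'], moments[0]['end'],
#                       moments[0]['text'], "clip_1.mp4")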
def process_video(input_type, video_file, youtube_url, clip_duration, num_clips, add_subtitles):
    """Main function to process video and create clips"""
    clipper = AIVideoClipper()

    try:
        # Create temporary directory
        with tempfile.TemporaryDirectory() as temp_dir:
            video_path = None
            video_metadata = {}

            # Handle input based on type
            if input_type == "Upload Video File":
                if video_file is None:
                    return "Please upload a video file.", [], []
                # gr.File(type="filepath") passes a plain path string; older
                # Gradio versions pass a tempfile wrapper with a .name attribute
                video_path = video_file if isinstance(video_file, str) else video_file.name
                video_metadata = {'title': 'Uploaded Video', 'source': 'upload'}
            elif input_type == "YouTube URL":
                if not youtube_url or not youtube_url.strip():
                    return "Please enter a YouTube URL.", [], []
                try:
                    video_path, video_metadata = clipper.download_youtube_video(youtube_url.strip(), temp_dir)
                    video_metadata['source'] = 'youtube'
                except Exception as e:
                    return f"Error downloading YouTube video: {str(e)}", [], []
            else:
                return "Please select an input method.", [], []

            if not video_path or not os.path.exists(video_path):
                return "Video file not found or invalid.", [], []

            # Extract audio features
            print("Extracting audio features...")
            audio_features = clipper.extract_audio_features(video_path)

            # Transcribe video
            segments = clipper.transcribe_video(video_path)
            if not segments:
                return "Could not transcribe video. Please check the audio quality.", [], []

            # Find best moments
            best_moments = clipper.find_best_moments(segments, audio_features, clip_duration)
            best_moments = best_moments[:num_clips]  # Limit to requested number

            if not best_moments:
                return "No suitable clips found. Try adjusting parameters.", [], []

            # Create clips
            output_videos = []
            clip_info = []

            for i, moment in enumerate(best_moments):
                output_path = os.path.join(temp_dir, f"clip_{i+1}.mp4")
                try:
                    clipper.create_clip(
                        video_path,
                        moment['start'],
                        moment['end'],
                        moment['text'],
                        output_path,
                        add_subtitles
                    )

                    # Copy to permanent location (the temp dir is deleted when
                    # this block exits)
                    permanent_path = f"clip_{i+1}_{hash(video_path)}_{i}.mp4"
                    os.rename(output_path, permanent_path)
                    output_videos.append(permanent_path)

                    clip_info.append({
                        'clip_number': i + 1,
                        'start_time': f"{moment['start']:.1f}s",
                        'end_time': f"{moment['end']:.1f}s",
                        'duration': f"{moment['duration']:.1f}s",
                        'virality_score': f"{moment['virality_score']:.2f}/10",
                        'text_preview': moment['text'][:100] + "..." if len(moment['text']) > 100 else moment['text'],
                        'source_video': video_metadata.get('title', 'Unknown')
                    })
                except Exception as e:
                    print(f"Error creating clip {i+1}: {str(e)}")
                    continue

            success_msg = f"✅ Successfully created {len(output_videos)} clips from: {video_metadata.get('title', 'video')}"
            return success_msg, output_videos, clip_info

    except Exception as e:
        return f"Error processing video: {str(e)}", [], []
# Create Gradio interface
def create_interface():
    with gr.Blocks(title="AI Video Clipper", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 🎬 AI Video Clipper

            Transform your long videos into viral short clips automatically!
            Upload a video file or paste a YouTube URL and let AI find the most engaging moments.

            **Features:**
            - 🤖 AI-powered moment detection
            - 📱 Auto 9:16 aspect ratio conversion
            - 📝 Automatic subtitles with emojis
            - 📊 Virality scoring
            - 🎯 Multi-language support
            - 🔗 YouTube video download support
            """
        )
        with gr.Row():
            with gr.Column():
                # Input method selection
                input_type = gr.Radio(
                    choices=["Upload Video File", "YouTube URL"],
                    value="Upload Video File",
                    label="Choose Input Method",
                    interactive=True
                )

                # Video file upload (conditional)
                video_input = gr.File(
                    label="Upload Video File",
                    file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
                    type="filepath",
                    visible=True
                )

                # YouTube URL input (conditional)
                youtube_input = gr.Textbox(
                    label="YouTube URL",
                    placeholder="https://www.youtube.com/watch?v=...",
                    visible=False,
                    info="Paste any YouTube video URL (supports various formats)"
                )

                # Show example URLs (kept in a variable so its visibility can be toggled)
                url_examples = gr.Markdown(
                    """
                    **Supported URL formats:**
                    - `https://www.youtube.com/watch?v=VIDEO_ID`
                    - `https://youtu.be/VIDEO_ID`
                    - `https://www.youtube.com/embed/VIDEO_ID`
                    """,
                    visible=False,
                    elem_id="url_examples"
                )
                with gr.Row():
                    clip_duration = gr.Slider(
                        minimum=15,
                        maximum=90,
                        value=30,
                        step=5,
                        label="Target Clip Duration (seconds)"
                    )
                    num_clips = gr.Slider(
                        minimum=1,
                        maximum=5,
                        value=3,
                        step=1,
                        label="Number of Clips to Generate"
                    )

                add_subtitles = gr.Checkbox(
                    label="Add Subtitles with Emojis",
                    value=True
                )

                process_btn = gr.Button(
                    "🚀 Create Clips",
                    variant="primary",
                    size="lg"
                )

            with gr.Column():
                status_output = gr.Textbox(
                    label="Status",
                    interactive=False,
                    lines=3
                )

                clips_output = gr.Gallery(
                    label="Generated Clips",
                    show_label=True,
                    elem_id="gallery",
                    columns=1,
                    rows=3,
                    height="auto",
                    allow_preview=True,
                    show_download_button=True
                )

        with gr.Row():
            info_output = gr.JSON(
                label="Clip Analysis",
                visible=True
            )
        # Dynamic input visibility
        def update_input_visibility(choice):
            if choice == "Upload Video File":
                return (
                    gr.update(visible=True),   # video_input
                    gr.update(visible=False),  # youtube_input
                    gr.update(visible=False)   # url_examples
                )
            else:  # YouTube URL
                return (
                    gr.update(visible=False),  # video_input
                    gr.update(visible=True),   # youtube_input
                    gr.update(visible=True)    # url_examples
                )

        input_type.change(
            update_input_visibility,
            inputs=[input_type],
            outputs=[video_input, youtube_input, url_examples]
        )
        # Example videos section
        gr.Markdown("### 📺 Tips for Best Results:")
        gr.Markdown("""
        **📁 File Upload:**
        - Upload videos with clear speech (podcasts, interviews, tutorials work great!)
        - Supported formats: MP4, AVI, MOV, MKV, WebM
        - Maximum recommended duration: 2 hours

        **🌐 YouTube Videos:**
        - Any public YouTube video (no age restrictions)
        - Automatically downloads in optimal quality (720p max for performance)
        - Works with livestreams, premieres, and regular videos
        - Maximum duration: 1 hour for free tier

        **🎯 Content Tips:**
        - Longer videos (5+ minutes) provide more clip opportunities
        - Videos with engaging content and emotional moments score higher
        - Good audio quality improves transcription accuracy
        - Educational content, podcasts, and interviews work exceptionally well
        """)

        process_btn.click(
            process_video,
            inputs=[input_type, video_input, youtube_input, clip_duration, num_clips, add_subtitles],
            outputs=[status_output, clips_output, info_output]
        )

    return demo
# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
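
# The imports above assume these packages are available in the Space (a possible
# requirements.txt sketch, versions unpinned): gradio, openai-whisper, opencv-python,
# numpy, moviepy, transformers, torch, librosa, textblob, emoji, yt-dlp, requests.
# ffmpeg must also be present on the system, since whisper, moviepy, and yt-dlp
# all rely on it for decoding and encoding media.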