from transformers import pipeline
from gtts import gTTS
import subprocess
import streamlit as st
import os


# Step 1: Extract audio from the video
def extract_audio_from_video(video_path, audio_path="extracted_audio.mp3"):
    """Extract the audio track of *video_path* into an MP3 file.

    Args:
        video_path: Path to the input video file.
        audio_path: Destination path for the extracted MP3 audio.

    Returns:
        The path of the written audio file (same as *audio_path*).

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    ffmpeg_command = [
        "ffmpeg",
        "-y",                       # overwrite existing output; without this ffmpeg
                                    # prompts on stdin and hangs a non-interactive app
        "-i", video_path,           # Input video
        "-vn",                      # Disable video processing
        "-acodec", "libmp3lame",    # Set audio codec to mp3
        "-ar", "44100",             # Set audio sample rate
        "-ac", "2",                 # Set number of audio channels
        audio_path,
    ]
    # check=True: fail loudly if ffmpeg errors, instead of silently continuing
    # with no output file for the next pipeline step to trip over.
    subprocess.run(ffmpeg_command, check=True)
    print(f"Audio extracted to {audio_path}")
    return audio_path


# Step 2: Extract text from the audio using Hugging Face Transformers (Whisper)
def extract_text_from_audio(audio_path):
    """Transcribe *audio_path* to text with the openai/whisper-base model.

    The transcription is also written to ``video_text.txt`` as a side effect.

    Args:
        audio_path: Path to the audio file to transcribe.

    Returns:
        The transcribed text as a string.
    """
    # Load the ASR pipeline from Hugging Face with a Whisper-like model
    # (downloads the model on first use).
    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base")
    transcription = transcriber(audio_path)
    text = transcription["text"]
    # Save transcribed text to a file (optional). Explicit UTF-8 so non-ASCII
    # transcriptions don't crash on platforms with a different default encoding.
    with open("video_text.txt", "w", encoding="utf-8") as f:
        f.write(text)
    return text


# Step 3: Generate voice-over using gTTS
def generate_voice_over(text, output_audio_path="voice_over.mp3"):
    """Synthesize English speech for *text* and save it as an MP3.

    Args:
        text: The text to speak.
        output_audio_path: Destination path for the generated MP3.

    Returns:
        The path of the written audio file (same as *output_audio_path*).
    """
    tts = gTTS(text=text, lang="en")
    tts.save(output_audio_path)
    print(f"Voice-over saved as {output_audio_path}")
    return output_audio_path


# Step 4: Combine voice-over with original video using FFmpeg
def add_voice_over_to_video(video_path, audio_path,
                            output_video_path="output_video_with_voice.mp4"):
    """Mux the video stream of *video_path* with the audio of *audio_path*.

    The video stream is copied without re-encoding; the original audio track
    is replaced by the new one.

    Args:
        video_path: Path to the source video.
        audio_path: Path to the replacement audio track.
        output_video_path: Destination path for the combined video.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    ffmpeg_command = [
        "ffmpeg",
        "-y",                   # overwrite existing output instead of prompting
        "-i", video_path,
        "-i", audio_path,
        "-c:v", "copy",         # copy video stream as-is (no re-encode)
        "-map", "0:v:0",        # take video from the first input
        "-map", "1:a:0",        # take audio from the second input
        "-shortest",            # Ensure the video ends when the audio ends
        output_video_path,
    ]
    subprocess.run(ffmpeg_command, check=True)
    print(f"Final video with voice-over saved as {output_video_path}")


# Step 5: Run the complete process
def main(video_path):
    """Run the full pipeline: extract audio, transcribe, re-voice, and remux."""
    # Step 1: Extract audio from the video
    audio_path = extract_audio_from_video(video_path)
    # Step 2: Extract text from the audio
    text = extract_text_from_audio(audio_path)
    print("Extracted Text:", text)
    # Step 3: Generate voice-over from extracted text
    voice_over_path = generate_voice_over(text)
    # Step 4: Add voice-over to the video
    add_voice_over_to_video(video_path, voice_over_path)


# Streamlit interface to upload video file
uploaded_file = st.file_uploader("Upload a video file", type=["mp4"])
if uploaded_file is not None:
    # Save the uploaded file as input_video.mp4
    with open("input_video.mp4", "wb") as f:
        f.write(uploaded_file.getbuffer())
    # Call the main function after the video is uploaded
    main("input_video.mp4")