Spaces:

NeuralFalcon
/

parakeet-tdt-0.6b-v2-subtitle

Running

File size: 12,686 Bytes

84c2692

import string
import json
import os

import re
import uuid
from pydub import AudioSegment

# Ensure the 'subtitles' directory exists.
# exist_ok=True already makes this a no-op when the directory is present, so a
# separate os.path.exists() check is redundant (and a check-then-create
# sequence can race with another process creating the directory).
os.makedirs("./subtitles", exist_ok=True)

def clean_file_name(file_path, unique_id=True):
    """
    Build a filesystem-friendly file name from a path.

    The directory part is dropped. In the stem, every run of characters that
    is not an ASCII letter or a digit is replaced by a single underscore, and
    leading/trailing underscores are removed. The extension is kept as-is.

    Args:
        file_path: Path or bare file name to clean (e.g., "/tmp/My Song.mp3")
        unique_id: When True, append "_" plus 6 random hex characters to the
                   stem so repeated calls produce different names

    Returns:
        The cleaned file name including its original extension
        (e.g., "My_Song_1a2b3c.mp3", or "My_Song.mp3" with unique_id=False)
    """
    # Split the base name into stem and extension
    stem, file_extension = os.path.splitext(os.path.basename(file_path))

    # The '+' quantifier already collapses each run (underscores included)
    # into one underscore, so no second "merge underscores" pass is needed.
    safe_stem = re.sub(r'[^a-zA-Z\d]+', '_', stem).strip('_')

    # A stem made only of replaced characters (e.g. "???") would otherwise
    # yield a name consisting of just the extension; use a placeholder stem.
    if not safe_stem:
        safe_stem = "file"

    # Only generate the random suffix when it is actually requested
    if unique_id:
        safe_stem = f"{safe_stem}_{uuid.uuid4().hex[:6]}"

    return f"{safe_stem}{file_extension}"

def convert_to_mono(file_path, output_format="mp3"):
    """
    Convert an audio file to a single channel and export it into ./subtitles.

    Args:
        file_path: Path to the input audio file
        output_format: Format passed to the exporter and used as the output
                       file extension (default: "mp3")

    Returns:
        output_file: Path of the exported mono file,
                     "./subtitles/<cleaned name>.<output_format>"
    """
    # Load the audio (any format supported by ffmpeg/pydub)
    audio = AudioSegment.from_file(file_path)

    # Convert to mono
    mono_audio = audio.set_channels(1)

    # Clean the full name, then drop the extension. Passing an already
    # stripped stem to clean_file_name would make it split on the last dot
    # again, so a name like "Dr. Smith talk.mp3" would keep ". Smith talk"
    # unsanitized as if it were an extension.
    cleaned_stem = os.path.splitext(clean_file_name(file_path))[0]

    # Make sure the target directory is still there at call time
    os.makedirs("./subtitles", exist_ok=True)
    output_file = f"./subtitles/{cleaned_stem}.{output_format}"

    # Export the mono audio
    mono_audio.export(output_file, format=output_format)
    return output_file

def format_srt_time(seconds):
    """
    Format a time offset in seconds as an SRT timestamp "HH:MM:SS,mmm".

    Args:
        seconds: Time offset in seconds (int or float). Negative values are
                 clamped to zero.

    Returns:
        The timestamp string, e.g. 3661.5 -> "01:01:01,500"
    """
    # Work in whole milliseconds. Rounding (instead of truncating the
    # fractional part) avoids float artifacts such as 1.001 being rendered
    # as ",000", and divmod keeps carries consistent across all fields.
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_sec, millisec = divmod(total_ms, 1000)
    total_min, sec = divmod(total_sec, 60)
    hours, minutes = divmod(total_min, 60)
    return f"{hours:02}:{minutes:02}:{sec:02},{millisec:03}"

## Word Level SRT File
def write_word_srt(mono_audio_path, word_level_timestamps, skip_punctuation=True):
    """
    Write an SRT file with one subtitle entry per word.

    Args:
        mono_audio_path: Path to the audio file; the SRT is written next to it
                         as "<stem>_word_level.srt"
        word_level_timestamps: List of dictionaries with 'word', 'start' and
                               'end' keys
        skip_punctuation: When True, entries whose word consists only of
                          punctuation characters are left out

    Returns:
        output_file: Path to the generated SRT file
    """
    # Build the output path from the stem. Using str.replace() on the
    # extension would touch every occurrence of it in the path and, for a
    # path without extension, insert the suffix between all characters.
    output_file = os.path.splitext(mono_audio_path)[0] + "_word_level.srt"

    with open(output_file, "w", encoding="utf-8") as f:
        index = 1

        for entry in word_level_timestamps:
            word = entry["word"]

            if skip_punctuation and all(c in string.punctuation for c in word):
                continue

            start_srt = format_srt_time(entry["start"])
            end_srt = format_srt_time(entry["end"])

            f.write(f"{index}\n{start_srt} --> {end_srt}\n{word}\n\n")
            # Only written entries are numbered, so indices stay contiguous
            index += 1
    return output_file


## Speech To text File
def write_words_to_txt(mono_audio_path, word_level_timestamps):
    """
    Join the recognized words into a plain-text transcript and save it.

    Entries whose word consists only of punctuation characters are dropped;
    the remaining words are joined with single spaces.

    Args:
        mono_audio_path: Path to the audio file; the transcript is written
                         next to it as "<stem>.txt"
        word_level_timestamps: List of dictionaries with a 'word' key

    Returns:
        text: The transcript string
        output_file: Path to the generated text file
    """
    # Build the output path from the stem. Using str.replace() on the
    # extension would touch every occurrence of it in the path and, for a
    # path without extension, insert the suffix between all characters.
    output_file = os.path.splitext(mono_audio_path)[0] + ".txt"

    words = [
        entry["word"]
        for entry in word_level_timestamps
        if not all(c in string.punctuation for c in entry["word"])
    ]
    text = " ".join(words)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(text)

    return text, output_file


## Sentence Level Srt File
def generate_professional_subtitles(mono_audio_path, word_timestamps, max_words_per_subtitle=8, max_subtitle_duration=5.0, min_pause_for_split=0.5):
    """
    Groups word timestamps into sentence-level subtitles and saves them as SRT.

    A subtitle is closed:
    - right after a word that ends a sentence (ends with '.', '?' or '!')
    - before a word that follows a pause longer than min_pause_for_split
    - before a word once the subtitle already holds max_words_per_subtitle words
    - before a word that would stretch the subtitle beyond max_subtitle_duration

    Args:
        mono_audio_path: Path to the audio file; the SRT is written next to it
                         as "<stem>.srt"
        word_timestamps: List of dictionaries with 'word', 'start' and 'end' keys
        max_words_per_subtitle: Maximum number of words in one subtitle
        max_subtitle_duration: Maximum span of one subtitle, in the same unit
                               as the timestamps
        min_pause_for_split: Gap between two words above which a new subtitle
                             is started

    Returns:
        output_file: Path to the generated SRT file
        subtitles: List of subtitle dictionaries with text/start/end
    """
    # Build the output path from the stem. Using str.replace() on the
    # extension would touch every occurrence of it in the path and, for a
    # path without extension, insert the suffix between all characters.
    output_file = os.path.splitext(mono_audio_path)[0] + ".srt"

    subtitles = []
    words = []        # words of the subtitle currently being built
    sub_start = None  # start time of its first word
    sub_end = None    # end time of its most recent word

    def close_subtitle():
        # Store the subtitle being built (if it has any text) and start over
        text = " ".join(words).strip()
        if text:
            subtitles.append({"text": text, "start": sub_start, "end": sub_end})
        words.clear()

    for word_data in word_timestamps:
        word = word_data['word']
        word_start = word_data['start']
        word_end = word_data['end']

        # Constraints that force a break BEFORE this word is added
        if words:
            has_pause = word_start - sub_end > min_pause_for_split
            is_full = len(words) >= max_words_per_subtitle
            too_long = word_end - sub_start > max_subtitle_duration
            if has_pause or is_full or too_long:
                close_subtitle()

        # Add current word to subtitle
        if not words:
            sub_start = word_start
        words.append(word)
        sub_end = word_end

        # A sentence-ending word belongs to the subtitle it finishes, so the
        # break happens AFTER adding it (breaking before it would push e.g.
        # "world." to the front of the next subtitle).
        if word.endswith(('.', '?', '!')):
            close_subtitle()

    # Add last subtitle if exists
    if words:
        close_subtitle()

    # Write to SRT file
    with open(output_file, "w", encoding="utf-8") as f:
        for i, sub in enumerate(subtitles, 1):
            f.write(f"{i}\n")
            f.write(f"{format_srt_time(sub['start'])} --> {format_srt_time(sub['end'])}\n")
            f.write(f"{sub['text']}\n\n")

    return output_file, subtitles


## For vertical Videos
def for_yt_shorts(mono_audio_path, word_timestamps, min_silence_between_words=0.3, max_characters_per_subtitle=17):
    """
    Generates short subtitles for vertical videos (Shorts/Reels) and saves them as SRT.

    - Merges a word with any following fragments that start with "-"; the
      fragments' leading hyphens are dropped (e.g., "co" + "-worker" → "coworker")
    - Starts a new subtitle when adding the word (plus a separating space)
      would exceed max_characters_per_subtitle; a single word longer than the
      limit still gets a subtitle of its own
    - Starts a new subtitle when the gap between the previous word's end and
      this word's start is greater than min_silence_between_words

    Args:
        mono_audio_path: Path to the audio file; the SRT is written next to it
                         as "<stem>_shorts.srt"
        word_timestamps: List of dictionaries with 'word', 'start' and 'end' keys
        min_silence_between_words: Gap between two words above which a new
                                   subtitle is started
        max_characters_per_subtitle: Maximum subtitle length in characters

    Returns:
        output_file: Path to generated SRT file
        subtitles: List of subtitle dictionaries (text/start/end)
    """
    subtitles = []
    current_sub = {
        "text": "",
        "start": None,
        "end": None,
        "char_count": 0
    }

    # Build the output path from the stem. Using str.replace() on the
    # extension would touch every occurrence of it in the path and, for a
    # path without extension, insert the suffix between all characters.
    output_file = os.path.splitext(mono_audio_path)[0] + "_shorts.srt"

    i = 0
    total_words = len(word_timestamps)
    while i < total_words:
        # Process current word and any hyphenated continuations
        full_word = word_timestamps[i]['word']
        start_time = word_timestamps[i]['start']
        end_time = word_timestamps[i]['end']

        # Merge following fragments that start with "-" into this word
        while (i + 1 < total_words and
               word_timestamps[i+1]['word'].startswith('-')):
            full_word += word_timestamps[i+1]['word'].lstrip('-')
            end_time = word_timestamps[i+1]['end']
            i += 1

        # Length of the subtitle if this word were appended to it
        new_char_count = current_sub["char_count"] + len(full_word) + (1 if current_sub["text"] else 0)

        # The pause is measured from the start of the merged word (start_time).
        # `i` may already point at the last merged fragment, whose own start
        # lies later and would overstate the gap.
        needs_break = (
            new_char_count > max_characters_per_subtitle or
            (current_sub["end"] is not None and
             start_time - current_sub["end"] > min_silence_between_words)
        )

        if needs_break and current_sub["text"]:
            # Finalize current subtitle
            subtitles.append({
                "text": current_sub["text"].strip(),
                "start": current_sub["start"],
                "end": current_sub["end"]
            })
            # Start new subtitle with this word
            current_sub = {
                "text": full_word,
                "start": start_time,
                "end": end_time,
                "char_count": len(full_word)
            }
        else:
            # Add to current subtitle
            if current_sub["text"]:
                current_sub["text"] += " " + full_word
                current_sub["char_count"] += 1 + len(full_word)  # Space + word
            else:
                current_sub["text"] = full_word
                current_sub["start"] = start_time
                current_sub["char_count"] = len(full_word)
            current_sub["end"] = end_time

        i += 1

    # Add final subtitle if exists
    if current_sub["text"]:
        subtitles.append({
            "text": current_sub["text"].strip(),
            "start": current_sub["start"],
            "end": current_sub["end"]
        })

    # Write SRT file
    with open(output_file, "w", encoding="utf-8") as f:
        for idx, sub in enumerate(subtitles, 1):
            f.write(f"{idx}\n")
            f.write(f"{format_srt_time(sub['start'])} --> {format_srt_time(sub['end'])}\n")
            f.write(f"{sub['text']}\n\n")

    return output_file, subtitles



## Save word level timestamp for later use if you are a developer 
def word_timestamp_json(mono_audio_path, word_timestamps):
    """
    Save word timestamps as a JSON file next to the audio file.

    Args:
        mono_audio_path: Path to the audio file (e.g., "audio.wav"); the JSON
                         is written as "<stem>_word_timestamps.json"
        word_timestamps: List of word timestamp dictionaries

    Returns:
        output_file: Path to the generated JSON file
    """
    # Build the output path from the stem. Using str.replace() on the
    # extension would touch every occurrence of it in the path and, for a
    # path without extension, insert the suffix between all characters.
    output_file = os.path.splitext(mono_audio_path)[0] + "_word_timestamps.json"

    # Save as JSON with pretty formatting; ensure_ascii=False keeps
    # non-ASCII words readable instead of \u-escaping them
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(word_timestamps, f, indent=2, ensure_ascii=False)

    return output_file

## save all files 
def save_files(mono_audio_path, word_timestamps):
    """
    Writes every subtitle/transcript variant for one audio file and returns their paths.

    Files produced, in this order:
    1. Sentence-level SRT (8 words / 5.0 duration / 0.5 pause limits)
    2. Word-level SRT (one entry per word)
    3. Vertical-video SRT (0.3 pause / 17 character limits)
    4. Plain-text transcript
    5. JSON dump of the word timestamps

    Args:
        mono_audio_path: Path to the audio file; each helper derives its
                         output path from it
        word_timestamps: List of dictionaries containing word-level timestamps
                        [{'word': str, 'start': float, 'end': float}, ...]

    Returns:
        Six separate values in this order:
        default_srt_path:       # Sentence-level subtitles
        word_level_srt_path:    # Single-word segments
        shorts_srt_path:        # Vertical video optimized
        speech_text_path:       # Plain text transcript file
        timestamps_json_path:   # Raw timestamp data file
        text:                   # Transcript text string
    """
    # Sentence-level subtitles; only the path of the (path, subtitles) pair is kept
    sentence_result = generate_professional_subtitles(
        mono_audio_path,
        word_timestamps,
        max_words_per_subtitle=8,
        max_subtitle_duration=5.0,
        min_pause_for_split=0.5,
    )
    default_srt_path = sentence_result[0]

    # One SRT entry per word
    word_level_srt_path = write_word_srt(mono_audio_path, word_timestamps)

    # Short subtitles for vertical videos; again only the path is kept
    shorts_result = for_yt_shorts(
        mono_audio_path,
        word_timestamps,
        min_silence_between_words=0.3,
        max_characters_per_subtitle=17,
    )
    shorts_srt_path = shorts_result[0]

    # Transcript string plus the file it was saved to
    transcript = write_words_to_txt(mono_audio_path, word_timestamps)
    text = transcript[0]
    speech_text_path = transcript[1]

    # Raw timestamps for later reuse
    timestamps_json_path = word_timestamp_json(mono_audio_path, word_timestamps)

    return (
        default_srt_path,
        word_level_srt_path,
        shorts_srt_path,
        speech_text_path,
        timestamps_json_path,
        text,
    )