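"""Gradio web UI for automatic speech recognition (beta).

Wraps the local ``transcribe`` module (openai/whisper-* checkpoints served via
Hugging Face Transformers) in a web app with tabs for single-file
transcription, batch processing of multiple files, and device info. Speaker
diarization is optional and requires a Hugging Face access token for the
pyannote models.
"""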
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from transcribe.transcribe import transcriber, languages
import gradio as gr
import torch
import torchaudio
import torch.cuda as cuda
import platform
from transformers import __version__ as transformers_version
from dotenv import load_dotenv
import shutil
from docx import Document
import logging
import subprocess
load_dotenv(override=True)

logging.basicConfig(level=logging.INFO)

HF_AUTH_TOKEN = os.getenv("HF_AUTH_TOKEN")

device = "cuda" if torch.cuda.is_available() else "cpu"
num_gpus = cuda.device_count() if torch.cuda.is_available() else 0
cuda_version = torch.version.cuda if torch.cuda.is_available() else "N/A"
cudnn_version = torch.backends.cudnn.version() if torch.cuda.is_available() else "N/A"
os_info = platform.system() + " " + platform.release() + " " + platform.machine()

# Get the available VRAM for each GPU (if available)
vram_info = []
if torch.cuda.is_available():
    for i in range(cuda.device_count()):
        gpu_properties = cuda.get_device_properties(i)
        vram_info.append(f"**GPU {i}: {gpu_properties.total_memory / 1024**3:.2f} GB**")

pytorch_version = torch.__version__
torchaudio_version = torchaudio.__version__

device_info = f"""Running on: **{device}**

    Number of GPUs available: **{num_gpus}**

    CUDA version: **{cuda_version}**

    CuDNN version: **{cudnn_version}**

    PyTorch version: **{pytorch_version}**

    Torchaudio version: **{torchaudio_version}**

    Transformers version: **{transformers_version}**

    Operating system: **{os_info}**

    Available VRAM: 
    \t {', '.join(vram_info) if vram_info else '**N/A**'}
"""

css = """
#audio_input {
    padding-bottom: 50px;
}
"""

def format_srt_time(timestamp):
    """Formats the timestamp into SRT time format."""
    hours, remainder = divmod(timestamp, 3600)
    minutes, seconds = divmod(remainder, 60)
    milliseconds = int((seconds - int(seconds)) * 1000)
    return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02},{milliseconds:03}"

def generate_srt_content(chunks):
    """Generates the content for an SRT file based on transcription chunks."""
    srt_content = ""
    for i, chunk in enumerate(chunks, start=1):
        try:
            start, end = chunk["timestamp"]
            start_time = format_srt_time(start)
            end_time = format_srt_time(end)
            text = chunk["text"]
            srt_content += f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
        except (KeyError, TypeError, ValueError) as exc:
            logging.warning(f"Skipping chunk {i}: could not format its timestamp or text ({exc})")
            continue
    return srt_content.strip()
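
# generate_srt_content assumes chunks shaped like the timestamped output of the
# Hugging Face ASR pipeline, e.g. {"timestamp": (0.0, 2.4), "text": "Hello world"};
# a chunk with a missing or malformed timestamp is skipped with a warning.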

def create_black_screen_video(audio_file_path, output_video_path):
    """
    Creates a video with an empty black screen and the original audio from the input audio file.

    Parameters:
    - audio_file_path: Path to the input audio file.
    - output_video_path: Path where the output video will be saved.
    """
    # Create the output directory if needed (a bare filename has no directory part)
    output_dir = os.path.dirname(output_video_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Construct the ffmpeg command
    command = [
        'ffmpeg',
        '-y',  # Overwrite output file if it exists
        '-f', 'lavfi',  # Read the synthetic input via libavfilter
        '-i', 'color=c=black:s=320x240:r=10',  # Generate a black frame at 320x240 resolution, 10 fps
        '-i', audio_file_path,  # The input audio file
        '-c:v', 'libx264',  # Video codec to use
        '-tune', 'stillimage',  # Optimize for still image
        '-c:a', 'aac',  # Audio codec to use
        '-b:a', '192k',  # Audio bitrate
        '-shortest',  # Finish encoding when the shortest input stream ends
        output_video_path  # The output video file path
    ]

    # Execute the command
    subprocess.run(command, check=True)
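
# Example usage (paths are illustrative):
#   create_black_screen_video("./tmp/interview.mp3", "./tmp/interview.mp4")
# Requires the ffmpeg binary to be available on the PATH.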


def process_folder(files_source, model, language, translate, diarize, diarization_token):
    output_folder_path = "./tmp"
    os.makedirs(output_folder_path, exist_ok=True)

    for file_path in files_source:
        # Process only supported audio/video files (case-insensitive extension check)
        if file_path.lower().endswith(('.mp3', '.mp4', '.wav')):
 
            file_name = os.path.basename(file_path)

            # Copy the original audio file to the output folder
            output_audio_filepath = os.path.join(output_folder_path, file_name)
            shutil.copy2(file_path, output_audio_filepath)

            # output_filename_base = os.path.splitext(file_name)[0]
            # output_word_filepath = os.path.join(output_folder_path, output_filename_base + ".docx")
            # output_srt_filepath = os.path.join(output_folder_path, output_filename_base + ".srt")
            # output_summary_filepath = os.path.join(output_folder_path, output_filename_base + "_summary.docx")
            # output_video_filepath = os.path.join(output_folder_path, output_filename_base + ".mp4")
            # output_audio_filepath = os.path.join(output_folder_path, file_name)

            # # Skip processing if any of the output files already exist
            # if os.path.exists(output_word_filepath) and os.path.exists(output_srt_filepath) and os.path.exists(output_summary_filepath) and os.path.exists(output_video_filepath) and os.path.exists(output_audio_filepath):
            #     print(f"Skipping {file_name} as output files already exist.")
            #     continue
                    
            # Use the transcriber function to transcribe the audio file
            transcription_result = transcriber(file_path, model, language=language, translate=translate, diarize=diarize, input_diarization_token=diarization_token)
            print(transcription_result)
    
            # transcribed_text = transcription_result["text"]
            # chunks = transcription_result.get("chunks", [])

            # # Create a new Word document with the transcribed text
            # doc = Document()
            # for chunk in chunks:
            #     doc.add_paragraph(chunk["text"])
            # output_filename_base = os.path.splitext(file_name)[0]
            # output_word_filepath = os.path.join(output_folder_path, output_filename_base + ".docx")
            # doc.save(output_word_filepath)
            # print(f"Transcription saved to {output_word_filepath}")

            # # Create an SRT file with subtitles if chunks are available
            # if chunks:
            #     srt_content = generate_srt_content(chunks)
            #     output_srt_filepath = os.path.join(output_folder_path, output_filename_base + ".srt")
            #     with open(output_srt_filepath, "w", encoding='utf-8') as srt_file:
            #         srt_file.write(srt_content)
            #     print(f"Subtitles saved to {output_srt_filepath}")

            # # Generate and save the summary
            # output_summary_filepath = os.path.join(output_folder_path, output_filename_base + "_summary.docx")

            # # Create an empty (black-screen) video for audio-only inputs
            # if file_name.endswith(('.mp3', '.wav')):
            #     create_black_screen_video(file_path, os.path.join(output_folder_path, output_filename_base + ".mp4"))
                
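# Example call (values are illustrative; the token can also come from the
# HF_AUTH_TOKEN environment variable):
#   process_folder(["./audio/meeting.mp3"], "openai/whisper-tiny",
#                  "Automatic detection", translate=False, diarize=False,
#                  diarization_token=HF_AUTH_TOKEN)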
    


# def inference(input, diarize, num_speakers:int, strict, lan, trans, progress=gr.Progress()):
def inference(audio_input, model, language, translate, diarize, input_diarization_token):
    """Transcribe a single uploaded/recorded file and update the output textbox."""
    tr = transcriber(audio_input, model, language, translate, diarize, input_diarization_token)
    return {textbox: gr.update(value=tr)}
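
# Returning a dict keyed by output components ({textbox: gr.update(...)}) lets
# a Gradio event handler update specific output components selectively.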

with gr.Blocks(title="Automatic speech recognition (beta)", css=css, analytics_enabled=False) as demo:
    with gr.Row():
        gr.Markdown(
            """
                # Automatic speech recognition (beta)

                [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) 
                
                ![Python 3.10](https://raw.githubusercontent.com/tools4eu/automatic-speech-recognition/main/badges/python3_10.svg)

                Report issues [here](https://github.com/tools4eu/automatic-speech-recognition/issues)
            """

        )
    

    with gr.Tab("Upload/record sound"):
        with gr.Column():
            dropdown_model = gr.Dropdown(
                    label='Model', 
                    choices = ["openai/whisper-large-v3", "openai/whisper-medium", "openai/whisper-small", "openai/whisper-tiny"],
                    value="openai/whisper-large-v3", 
                    info="""
                        Larger models will increase the quality of the transcription, but reduce performance.
                    """)
        with gr.Row():
            with gr.Column():
                upl_input = gr.Audio(type='filepath', elem_id="audio_input")
                upl_language = gr.Dropdown(
                    label='Language', 
                    choices = ['Automatic detection']+sorted(list(languages.keys())), 
                    value='Automatic detection', 
                    info="""
                        Setting the language to "Automatic detection" will auto-detect the language based on the first 30 seconds.
                        If the language is known upfront, always set it manually.
                    """)

        with gr.Row():
            upl_translate = gr.Checkbox(label='Translate to English')

        with gr.Column():
            with gr.Group():
                input_diarization_token = gr.Textbox(label='Paste your Hugging Face token here for speaker diarization (or add it as an environment variable)', value=HF_AUTH_TOKEN)
                check_diarization = gr.Checkbox(label='Speaker diarization')
                with gr.Accordion("For more details click here...", open=False):
                    gr.Markdown("""
                                    An access token can be created [here](https://hf.co/settings/tokens)
                                
                                    If not done yet for your account, you need to [accept segmentation terms & conditions](https://huggingface.co/pyannote/segmentation-3.0)
                                
                                    If not done yet for your account, you need to [accept diarization terms & conditions](https://huggingface.co/pyannote/speaker-diarization-3.1)
                                """)
        
        with gr.Row():
            upl_btn = gr.Button("Transcribe")
        
        with gr.Row(variant='panel'):
            with gr.Column():
                textbox = gr.Textbox(label='Transcription', visible=True)

    with gr.Tab("Process multiple files"):
        files_source=gr.Files(label="Select Audio Files", file_count="multiple")
        with gr.Column():
            dropdown_model_multi = gr.Dropdown(
                    label='Model', 
                    choices = ["openai/whisper-large-v3", "openai/whisper-medium", "openai/whisper-small", "openai/whisper-tiny"],
                    value="openai/whisper-large-v3", 
                    info="""
                        Larger models will increase the quality of the transcription, but reduce performance.
                    """)
        dropdown_lang_multi = gr.Dropdown(
                    label='Language', 
                    choices = ['Automatic detection']+sorted(list(languages.keys())), 
                    value='Automatic detection', 
                    info="""
                        Setting the language to "Automatic detection" will auto-detect the language based on the first 30 seconds.
                        If the language is known upfront, always set it manually.
                    """)
        checkbox_trans_multi = gr.Checkbox(label='Translate to English')
        with gr.Column():
            with gr.Group():
                input_diarization_token_multi = gr.Textbox(label='Paste your Hugging Face token here for speaker diarization (or add it as an environment variable)', value=HF_AUTH_TOKEN)
                check_diarization_multi = gr.Checkbox(label='Speaker diarization')
                with gr.Accordion("For more details click here...", open=False):
                    gr.Markdown("""
                                    An access token can be created [here](https://hf.co/settings/tokens)
                                
                                    If not done yet for your account, you need to [accept segmentation terms & conditions](https://huggingface.co/pyannote/segmentation-3.0)
                                
                                    If not done yet for your account, you need to [accept diarization terms & conditions](https://huggingface.co/pyannote/speaker-diarization-3.1)
                                """)
        btn_transcribe_multi = gr.Button("Transcribe")
        textbox_transcribe_multi = gr.Chatbot(label='Transcription', visible=True)

    with gr.Tab("Device info"):
        gr.Markdown(device_info, label="Hardware info & installed packages")
        # gr.Markdown(device_info, label="Hardware info & installed packages", lines=len(device_info.split("\n")), container=False)

    transcribe_event = upl_btn.click(fn=inference, inputs=[upl_input, dropdown_model, upl_language, upl_translate, check_diarization, input_diarization_token], outputs=[textbox], concurrency_limit=1)
    # transcribe_files_event = btn_transcribe_folder.click(fn=process_folder, inputs=[files_source, dropdown_lang_multi, checkbox_trans_multi, input_diarization_token], outputs=[textbox_transcribe_folder], concurrency_limit=1)
    # Note: input order must match process_folder(files, model, language, translate, diarize, token)
    transcribe_files_event = btn_transcribe_multi.click(fn=process_folder, inputs=[files_source, dropdown_model_multi, dropdown_lang_multi, checkbox_trans_multi, check_diarization_multi, input_diarization_token_multi], outputs=[], concurrency_limit=1)

demo.queue().launch(server_name="0.0.0.0")