File size: 2,110 Bytes
3597c88
 
 
 
 
 
 
 
 
 
 
 
 
87f602f
3597c88
87f602f
3597c88
 
87f602f
3733074
 
 
 
 
87f602f
 
 
 
 
 
 
 
 
 
 
3597c88
 
 
87f602f
 
 
3597c88
87f602f
 
 
 
 
 
3597c88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87f602f
3597c88
 
 
 
 
5777262
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import functools
import os

import gradio as gr
from pydub import AudioSegment

def audio_converter(audio_file: str, *, input_format: str = "m4a") -> str:
    """Convert an audio file to WAV and return the path of the WAV file.

    The WAV file is written next to the input, using the same base name with
    a ``.wav`` extension.

    Args:
        audio_file: Path of the audio file to convert.
        input_format: Container format handed to pydub when decoding the
            input. Defaults to ``"m4a"``, the only format this function
            previously accepted.

    Returns:
        The path of the exported ``.wav`` file.
    """
    decoded = AudioSegment.from_file(audio_file, input_format)
    # Swap only the extension so the WAV lands beside the source file.
    base_name, _ = os.path.splitext(audio_file)
    wav_path = f"{base_name}.wav"
    decoded.export(wav_path, 'wav')

    return wav_path

def asr_transcriber(audio_file):
    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
    import torch
    import optimum

    audio_file_wav = audio_converter(audio_file)

    # Check for CUDA availability (GPU)
    if torch.cuda.is_available():
        device_id = torch.device('cuda')
    else:
        device_id = torch.device('cpu')

    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    #device_id = "mps" for Mac only
    #torch_dtype = float16
    flash = False
    ts = True

    #Try to optimize when CPU and float32
    model_id = "openai/whisper-small"

    # Initialize the ASR pipeline
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_id,
        torch_dtype=torch_dtype,
        device=device_id,
    )
    
    if device_id == "mps":
        torch.mps.empty_cache()
    elif not flash:
        pipe.model = pipe.model.to_bettertransformer()
        
    language = None
    task = "transcribe"

    json_output = pipe(
        audio_file_wav, 
        chunk_length_s=30, 
        batch_size=2, 
        generate_kwargs={"task": task, "language": language},
        return_timestamps=ts
    )

    return json_output["text"]

# Build the UI: a greeting, a single-file upload and a textbox that receives
# the transcription.
with gr.Blocks() as transcriberUI:
    gr.Markdown(
    """
    # Ola Xara & Solange!
    Clicar no botao abaixo para selecionar o Audio a ser transcrito!
    Ambiente de Teste: pode demorar um pouco. Nao fiquem nervosos :-)
    """)
    # Extensions in `file_types` need a leading dot; a bare "m4a" is treated
    # as a file category and therefore does not match .m4a uploads.
    inp = gr.File(label="Arquivo de Audio", show_label=True, file_count="single", file_types=[".m4a"])
    transcribe = gr.Textbox(label="Transcricao", show_label=True, show_copy_button=True)
    # Run the transcription as soon as a file finishes uploading.
    inp.upload(asr_transcriber, inp, transcribe)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    transcriberUI.launch()