File size: 2,182 Bytes
b357c71
8553d54
dfc9440
b357c71
 
 
 
 
29ca1ce
5398a1d
b357c71
dfc9440
 
 
 
 
b357c71
9aedf57
 
 
 
 
 
 
 
 
 
7fb921e
9aedf57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b357c71
9aedf57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b357c71
 
9aedf57
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os

from transformers import WhisperTokenizer, pipeline
import gradio as gr
import torch

# Tokenizer is loaded from the base Whisper checkpoint and handed to the
# pipeline below instead of whatever tokenizer ships with the model repo.
# (Previously tried extra arguments: language="marathi", task="transcribe".)
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small")

# Speech-recognition pipeline used by the transcription callback.
# Earlier checkpoints tried here: "thak123/whisper-small-LDC-V1",
# "thak123/whisper-small-gom". Swap the model id to serve a different one.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="thak123/gom-stt-v3",
    tokenizer=tokenizer,
)

# pipe.model.config.forced_decoder_ids = (
#         pipe.tokenizer.get_decoder_prompt_ids(
#             language="marathi", task="transcribe"
#         )
#     )

def transcribe_speech(filepath):
    """Transcribe an audio file with the module-level ASR pipeline.

    Args:
        filepath: Path to the audio file to transcribe. ``None`` or an
            empty string is treated as "no audio provided".

    Returns:
        The ``"text"`` entry of the pipeline output, or ``""`` when no
        audio was provided.
    """
    # Presumably the UI hands over None when the user submits without
    # recording or uploading anything; return an empty transcript instead
    # of letting the pipeline fail on a missing input.
    if not filepath:
        return ""

    # NOTE(review): confirm that `padding` is a supported call argument for
    # the ASR pipeline in the installed transformers version.
    output = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs={
            "task": "transcribe",
            "language": "konkani",
        },  # update with the language you've fine-tuned on
        chunk_length_s=30,  # process long audio in 30-second chunks
        batch_size=8,
        padding=True,
    )
    return output["text"]


# Two standalone interfaces that share the same transcription callback:
# one fed by the microphone, one by an uploaded file. Both hand the audio
# to the callback as a file path.
mic_transcribe = gr.Interface(
    inputs=gr.Audio(type="filepath", sources="microphone"),
    outputs=gr.Textbox(),
    fn=transcribe_speech,
)

file_transcribe = gr.Interface(
    inputs=gr.Audio(type="filepath", sources="upload"),
    outputs=gr.Textbox(),
    fn=transcribe_speech,
)

# Wrap both interfaces in a single tabbed app.
with gr.Blocks() as demo:
    gr.TabbedInterface(
        interface_list=[mic_transcribe, file_transcribe],
        tab_names=["Transcribe Microphone", "Transcribe Audio File"],
    )

# Start the web server; debug=True is passed through unchanged.
demo.launch(debug=True)

# def transcribe(audio):
#     # text = pipe(audio)["text"]
#     # pipe(audio)
#     text = pipe(audio)
#     print("op",text)
#     return text#pipe(audio) #text

# iface = gr.Interface(
#     fn=transcribe, 
#     inputs=[gr.Audio(sources=["microphone", "upload"])], 
#     outputs="text",
#     examples=[
#         [os.path.join(os.path.dirname("."),"audio/chalyaami.mp3")],
#         [os.path.join(os.path.dirname("."),"audio/ekdonteen.flac")],
#         [os.path.join(os.path.dirname("."),"audio/heyatachadjaale.mp3")],
#     ],
#     title="Whisper Konkani",
#     description="Realtime demo for Konkani speech recognition using a fine-tuned Whisper small model.",
# )


# iface.launch()