File size: 2,878 Bytes
48b9b5d
3ccf873
 
 
c7a4f81
3ccf873
 
 
 
 
 
 
48b9b5d
 
 
 
 
 
75b7975
 
48b9b5d
 
 
 
75b7975
 
48b9b5d
 
 
 
75b7975
48b9b5d
 
 
8b4aa8a
75b7975
48b9b5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ccf873
 
48b9b5d
 
 
 
c092255
 
 
 
 
 
 
 
 
 
48b9b5d
 
 
3ccf873
 
48b9b5d
3ccf873
 
 
48b9b5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7dc8950
48b9b5d
 
7dc8950
f5ddb49
48b9b5d
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import gradio as gr
import numpy as np
import torch
import transformers
from pathlib import Path
from transformers import pipeline
from transformers.utils import logging

# Log

# `logging` here is transformers.utils.logging (see imports), not the stdlib module.
# Uncomment the next line to switch it to debug verbosity
# (presumably needed for the logger.debug() calls in this file to be emitted - TODO confirm).
#logging.set_verbosity_debug()
# Logger registered under the "transformers" name; used for this script's debug messages.
logger = logging.get_logger("transformers")

# Pipelines

## Automatic Speech Recognition
## https://huggingface.co/docs/transformers/task_summary#automatic-speech-recognition
## Requires ffmpeg to be installed
asr_model = "openai/whisper-tiny"
if torch.cuda.is_available():
    # GPU present: run on the first CUDA device in half precision
    asr_device, asr_torch_dtype = "cuda:0", torch.float16
else:
    # CPU fallback: full precision
    asr_device, asr_torch_dtype = "cpu", torch.float32
asr = pipeline(
    task="automatic-speech-recognition",
    model=asr_model,
    device=asr_device,
    torch_dtype=asr_torch_dtype,
)

## Token Classification / Named Entity Recognition
## https://huggingface.co/docs/transformers/task_summary#token-classification
tc_model = "dslim/distilbert-NER"
if torch.cuda.is_available():
    # GPU present: use CUDA device index 0
    tc_device = 0
else:
    tc_device = "cpu"
# "token-classification" is the task also known as "ner"
tc = pipeline(task="token-classification", model=tc_model, device=tc_device)

# ---

# Transformers

# https://www.gradio.app/main/docs/gradio/audio#behavior
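# As output component: expects audio data in any of these formats:
# - a str or pathlib.Path filepath
# - or URL to an audio file,
# - or a bytes object (recommended for streaming),
# - or a tuple of (sample rate in Hz, audio data as numpy array)
# Note: in this app gr.Audio is used as an *input* component, which passes
# either a str filepath or a tuple of (sample rate in Hz, audio data as numpy
# array) to the function, depending on its `type` parameter.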
def transcribe(audio: str | Path | bytes | tuple[int, np.ndarray] | None):
    if audio is None:
        return "..."
    # TODO Manage str/Path

    logger.debug("Transcribe")

    text = ""
    # https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__
    # Whisper input format for tuple differ from output provided by gradio audio component
    if asr_model.startswith("openai/whisper"):
        sampling_rate, raw = audio

        # Convert to mono if stereo
        if raw.ndim > 1:
            raw = raw.mean(axis=1)

        raw = raw.astype(np.float32)
        raw /= np.max(np.abs(raw))

        inputs = {"sampling_rate": sampling_rate, "raw": raw} if type(audio) is tuple else audio
        transcript = asr(inputs)
        text = transcript['text']

    logger.debug("Tokenize:[" + text + "]")

    entities = tc(text)

    #logger.debug("Classify:[" + entities + "]")

    # TODO Add Text Classification for sentiment analysis
    return {"text": text, "entities": entities}

# ---

# Gradio

## Interfaces

# https://www.gradio.app/main/docs/gradio/audio
# Audio may be uploaded as a file or recorded with the microphone.
input_audio = gr.Audio(show_share_button=False, sources=["upload", "microphone"])

## App

# Single-function UI: audio in, highlighted transcript out; flagging disabled.
gradio_app = gr.Interface(
    fn=transcribe,
    inputs=[input_audio],
    outputs=[gr.HighlightedText()],
    title="ASRNERSBX",
    description="Transcribe, Tokenize, Classify",
    flagging_mode="never",
)

## Start!
gradio_app.launch()