Spaces:
Paused
Paused
File size: 2,878 Bytes
48b9b5d 3ccf873 c7a4f81 3ccf873 48b9b5d 75b7975 48b9b5d 75b7975 48b9b5d 75b7975 48b9b5d 8b4aa8a 75b7975 48b9b5d 3ccf873 48b9b5d c092255 48b9b5d 3ccf873 48b9b5d 3ccf873 48b9b5d 7dc8950 48b9b5d 7dc8950 f5ddb49 48b9b5d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import gradio as gr
import numpy as np
import torch
import transformers
from pathlib import Path
from transformers import pipeline
from transformers.utils import logging
# Logging
# Uncomment the next line to switch the transformers logger to debug verbosity.
# logging.set_verbosity_debug()
logger = logging.get_logger(name="transformers")
# Pipelines
## Automatic Speech Recognition
## https://huggingface.co/docs/transformers/task_summary#automatic-speech-recognition
## Requires ffmpeg to be installed
if torch.cuda.is_available():
    # GPU available: run on the first CUDA device in half precision.
    asr_device, asr_torch_dtype = "cuda:0", torch.float16
else:
    # CPU fallback: stay in single precision.
    asr_device, asr_torch_dtype = "cpu", torch.float32
asr_model = "openai/whisper-tiny"
asr = pipeline(
    task="automatic-speech-recognition",
    model=asr_model,
    device=asr_device,
    torch_dtype=asr_torch_dtype,
)
## Token Classification / Named Entity Recognition
## https://huggingface.co/docs/transformers/task_summary#token-classification
if torch.cuda.is_available():
    tc_device = 0  # index of the first CUDA device
else:
    tc_device = "cpu"
tc_model = "dslim/distilbert-NER"
# "token-classification" is the task also known as "ner".
tc = pipeline(task="token-classification", model=tc_model, device=tc_device)
# ---
# Transformers
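# https://www.gradio.app/main/docs/gradio/audio#behavior
# As output component: expects audio data in any of these formats:
# - a str or pathlib.Path filepath,
# - or a URL to an audio file,
# - or a bytes object (recommended for streaming),
# - or a tuple of (sample rate in Hz, audio data as numpy array).
# NOTE: in this app gr.Audio is used as an *input* component, which passes
# either a str filepath or a (sample rate in Hz, numpy array) tuple,
# depending on the component's `type` setting.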
def transcribe(audio: str | Path | bytes | tuple[int, np.ndarray] | None):
if audio is None:
return "..."
# TODO Manage str/Path
logger.debug("Transcribe")
text = ""
# https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__
# Whisper input format for tuple differ from output provided by gradio audio component
if asr_model.startswith("openai/whisper"):
sampling_rate, raw = audio
# Convert to mono if stereo
if raw.ndim > 1:
raw = raw.mean(axis=1)
raw = raw.astype(np.float32)
raw /= np.max(np.abs(raw))
inputs = {"sampling_rate": sampling_rate, "raw": raw} if type(audio) is tuple else audio
transcript = asr(inputs)
text = transcript['text']
logger.debug("Tokenize:[" + text + "]")
entities = tc(text)
#logger.debug("Classify:[" + entities + "]")
# TODO Add Text Classification for sentiment analysis
return {"text": text, "entities": entities}
# ---
# Gradio
## Interfaces
# https://www.gradio.app/main/docs/gradio/audio
# Audio input: accepts an uploaded file or a microphone recording; the share
# button is turned off.
input_audio = gr.Audio(show_share_button=False, sources=["upload", "microphone"])
## App
# One-function interface: the audio input goes to `transcribe`, whose result
# is shown as highlighted text. Flagging is disabled.
gradio_app = gr.Interface(
    fn=transcribe,
    inputs=[input_audio],
    outputs=[gr.HighlightedText()],
    title="ASRNERSBX",
    description="Transcribe, Tokenize, Classify",
    flagging_mode="never",
)
## Start!
# Launch the Gradio app. The stray trailing "|" on the original line has been
# removed because it made the statement a syntax error.
gradio_app.launch()