import subprocess
import sys


def install_package(package):
    # Install a single package into the current interpreter's environment via pip.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


def install_rust():
    # Install the Rust toolchain via rustup if rustc is not already on the PATH.
    try:
        subprocess.check_call(["rustc", "--version"])
    except (subprocess.CalledProcessError, FileNotFoundError):
        # A "|" inside an argument list is passed to curl as a literal argument rather
        # than creating a pipeline, so run the rustup installer through a shell instead.
        subprocess.check_call("curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh", shell=True)


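# Runtime dependencies installed at startup, before the model and UI imports below.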
required_packages = [
    "transformers==4.10.3",
    "datasets",
    "huggingface-hub>=0.19",
    "hf-transfer>=0.1.4",
    "protobuf<4",
    "click<8.1",
    "pydantic~=1.0",
    "librosa==0.8.1",
    "torch==2.2.0",
    "torchaudio==2.2.0",
    "scipy",
    "Cython==0.29.21",
    "phonemizer==2.2.1",
    "scikit-learn",
    "matplotlib",
    "gradio==3.1.4",
    "sentencepiece",
    "sacremoses",
    "tokenizers==0.10.3",
    "resampy>=0.2.2",
    "numba>=0.43.0",
    "soundfile>=0.10.2",
    "pooch>=1.0",
    "decorator>=3.0.0",
    "joblib>=0.14",
    "audioread>=2.0.0",
]

for package in required_packages:
    install_package(package)


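# Import the heavy runtime dependencies only after pip has installed them.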
import gradio as gr
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa

# Load the pretrained English ASR checkpoint and its processor (feature extractor + tokenizer).
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)


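# Transcribe a recorded audio file to text with greedy CTC decoding.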
def transcribe(audio):
    # Resample the recording to the 16 kHz rate the model was trained on.
    audio_input, _ = librosa.load(audio, sr=16000)

    inputs = processor(audio_input, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        # The base-960h processor may not return an attention mask, so look it up safely.
        logits = model(inputs.input_values, attention_mask=inputs.get("attention_mask")).logits

    # Pick the most likely token at each time step and collapse the sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]


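# Gradio UI: record from the microphone and hand the temporary file path to transcribe().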
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
)

if __name__ == "__main__":
    install_rust()
    iface.launch()