import subprocess
import sys
# Install a package with pip (pip skips it if the requirement is already satisfied)
def install_package(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
# Install the Rust toolchain if rustc is not available; some dependencies
# (e.g. tokenizers) may need to be compiled from source
def install_rust():
    try:
        subprocess.check_call(["rustc", "--version"])
    except (subprocess.CalledProcessError, FileNotFoundError):
        # The pipe requires a shell; "-s -- -y" runs the rustup installer non-interactively
        subprocess.check_call(
            "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y",
            shell=True,
        )
# List of required packages
required_packages = [
    "transformers==4.10.3",
    "datasets",
    "huggingface-hub>=0.19",
    "hf-transfer>=0.1.4",
    "protobuf<4",
    "click<8.1",
    "pydantic~=1.0",
    "librosa==0.8.1",
    "torch==2.2.0",
    "torchaudio==2.2.0",
    "scipy",
    "Cython==0.29.21",
    "phonemizer==2.2.1",
    "scikit-learn",
    "matplotlib",
    "gradio==3.1.4",
    "sentencepiece",
    "sacremoses",
    "tokenizers==0.10.3",
    "resampy>=0.2.2",
    "numba>=0.43.0",
    "soundfile>=0.10.2",
    "pooch>=1.0",
    "decorator>=3.0.0",
    "joblib>=0.14",
    "audioread>=2.0.0",
]
# Install Rust first so pip can build any source-only packages, then
# install all required packages
install_rust()
for package in required_packages:
    install_package(package)
import gradio as gr
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa
# Load pre-trained model and processor
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
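# Note: facebook/wav2vec2-base-960h was trained on 16 kHz, single-channel
# speech, which is why transcribe() below resamples every input to 16 kHz.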
def transcribe(audio):
    # Load the audio file and resample it to 16 kHz
    audio_input, _ = librosa.load(audio, sr=16000)
    # Convert the waveform into model inputs
    inputs = processor(audio_input, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        # The base-960h feature extractor may not return an attention mask, so pass it only if present
        logits = model(inputs.input_values, attention_mask=inputs.get("attention_mask")).logits
    # Take the most likely token at each time step
    predicted_ids = torch.argmax(logits, dim=-1)
    # Decode the predicted ids to text
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]
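# A minimal sketch of calling transcribe() directly, outside the Gradio UI
# ("sample.wav" is a placeholder path, not a file shipped with this script):
#
#     print(transcribe("sample.wav"))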
# Define the Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
)
if __name__ == "__main__":
    iface.launch()