# Hugging Face Space — ASR Moore Live (runs on ZeroGPU hardware).
import logging
import os
import tempfile
import uuid
from datetime import datetime, timezone

import gradio as gr
import librosa
import soundfile as sf
import spaces
import torch
import torchaudio
from datasets import Audio, Dataset, DatasetDict, DownloadConfig, concatenate_datasets, load_dataset
from huggingface_hub import HfApi, login
from resemble_enhance.enhancer.inference import denoise, enhance
from transformers import pipeline
# Configure logging for the whole module.
logging.basicConfig(
    format="%(asctime)s — %(levelname)s — %(message)s",
    level=logging.INFO
)
logger = logging.getLogger(__name__)
# Constants
HF_TOKEN = os.getenv("HF_TOKEN")  # hub token; required by every dataset/model operation below
if not HF_TOKEN:
    logger.error("Hugging Face token not found. Please set HF_TOKEN environment variable.")
    raise SystemExit  # abort at import time: nothing below can work without a token
CURRENT_DATASET = "sawadogosalif/Sachi_demo_dataset"  # hub dataset collecting the recordings
SAMPLE_RATE = 16_000  # Hz; NOTE(review): defined but never read in this file — confirm intended use
ASR_MODEL = "sawadogosalif/SaChi-ASR"  # hub repo id of the ASR model
# Authenticate with Hugging Face (module-level side effect at import time)
login(token=HF_TOKEN)
api = HfApi(token=HF_TOKEN)  # reusable hub client
def get_or_create_dataset(dataset_name: str) -> Dataset:
    """
    Fetch the 'train' split of *dataset_name* from the Hugging Face hub.

    If loading fails (typically because the dataset does not exist yet),
    push a new empty dataset with the expected columns and return it.
    """
    try:
        ds = load_dataset(
            dataset_name,
            split="train",
            download_config=DownloadConfig(token=HF_TOKEN),
        )
    except Exception:
        # Any load failure is treated as "dataset missing": bootstrap it.
        logger.warning(f"Dataset '{dataset_name}' not found or failed to load. Creating a new one.")
        empty_columns = {"audio": [], "text": [], "language": [], "datetime": []}
        ds = Dataset.from_dict(empty_columns)
        DatasetDict({"train": ds}).push_to_hub(dataset_name, token=HF_TOKEN)
        logger.info(f"Created empty dataset '{dataset_name}'.")
    else:
        logger.info(f"Loaded dataset '{dataset_name}' with {len(ds)} examples.")
    return ds
def save_dataset(dataset: Dataset, dataset_name: str) -> None:
    """Upload *dataset* as the 'train' split of *dataset_name* on the hub."""
    DatasetDict({"train": dataset}).push_to_hub(dataset_name, token=HF_TOKEN)
    logger.info(f"Pushed updated dataset to '{dataset_name}' ({len(dataset)} records).")
class Transcriber:
    """Thin wrapper around a transformers ASR pipeline."""

    def __init__(self, asr_model: str):
        # Build the inference pipeline once; it is reused for every request.
        self.pipeline = pipeline(model=asr_model)

    def transcribe(self, audio_path: str) -> str:
        """Run ASR on the file at *audio_path*; '' when no text is produced."""
        output = self.pipeline(audio_path)
        return output.get("text", "")
# Initialize components
# Module-level singletons built at import time. Both presumably reach the
# network (hub dataset load/creation, model download) — confirm for offline use.
current_dataset = get_or_create_dataset(CURRENT_DATASET)
asr_client = Transcriber(ASR_MODEL)
@spaces.GPU(duration=15)
def transcribe_and_update(audio_filepath: str, history: str, apply_enhance: bool) -> tuple:
    """
    Denoise the input audio, optionally enhance it, transcribe it, and append
    the new record to the Hugging Face dataset.

    Args:
        audio_filepath: Path to the recorded/uploaded audio file.
        history: Conversation history accumulated so far.
        apply_enhance: When True, run the enhancement pass after denoising.

    Returns:
        (transcription, updated_history); on failure the first element is an
        error message and *history* is returned unchanged.
    """
    if not audio_filepath:
        return "No audio detected. Please record or upload audio.", history
    try:
        # Pick the device once; hard-coding "cuda" crashes on CPU-only runs.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load as (channels, samples) and downmix to mono up front, so every
        # fallback path below works with single-channel audio.
        audio_data, sr = torchaudio.load(audio_filepath)
        audio_data = audio_data.mean(dim=0)

        # Always denoise; fall back to the raw mono audio on failure.
        try:
            denoised_data, sr = denoise(audio_data, sr, device)
            logger.info("Audio denoised successfully.")
        except Exception as e:
            logger.warning(f"Denoise failed, using raw audio: {e}")
            denoised_data = audio_data

        # Optionally enhance; fall back to the denoised audio on failure.
        if apply_enhance:
            try:
                enhanced_data, sr = enhance(denoised_data, sr, device)
                final_audio = enhanced_data
                logger.info("Audio enhanced successfully.")
            except Exception as e:
                logger.warning(f"Enhancement failed, using denoised audio: {e}")
                final_audio = denoised_data
        else:
            final_audio = denoised_data

        # soundfile expects a CPU numpy array, not a (possibly CUDA) tensor.
        if isinstance(final_audio, torch.Tensor):
            final_audio = final_audio.detach().cpu().numpy()

        # Save processed audio to a temp file for the ASR pipeline.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpf:
            sf.write(tmpf.name, final_audio, sr)
            local_path = tmpf.name

        # Transcription
        transcription = asr_client.transcribe(local_path)
        logger.info(f"Transcription: {transcription}")

        # Build a one-row dataset for the new record (timezone-aware UTC
        # timestamp; datetime.utcnow() is deprecated and naive).
        new_record = {
            "audio": [local_path],
            "text": [transcription],
            "language": ["moore"],
            "datetime": [datetime.now(timezone.utc).isoformat()],
        }
        new_ds = Dataset.from_dict(new_record).cast_column("audio", Audio())

        # Append to the in-memory dataset, then push the result to the hub.
        global current_dataset
        if len(current_dataset) == 0:
            current_dataset = new_ds
        else:
            current_dataset = concatenate_datasets([current_dataset, new_ds])
        save_dataset(current_dataset, CURRENT_DATASET)

        # Update conversation history shown in the UI state.
        history = history + f"\nUser: [audio]\nAssistant: {transcription}"
        return transcription, history
    except Exception as exc:
        logger.error(f"Error during transcription pipeline: {exc}")
        return f"Error: {exc}", history
def build_interface():
    """Assemble the Gradio UI and launch it (blocking call)."""
    with gr.Blocks() as demo:
        gr.Markdown("# 🗣️ ASR Moore Live 🧠")
        gr.Markdown("Speech Recognition interface for Moore language. Records or uploads audio, always denoises, and optionally enhances before ASR.")

        with gr.Row():
            audio_input = gr.Audio(type="filepath", label="Record or upload audio", sources=["microphone", "upload"])
            state_box = gr.State(value="")
            enhance_checkbox = gr.Checkbox(label="Apply Enhancement", value=False)

        output_text = gr.Textbox(label="Transcription")
        submit_btn = gr.Button("Transcribe and Save")

        # Wire the button to the processing pipeline; state_box carries the
        # running conversation history between calls.
        submit_btn.click(
            fn=transcribe_and_update,
            inputs=[audio_input, state_box, enhance_checkbox],
            outputs=[output_text, state_box],
        )
    demo.launch(debug=True)
# Script entry point (stray trailing "|" scrape artifact removed).
if __name__ == "__main__":
    build_interface()