# Hugging Face Space — ASR Moore Live (runs on ZeroGPU hardware).
import logging
import os
import tempfile
import uuid
from datetime import datetime, timezone

import gradio as gr
import librosa
import soundfile as sf
import spaces
import torch
import torchaudio
from datasets import Audio, Dataset, DatasetDict, DownloadConfig, concatenate_datasets, load_dataset
from huggingface_hub import HfApi, login
from resemble_enhance.enhancer.inference import denoise, enhance
from transformers import pipeline
# Configure logging for the whole module.
logging.basicConfig(
    format="%(asctime)s — %(levelname)s — %(message)s",
    level=logging.INFO
)
logger = logging.getLogger(__name__)
# Constants
HF_TOKEN = os.getenv("HF_TOKEN")  # hub token; required by every dataset/model operation below
if not HF_TOKEN:
    logger.error("Hugging Face token not found. Please set HF_TOKEN environment variable.")
    raise SystemExit  # abort at import time: nothing below can work without a token
CURRENT_DATASET = "sawadogosalif/Sachi_demo_dataset"  # hub dataset collecting the recordings
SAMPLE_RATE = 16_000  # Hz; NOTE(review): defined but never read in this file — confirm intended use
ASR_MODEL = "sawadogosalif/SaChi-ASR"  # hub repo id of the ASR model
# Authenticate with Hugging Face (module-level side effect at import time)
login(token=HF_TOKEN)
api = HfApi(token=HF_TOKEN)  # reusable hub client
def get_or_create_dataset(dataset_name: str) -> Dataset:
    """
    Fetch the 'train' split of *dataset_name* from the Hugging Face hub.

    If loading fails (typically because the dataset does not exist yet),
    push a new empty dataset with the expected columns and return it.
    """
    try:
        ds = load_dataset(
            dataset_name,
            split="train",
            download_config=DownloadConfig(token=HF_TOKEN),
        )
    except Exception:
        # Any load failure is treated as "dataset missing": bootstrap it.
        logger.warning(f"Dataset '{dataset_name}' not found or failed to load. Creating a new one.")
        empty_columns = {"audio": [], "text": [], "language": [], "datetime": []}
        ds = Dataset.from_dict(empty_columns)
        DatasetDict({"train": ds}).push_to_hub(dataset_name, token=HF_TOKEN)
        logger.info(f"Created empty dataset '{dataset_name}'.")
    else:
        logger.info(f"Loaded dataset '{dataset_name}' with {len(ds)} examples.")
    return ds
def save_dataset(dataset: Dataset, dataset_name: str) -> None:
    """Upload *dataset* as the 'train' split of *dataset_name* on the hub."""
    DatasetDict({"train": dataset}).push_to_hub(dataset_name, token=HF_TOKEN)
    logger.info(f"Pushed updated dataset to '{dataset_name}' ({len(dataset)} records).")
class Transcriber:
    """Thin wrapper around a transformers ASR pipeline."""

    def __init__(self, asr_model: str):
        # Build the inference pipeline once; it is reused for every request.
        self.pipeline = pipeline(model=asr_model)

    def transcribe(self, audio_path: str) -> str:
        """Run ASR on the file at *audio_path*; '' when no text is produced."""
        output = self.pipeline(audio_path)
        return output.get("text", "")
# Initialize components
# Module-level singletons built at import time. Both presumably reach the
# network (hub dataset load/creation, model download) — confirm for offline use.
current_dataset = get_or_create_dataset(CURRENT_DATASET)
asr_client = Transcriber(ASR_MODEL)
@spaces.GPU(duration=15)
def transcribe_and_update(audio_filepath: str, history: str, apply_enhance: bool) -> tuple:
    """
    Denoise the input audio, optionally enhance it, transcribe it, and append
    the new record to the Hugging Face dataset.

    Args:
        audio_filepath: Path to the recorded/uploaded audio file.
        history: Conversation history accumulated so far.
        apply_enhance: When True, run the enhancement pass after denoising.

    Returns:
        (transcription, updated_history); on failure the first element is an
        error message and *history* is returned unchanged.
    """
    if not audio_filepath:
        return "No audio detected. Please record or upload audio.", history
    try:
        # Pick the device once; hard-coding "cuda" crashes on CPU-only runs.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load as (channels, samples) and downmix to mono up front, so every
        # fallback path below works with single-channel audio.
        audio_data, sr = torchaudio.load(audio_filepath)
        audio_data = audio_data.mean(dim=0)

        # Always denoise; fall back to the raw mono audio on failure.
        try:
            denoised_data, sr = denoise(audio_data, sr, device)
            logger.info("Audio denoised successfully.")
        except Exception as e:
            logger.warning(f"Denoise failed, using raw audio: {e}")
            denoised_data = audio_data

        # Optionally enhance; fall back to the denoised audio on failure.
        if apply_enhance:
            try:
                enhanced_data, sr = enhance(denoised_data, sr, device)
                final_audio = enhanced_data
                logger.info("Audio enhanced successfully.")
            except Exception as e:
                logger.warning(f"Enhancement failed, using denoised audio: {e}")
                final_audio = denoised_data
        else:
            final_audio = denoised_data

        # soundfile expects a CPU numpy array, not a (possibly CUDA) tensor.
        if isinstance(final_audio, torch.Tensor):
            final_audio = final_audio.detach().cpu().numpy()

        # Save processed audio to a temp file for the ASR pipeline.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpf:
            sf.write(tmpf.name, final_audio, sr)
            local_path = tmpf.name

        # Transcription
        transcription = asr_client.transcribe(local_path)
        logger.info(f"Transcription: {transcription}")

        # Build a one-row dataset for the new record (timezone-aware UTC
        # timestamp; datetime.utcnow() is deprecated and naive).
        new_record = {
            "audio": [local_path],
            "text": [transcription],
            "language": ["moore"],
            "datetime": [datetime.now(timezone.utc).isoformat()],
        }
        new_ds = Dataset.from_dict(new_record).cast_column("audio", Audio())

        # Append to the in-memory dataset, then push the result to the hub.
        global current_dataset
        if len(current_dataset) == 0:
            current_dataset = new_ds
        else:
            current_dataset = concatenate_datasets([current_dataset, new_ds])
        save_dataset(current_dataset, CURRENT_DATASET)

        # Update conversation history shown in the UI state.
        history = history + f"\nUser: [audio]\nAssistant: {transcription}"
        return transcription, history
    except Exception as exc:
        logger.error(f"Error during transcription pipeline: {exc}")
        return f"Error: {exc}", history
def build_interface():
    """Assemble the Gradio UI and launch it (blocking call)."""
    with gr.Blocks() as demo:
        gr.Markdown("# 🗣️ ASR Moore Live 🧠")
        gr.Markdown("Speech Recognition interface for Moore language. Records or uploads audio, always denoises, and optionally enhances before ASR.")

        with gr.Row():
            audio_input = gr.Audio(type="filepath", label="Record or upload audio", sources=["microphone", "upload"])
            state_box = gr.State(value="")
            enhance_checkbox = gr.Checkbox(label="Apply Enhancement", value=False)

        output_text = gr.Textbox(label="Transcription")
        submit_btn = gr.Button("Transcribe and Save")

        # Wire the button to the processing pipeline; state_box carries the
        # running conversation history between calls.
        submit_btn.click(
            fn=transcribe_and_update,
            inputs=[audio_input, state_box, enhance_checkbox],
            outputs=[output_text, state_box],
        )
    demo.launch(debug=True)
# Script entry point (stray trailing "|" scrape artifact removed).
if __name__ == "__main__":
    build_interface()