"""Gradio demo: Persian speech recognition with custom-vocabulary clean-up.

Audio is transcribed with a Whisper checkpoint through the Hugging Face
``pipeline`` API, then known spelling variants are normalised to their
canonical form using ``custom_vocab_map``.
"""

import logging
import re
from typing import Iterable, Mapping, Optional

import gradio as gr
from rapidfuzz import fuzz, process
from transformers import pipeline

logger = logging.getLogger(__name__)

# Initialize ASR pipeline
asr = pipeline(
    task="automatic-speech-recognition",
    model="vhdm/whisper-large-fa-v1",
    device=-1,  # CPU; set device=0 for GPU
)

# Custom vocabulary with multiple forms: canonical form -> accepted variants.
custom_vocab_map = {
    "نرد": ["نرد", "نِرد", "نَرد"],
    "کامپیوتر": ["کامپیوتر", "کامپیوتره"],
    "هوش مصنوعی": ["هوش مصنوعی", "هوش صنعتی"],
    "ماشین": ["ماشین", "ماشینه"],
}


def replace_fuzzy(
    text: str,
    vocab_map: Mapping[str, Iterable[str]],
    threshold: float = 85,
) -> str:
    """Normalise known variants in *text* to their canonical vocabulary form.

    For every ``target -> alternatives`` entry, each alternative that differs
    from ``target`` is scored against the whole text with
    ``fuzz.partial_ratio``. Alternatives scoring at least ``threshold`` are
    then replaced by ``target`` wherever they occur as a standalone token.

    The fuzzy score only gates which alternatives are attempted; the
    substitution itself is an exact, whole-token match, so words that merely
    contain a variant (for example a plural built on it) are left untouched.

    Args:
        text: Transcript to clean up.
        vocab_map: Mapping of canonical form to its known variants.
        threshold: Minimum ``partial_ratio`` score (0-100) an alternative
            needs before a replacement is attempted.

    Returns:
        The text with matching variants replaced by their canonical form.
    """
    if not text:
        return text

    for target, alternatives in vocab_map.items():
        # The canonical form itself never needs replacing. Leaving it in the
        # candidate list is what previously hid variants that contain it
        # (e.g. "ماشینه" always tied with, and lost to, "ماشین").
        variants = [alt for alt in alternatives if alt and alt != target]
        if not variants:
            continue

        # Score every variant instead of keeping only the single best one.
        hits = process.extract(
            text,
            variants,
            scorer=fuzz.partial_ratio,
            score_cutoff=threshold,
            limit=None,
        )

        # Longest first, so a variant is handled before any shorter variant
        # that happens to be a prefix/substring of it.
        for variant in sorted((hit[0] for hit in hits), key=len, reverse=True):
            # \w lookarounds approximate token boundaries, preventing
            # replacements in the middle of a longer word.
            pattern = rf"(?<!\w){re.escape(variant)}(?!\w)"
            # A callable replacement keeps backslashes in `target` literal.
            text = re.sub(pattern, lambda _match, repl=target: repl, text)

    return text


def transcribe(audio_file: Optional[str]) -> str:
    """Transcribe an audio file and apply the custom-vocabulary clean-up.

    Args:
        audio_file: Path to the audio file supplied by Gradio (microphone
            recording or upload); falsy when nothing was provided.

    Returns:
        The cleaned transcript, or a human-readable message when there is no
        input or the ASR pipeline raises.
    """
    if not audio_file:
        return "No audio input detected."

    try:
        # Run ASR
        result = asr(audio_file, chunk_length_s=30, stride_length_s=[5, 5])
    except Exception as e:
        # UI boundary: keep the app responsive, but do not lose the traceback.
        logger.exception("ASR inference failed for %s", audio_file)
        return f"ASR error: {e}"

    text = result.get("text", "")
    return replace_fuzzy(text, custom_vocab_map, threshold=85)


# Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Record or upload audio"),
    outputs="text",
    title="Persian ASR with High Accuracy Vocabulary",
    description=(
        "Speak in Persian or upload an audio file; recognized words are "
        "corrected using a custom high-accuracy vocabulary."
    ),
)

if __name__ == "__main__":
    iface.launch()