Spaces:
Sleeping
Sleeping
import gradio as gr | |
from gtts import gTTS | |
import tempfile | |
import os | |
import torch | |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor | |
import torchaudio | |
import difflib | |
import pandas as pd | |
from Levenshtein import distance as lev_distance | |
# Hugging Face Hub id of the public AI4Bharat Hindi wav2vec2 checkpoint.
MODEL_NAME = "ai4bharat/indicwav2vec-hindi"

# Both objects are created once, at import time, and shared by every request:
# `processor` turns raw audio into model inputs and decodes predicted ids back
# to text; `model` produces the CTC logits.
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
def play_text(text):
    """Synthesize Hindi speech for *text* and return the path of the MP3 file.

    Args:
        text: The Hindi text to read aloud.

    Returns:
        Path to a newly created ``.mp3`` temporary file, or ``None`` when
        *text* is empty or contains only whitespace (nothing to speak).

    Raises:
        Whatever ``gTTS.save`` raises when synthesis fails; the temporary
        file is removed before the exception propagates.
    """
    if not text or not text.strip():
        return None

    tts = gTTS(text=text, lang='hi', slow=False)

    # Only a unique path is needed here. Close the handle right away so no
    # file descriptor is leaked and gTTS can reopen the path for writing.
    # delete=False keeps the file on disk because the caller receives the path.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
        mp3_path = temp_file.name

    try:
        tts.save(mp3_path)
    except Exception:
        # Do not leave an empty orphan file behind when synthesis fails.
        os.remove(mp3_path)
        raise
    return mp3_path
def get_error_type(asr_word, correct_word):
    """Classify the mismatch between a transcribed word and the expected word.

    Args:
        asr_word: Word produced by the speech recogniser ("" if none).
        correct_word: Word from the reference text ("" if none).

    Returns:
        One of the labels, checked in this order:
        "Missing word" (no transcribed word), "Extra word" (no expected
        word), "Spelling mistake" (Levenshtein distance of at most 2),
        "Phonetic/Matra error" (the words share at least one character),
        otherwise "Substitution/Distorted".
    """
    if not asr_word:
        return "Missing word"
    if not correct_word:
        return "Extra word"

    edit_distance = lev_distance(asr_word, correct_word)
    if edit_distance <= 2:
        return "Spelling mistake"

    # Any character in common counts as a partial (phonetic/matra) match.
    shares_characters = not set(asr_word).isdisjoint(correct_word)
    return "Phonetic/Matra error" if shares_characters else "Substitution/Distorted"
def compare_hindi_sentences(expected, transcribed):
    """Align the transcription with the expected text and list word errors.

    Both texts are split on whitespace and aligned word by word.

    Args:
        expected: The reference text the speaker was supposed to read.
        transcribed: The text produced by the speech recogniser.

    Returns:
        A list of ``(asr_word, correct_word, error_type)`` tuples, one per
        mismatching word. ``asr_word`` is "" for a word that was not spoken
        and ``correct_word`` is "" for a word that should not be there.
    """
    expected_words = expected.split()
    transcribed_words = transcribed.split()

    # autojunk=False: with the default heuristic, once the expected text has
    # 200+ words, frequently repeated words are no longer used as alignment
    # anchors, so correctly read words can end up reported as errors.
    matcher = difflib.SequenceMatcher(
        None, transcribed_words, expected_words, autojunk=False
    )

    errors = []
    for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
        if opcode == "replace":
            # The two runs may differ in length; pair them positionally and
            # pad the shorter side with "" so leftovers are classified as
            # missing/extra words by get_error_type.
            for k in range(max(i2 - i1, j2 - j1)):
                asr_word = transcribed_words[i1 + k] if i1 + k < i2 else ""
                correct_word = expected_words[j1 + k] if j1 + k < j2 else ""
                error_type = get_error_type(asr_word, correct_word)
                errors.append((asr_word, correct_word, error_type))
        elif opcode == "insert":
            # Present in the expected text only: the speaker skipped it.
            errors.extend(
                ("", word, "Missing word") for word in expected_words[j1:j2]
            )
        elif opcode == "delete":
            # Present in the transcription only: the speaker added it.
            errors.extend(
                (word, "", "Extra word") for word in transcribed_words[i1:i2]
            )
        # "equal" runs contain no errors.
    return errors
def calculate_accuracy(expected, transcribed):
    """Return the percentage of expected words found in the transcription.

    Both texts are split on whitespace and aligned word by word; every
    expected word that lies in a matching run counts as correct.

    Args:
        expected: The reference text the speaker was supposed to read.
        transcribed: The text produced by the speech recogniser.

    Returns:
        Accuracy in percent, rounded to two decimals; ``0.0`` when the
        expected text contains no words.
    """
    expected_words = expected.split()
    transcribed_words = transcribed.split()
    if not expected_words:
        return 0.0

    # autojunk=False: with the default heuristic, once the expected text has
    # 200+ words, frequently repeated words are no longer used as alignment
    # anchors, which can drive the score of a correct reading towards zero.
    matcher = difflib.SequenceMatcher(
        None, transcribed_words, expected_words, autojunk=False
    )
    correct = sum(block.size for block in matcher.get_matching_blocks())
    return round(correct / len(expected_words) * 100, 2)
def transcribe_audio(audio_path, original_text):
    """Transcribe a recording and score it against the text that was read.

    Args:
        audio_path: Path of the audio file to analyse; may be ``None`` or ""
            when nothing was uploaded or recorded.
        original_text: The reference text the speaker was supposed to read.

    Returns:
        A ``(result, errors)`` tuple. ``result`` is a dict with the
        transcription, the speaking speed in words per second and the
        reading accuracy in percent, or ``{"error": message}`` on failure.
        ``errors`` is a DataFrame with one row per word-level mistake (empty
        on failure).
    """
    target_rate = 16000  # sampling rate passed to the processor below
    error_columns = ["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"]

    if not audio_path:
        # Nothing was recorded/uploaded: report it clearly instead of
        # surfacing an opaque loader exception.
        return (
            {"error": "No audio provided. Please upload or record audio first."},
            pd.DataFrame(columns=error_columns),
        )
    original_text = original_text or ""

    try:
        waveform, sample_rate = torchaudio.load(audio_path)

        # Down-mix multi-channel audio to mono.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        if sample_rate != target_rate:
            resample = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=target_rate
            )
            waveform = resample(waveform)

        # Peak-normalise; skip for an all-zero (silent) signal, where the
        # division would turn every sample into NaN.
        peak = waveform.abs().max()
        if peak > 0:
            waveform = waveform / peak

        input_values = processor(
            waveform.squeeze().numpy(),
            sampling_rate=target_rate,
            return_tensors="pt",
        ).input_values
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0]).strip()

        # Error analysis
        errors = compare_hindi_sentences(original_text, transcription)
        df_errors = pd.DataFrame(errors, columns=error_columns)

        # Speaking speed: transcribed words per second of (resampled) audio.
        word_count = len(transcription.split())
        duration = waveform.shape[1] / target_rate
        speed = round(word_count / duration, 2) if duration > 0 else 0

        # Accuracy
        accuracy = calculate_accuracy(original_text, transcription)

        result = {
            "📝 Transcribed Text": transcription,
            "⏱️ Speaking Speed (words/sec)": speed,
            "✅ Reading Accuracy (%)": accuracy,
        }
        return result, df_errors
    except Exception as e:
        # UI boundary: report the failure in the results panel rather than
        # letting the exception escape the callback.
        return {"error": str(e)}, pd.DataFrame(columns=error_columns)
# Build the Gradio interface.
# NOTE(review): the original indentation was lost; the Row is assumed to hold
# the text box and the listen button only. Confirm against the deployed layout.
with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat IndicWav2Vec)")

    # Step 1: enter the text and listen to it.
    with gr.Row():
        input_text = gr.Textbox(
            label="Paste Hindi Text Here",
            placeholder="यहाँ हिंदी टेक्स्ट लिखें...",
        )
        play_button = gr.Button("🔊 Listen to Text")
    audio_output = gr.Audio(label="Text-to-Speech Output", type="filepath")
    play_button.click(fn=play_text, inputs=input_text, outputs=audio_output)

    # Step 2: record or upload a reading and have it checked.
    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
    submit_button = gr.Button("✅ Submit Recording for Checking")
    output = gr.JSON(label="Results")
    error_table = gr.Dataframe(label="गलती तालिका (Error Table)")
    submit_button.click(
        fn=transcribe_audio,
        inputs=[audio_input, input_text],
        outputs=[output, error_table],
    )

# Start the web server.
app.launch()