File size: 5,321 Bytes
92d559f
 
 
0b42505
 
 
 
92d559f
 
 
 
0b42505
 
 
 
92d559f
24fa0cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d559f
 
 
0b42505
 
 
 
 
 
 
 
 
 
 
 
 
92d559f
 
0b42505
92d559f
0b42505
92d559f
24fa0cd
92d559f
0b42505
92d559f
 
0b42505
92d559f
0b42505
92d559f
 
24fa0cd
 
a5a830c
24fa0cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import gradio as gr
from gtts import gTTS
import tempfile
import os
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio
import difflib
import pandas as pd
from Levenshtein import distance as lev_distance

# Load AI4Bharat Hindi model & processor (public model on Hugging Face)
# Runs once at import time; the download is cached by huggingface_hub on
# subsequent runs. Both objects are module-level singletons shared by every
# request and consumed in transcribe_audio below: `processor` handles
# feature extraction + CTC token decoding, `model` is the acoustic model.
MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

def play_text(text):
    """Synthesize Hindi speech for *text* with gTTS and return the MP3 path.

    Returns the path of a temporary ``.mp3`` file (kept on disk with
    ``delete=False`` so Gradio can serve it after this function returns).
    """
    tts = gTTS(text=text, lang='hi', slow=False)
    # Close the temp-file handle before gTTS writes to the path: the original
    # code kept the NamedTemporaryFile open, leaking one file descriptor per
    # click and failing on Windows, where an open file cannot be reopened
    # for writing by tts.save().
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
        temp_path = temp_file.name
    tts.save(temp_path)
    return temp_path

def get_error_type(asr_word, correct_word):
    """Classify a single word-level mismatch between ASR output and target.

    An empty *asr_word* means the reader skipped the word; an empty
    *correct_word* means the reader inserted one. Otherwise the pair is
    graded by edit distance, then by character overlap.
    """
    if not asr_word:
        return "Missing word"
    if not correct_word:
        return "Extra word"
    # Within two edits of the target -> most likely a minor spelling slip.
    if lev_distance(asr_word, correct_word) <= 2:
        return "Spelling mistake"
    # Any shared characters suggest a matra/phonetic confusion rather than
    # a completely different word.
    overlap = set(asr_word) & set(correct_word)
    return "Phonetic/Matra error" if overlap else "Substitution/Distorted"

def compare_hindi_sentences(expected, transcribed):
    """Align the transcription against the target text word by word.

    Returns a list of ``(asr_word, correct_word, error_type)`` triples, one
    per mismatching position; words that match exactly produce no entry.
    """
    ref = expected.strip().split()
    hyp = transcribed.strip().split()
    mistakes = []
    for tag, h1, h2, r1, r2 in difflib.SequenceMatcher(None, hyp, ref).get_opcodes():
        if tag == "replace":
            # Walk the longer of the two spans, padding the shorter side
            # with "" so every position is reported exactly once.
            for offset in range(max(h2 - h1, r2 - r1)):
                said = hyp[h1 + offset] if h1 + offset < h2 else ""
                wanted = ref[r1 + offset] if r1 + offset < r2 else ""
                mistakes.append((said, wanted, get_error_type(said, wanted)))
        elif tag == "insert":
            # Target words the reader never produced.
            mistakes.extend(("", ref[r], "Missing word") for r in range(r1, r2))
        elif tag == "delete":
            # Spoken words with no counterpart in the target.
            mistakes.extend((hyp[h], "", "Extra word") for h in range(h1, h2))
    return mistakes

def calculate_accuracy(expected, transcribed):
    """Percentage of target words read correctly, via difflib alignment.

    Returns 0 when the target text is empty, otherwise a float in
    ``[0, 100]`` rounded to two decimal places.
    """
    ref = expected.strip().split()
    hyp = transcribed.strip().split()
    if not ref:
        return 0
    opcodes = difflib.SequenceMatcher(None, hyp, ref).get_opcodes()
    matched = sum(r2 - r1 for tag, _h1, _h2, r1, r2 in opcodes if tag == 'equal')
    return round(matched / len(ref) * 100, 2)

def transcribe_audio(audio_path, original_text):
    """Transcribe a Hindi recording and score it against *original_text*.

    Returns ``(result, df_errors)`` where *result* is a dict of transcription,
    speaking speed and accuracy metrics, and *df_errors* is a word-level error
    table. On any failure, *result* carries a single ``"error"`` key and the
    table is empty.
    """
    # Single source of truth for the error-table schema, shared by the
    # success and failure paths so the two cannot drift apart.
    error_columns = ["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"]
    try:
        waveform, sample_rate = torchaudio.load(audio_path)
        # Downmix multi-channel audio to mono: the model expects one channel.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        # Wav2Vec2 models are trained on 16 kHz input.
        if sample_rate != 16000:
            transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = transform(waveform)
        # Peak-normalize, guarding against an all-silent clip: dividing by a
        # zero peak would fill the tensor with NaNs and break inference.
        peak = waveform.abs().max()
        if peak > 0:
            waveform = waveform / peak
        input_values = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values
        # Inference only — no gradients needed.
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0]).strip()
        # Word-level error analysis against the target text.
        errors = compare_hindi_sentences(original_text, transcription)
        df_errors = pd.DataFrame(errors, columns=error_columns)
        # Speaking speed: recognized words per second of (resampled) audio.
        transcribed_words = transcription.strip().split()
        duration = waveform.shape[1] / 16000
        speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0
        # Accuracy relative to the pasted target text.
        accuracy = calculate_accuracy(original_text, transcription)
        result = {
            "📝 Transcribed Text": transcription,
            "⏱️ Speaking Speed (words/sec)": speed,
            "✅ Reading Accuracy (%)": accuracy
        }
        return result, df_errors
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing
        # the Gradio worker.
        return {"error": str(e)}, pd.DataFrame(columns=error_columns)

# Gradio UI: declarative layout plus two event bindings. Component creation
# order inside the Blocks context determines on-screen order.
with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat IndicWav2Vec)")
    with gr.Row():
        input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
        play_button = gr.Button("🔊 Listen to Text")
        audio_output = gr.Audio(label="Text-to-Speech Output", type="filepath")
    # Text-to-speech: synthesize the pasted text so the learner can listen
    # to a reference pronunciation before reading aloud.
    play_button.click(play_text, inputs=input_text, outputs=audio_output)

    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
    submit_button = gr.Button("✅ Submit Recording for Checking")
    output = gr.JSON(label="Results")
    error_table = gr.Dataframe(label="गलती तालिका (Error Table)")
    # Scoring: transcribe the recording and compare it with the pasted text,
    # returning a metrics JSON and a word-level error table.
    submit_button.click(
        transcribe_audio,
        inputs=[audio_input, input_text],
        outputs=[output, error_table]
    )

# Blocking call: starts the local Gradio server.
app.launch()