File size: 5,095 Bytes
b63476f
fb93a17
d6fa022
ba18501
 
 
e3afeb6
3ada721
81a3a36
e3afeb6
5db928f
 
fb93a17
d6fa022
fb93a17
d6fa022
 
650c3e9
fb93a17
ba18501
 
 
 
65516e4
ba18501
65516e4
ba18501
 
 
 
 
 
 
3ada721
 
ea23e90
ba18501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e3afeb6
 
092c4c2
 
e3afeb6
 
 
 
 
 
 
 
 
 
81a3a36
fb93a17
e3afeb6
 
 
650c3e9
 
 
 
6cca945
e3afeb6
650c3e9
e3afeb6
 
 
d6fa022
650c3e9
e3afeb6
650c3e9
e3afeb6
ba18501
 
a59a577
d6fa022
e3afeb6
d6fa022
 
 
650c3e9
 
d6fa022
c994feb
 
d6fa022
 
650c3e9
ba18501
 
 
 
 
d6fa022
5995a5d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import gradio as gr
from gtts import gTTS
import tempfile
import difflib
import pandas as pd
from Levenshtein import distance as lev_distance
import whisper
import string

# Load the Whisper model once, at module import, so that every transcription
# request reuses the same instance instead of reloading it.
# The checkpoint in use is "large-v3"; a smaller checkpoint name such as
# "small" or "medium" can be passed instead (presumably trading accuracy for
# speed and memory -- confirm against the Whisper model documentation).
model = whisper.load_model("large-v3")

def play_text(text):
    """Synthesize Hindi speech for *text* and return the path of the MP3 file.

    Args:
        text: The Hindi text to be read aloud.

    Returns:
        The path of a newly created temporary ``.mp3`` file, or ``None`` when
        *text* is empty or contains only whitespace (nothing to synthesize).
        The file is created with ``delete=False``, so it is not removed
        automatically.
    """
    # Nothing to speak: return None rather than attempting synthesis.
    if not text or not text.strip():
        return None

    tts = gTTS(text=text, lang='hi', slow=False)
    # Only the unique file *name* is needed, because tts.save() is given a
    # path rather than the open handle. Close the handle immediately so it is
    # not leaked on every call and does not keep the path locked while the
    # audio is being written.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
        audio_path = temp_file.name
    tts.save(audio_path)
    return audio_path

def get_error_type(asr_word, correct_word):
    """Classify the mismatch between a transcribed word and the expected word.

    Args:
        asr_word: The word produced by the transcription; falsy when the
            expected word has no transcribed counterpart.
        correct_word: The word from the reference text; falsy when the
            transcribed word has no expected counterpart.

    Returns:
        A label string. The checks are applied in this order: missing
        transcribed word, missing expected word, Levenshtein distance of at
        most 2, at least one shared character, and finally a generic
        substitution label.
    """
    if not asr_word:
        label = "Missing word"
    elif not correct_word:
        label = "अतिरिक्त शब्द"
    elif lev_distance(asr_word, correct_word) <= 2:
        # Close spellings are reported as pronunciation slips.
        label = "उच्चारण दोष (Pronunciation Errors) "
    elif not set(asr_word).isdisjoint(correct_word):
        # The two words share at least one character.
        label = "Phonetic/Matra error"
    else:
        label = "Substitution/Distorted"
    return label

def compare_hindi_sentences(expected, transcribed):
    """Align the transcription with the expected text and list word errors.

    Both texts are tokenized identically: commas and the Devanagari danda
    marks (``।`` and ``॥``) become spaces, the remaining ASCII punctuation is
    removed, and the result is split on whitespace. The word lists are then
    aligned with ``difflib.SequenceMatcher``.

    Args:
        expected: The reference text the reader was supposed to read.
        transcribed: The text produced by the speech recognizer.

    Returns:
        A list of ``(asr_word, correct_word, error_type)`` tuples, one per
        mismatching word, in alignment order. ``asr_word`` is ``""`` when an
        expected word has no transcribed counterpart, and ``correct_word`` is
        ``""`` when a transcribed word has no expected counterpart. Matching
        words produce no entry.
    """
    # string.punctuation is ASCII-only, so the danda marks must be listed
    # explicitly. Separators (comma, danda) map to a space so that
    # "word,word" is not glued into one token; everything else is deleted.
    table = str.maketrans(',।॥', '   ', string.punctuation.replace(',', ''))
    expected_words = expected.translate(table).split()
    transcribed_words = transcribed.translate(table).split()

    # autojunk=False: the default heuristic treats frequent words as junk once
    # a sequence reaches 200 items, which would distort long passages.
    matcher = difflib.SequenceMatcher(
        None, transcribed_words, expected_words, autojunk=False
    )

    errors = []
    for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
        if opcode == "replace":
            asr_chunk = transcribed_words[i1:i2]
            correct_chunk = expected_words[j1:j2]
            # Pair the words positionally; the shorter side is padded with "".
            for k in range(max(len(asr_chunk), len(correct_chunk))):
                asr_word = asr_chunk[k] if k < len(asr_chunk) else ""
                correct_word = correct_chunk[k] if k < len(correct_chunk) else ""
                errors.append(
                    (asr_word, correct_word, get_error_type(asr_word, correct_word))
                )
        elif opcode == "insert":
            # Present in the expected text only: the reader skipped the word.
            errors.extend(("", word, "Missing word") for word in expected_words[j1:j2])
        elif opcode == "delete":
            # Present in the transcription only: the reader added the word.
            errors.extend((word, "", "Extra word") for word in transcribed_words[i1:i2])
    return errors

def calculate_accuracy(expected, transcribed):
    """Return the percentage of expected words found in the transcription.

    Both texts are tokenized identically before comparison: commas and the
    Devanagari danda marks (``।`` and ``॥``) become spaces, the remaining
    ASCII punctuation is removed, and the result is split on whitespace.
    A word counts as correct when ``difflib.SequenceMatcher`` places it in an
    ``equal`` run of the alignment.

    Args:
        expected: The reference text the reader was supposed to read.
        transcribed: The text produced by the speech recognizer.

    Returns:
        The accuracy as a percentage rounded to two decimals, or ``0`` when
        the expected text contains no words.
    """
    # string.punctuation is ASCII-only, so the danda marks must be listed
    # explicitly. Separators (comma, danda) map to a space so that
    # "word,word" is not glued into one token; everything else is deleted.
    table = str.maketrans(',।॥', '   ', string.punctuation.replace(',', ''))
    expected_words = expected.translate(table).split()
    transcribed_words = transcribed.translate(table).split()

    total = len(expected_words)
    if total == 0:
        return 0

    # autojunk=False: the default heuristic treats frequent words as junk once
    # a sequence reaches 200 items, which would distort long passages.
    matcher = difflib.SequenceMatcher(
        None, transcribed_words, expected_words, autojunk=False
    )
    correct = sum(
        j2 - j1
        for tag, _i1, _i2, j1, j2 in matcher.get_opcodes()
        if tag == 'equal'
    )
    return round((correct / total) * 100, 2)

def transcribe_audio(audio_path, original_text):
    """Transcribe a recording and score it against the reference text.

    Args:
        audio_path: Path of the audio file to transcribe. A falsy value
            (no recording supplied) yields an error result without running
            the model.
        original_text: The reference text the reader was supposed to read.
            ``None`` is treated as an empty string.

    Returns:
        A ``(summary, errors)`` tuple. ``summary`` is a dict holding the
        transcription, the speaking speed and the reading accuracy, or a dict
        with a single ``"error"`` key when no audio was supplied or processing
        failed. ``errors`` is a DataFrame with one row per word error; it is
        empty (columns only) in the error case.
    """
    error_columns = ["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"]

    # Fail early with a readable message instead of handing None to the model.
    if not audio_path:
        return (
            {"error": "No audio provided. Please upload or record your reading first."},
            pd.DataFrame(columns=error_columns),
        )

    original_text = original_text or ""
    try:
        # Use Whisper for transcription
        result = model.transcribe(audio_path, language='hi')
        transcription = result['text'].strip()

        # Error analysis
        errors = compare_hindi_sentences(original_text, transcription)
        df_errors = pd.DataFrame(errors, columns=error_columns)

        # Speaking speed: words divided by the `end` value of the last
        # segment; falls back to 1.0 when the result has no segments.
        word_count = len(transcription.split())
        segments = result.get('segments')
        duration = segments[-1]['end'] if segments else 1.0
        speed = round(word_count / duration, 2) if duration > 0 else 0

        # Accuracy
        accuracy = calculate_accuracy(original_text, transcription)

        result_dict = {
            "📝 Transcribed Text": transcription,
            "⏱️ Speaking Speed (words/sec)": speed,
            "✅ Reading Accuracy (%)": accuracy,
        }
        return result_dict, df_errors
    except Exception as e:
        # UI boundary: report the failure in the results panel rather than
        # letting the handler crash.
        return {"error": str(e)}, pd.DataFrame(columns=error_columns)

# Gradio UI definition. Two interactions are wired up:
#   1. "Listen to Text"  -> play_text(input_text)            -> audio_output
#   2. "Submit Recording" -> transcribe_audio(audio, text)   -> output, error_table
with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (OpenAI Whisper)")
    # Row holding the reference text box, the text-to-speech button and the
    # player for the generated audio.
    with gr.Row():
        input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
        play_button = gr.Button("🔊 Listen to Text")
        audio_output = gr.Audio(label="Text-to-Speech Output", type="filepath")
    # play_text returns a file path, matching audio_output's type="filepath".
    play_button.click(play_text, inputs=input_text, outputs=audio_output)

    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    # type="filepath": transcribe_audio receives the recording as a path.
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
    submit_button = gr.Button("✅ Submit Recording for Checking")
    # transcribe_audio returns (dict, DataFrame); they are shown in these two
    # components, in that order.
    output = gr.JSON(label="Results")
    error_table = gr.Dataframe(label="गलती तालिका (Error Table)")
    submit_button.click(
        transcribe_audio,
        inputs=[audio_input, input_text],
        outputs=[output, error_table]
    )

# Launch the app. This call is at module level (no __main__ guard), so it runs
# whenever the module is executed or imported.
app.launch()