Spaces:
Sleeping
Sleeping
File size: 5,095 Bytes
b63476f fb93a17 d6fa022 ba18501 e3afeb6 3ada721 81a3a36 e3afeb6 5db928f fb93a17 d6fa022 fb93a17 d6fa022 650c3e9 fb93a17 ba18501 65516e4 ba18501 65516e4 ba18501 3ada721 ea23e90 ba18501 e3afeb6 092c4c2 e3afeb6 81a3a36 fb93a17 e3afeb6 650c3e9 6cca945 e3afeb6 650c3e9 e3afeb6 d6fa022 650c3e9 e3afeb6 650c3e9 e3afeb6 ba18501 a59a577 d6fa022 e3afeb6 d6fa022 650c3e9 d6fa022 c994feb d6fa022 650c3e9 ba18501 d6fa022 5995a5d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import gradio as gr
from gtts import gTTS
import tempfile
import difflib
import pandas as pd
from Levenshtein import distance as lev_distance
import whisper
import string
# Load the Whisper model once at import time so every transcription request
# reuses the same instance. "large-v3" is the checkpoint in use; a different
# checkpoint name can be substituted, e.g. whisper.load_model("small").
model = whisper.load_model("large-v3")
def play_text(text):
    """Synthesize Hindi speech for *text* and return the path of the MP3 file.

    Args:
        text: The Hindi text to read aloud.

    Returns:
        Path to a temporary ``.mp3`` file containing the speech, or ``None``
        when *text* is empty or whitespace-only (nothing to synthesize).
    """
    if not text or not text.strip():
        return None

    tts = gTTS(text=text, lang='hi', slow=False)
    # Only the unique file name is needed. Close our own handle right away so
    # the descriptor is not leaked; delete=False keeps the file on disk so it
    # can be written by gTTS and read back by the caller afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
        mp3_path = temp_file.name
    tts.save(mp3_path)
    return mp3_path
def get_error_type(asr_word, correct_word):
    """Classify the mismatch between a transcribed word and the expected word.

    Args:
        asr_word: Word produced by speech recognition ("" if none was aligned).
        correct_word: Word from the reference text ("" if none was aligned).

    Returns:
        A label string describing the kind of error.
    """
    if not asr_word:
        return "Missing word"
    if not correct_word:
        # Same label that compare_hindi_sentences uses for "delete" opcodes,
        # so one kind of error never shows up under two different names.
        return "Extra word"
    # A small edit distance is treated as a pronunciation slip.
    if lev_distance(asr_word, correct_word) <= 2:
        return "उच्चारण दोष (Pronunciation Errors) "
    # The words are far apart but still share at least one character.
    if set(asr_word) & set(correct_word):
        return "Phonetic/Matra error"
    return "Substitution/Distorted"
def compare_hindi_sentences(expected, transcribed):
    """Align the transcription with the reference text and list word errors.

    Punctuation is removed from both texts before they are split on
    whitespace and aligned with ``difflib.SequenceMatcher``.

    Args:
        expected: The reference text the speaker was supposed to read.
        transcribed: The text produced by speech recognition.

    Returns:
        A list of ``(transcribed_word, expected_word, error_type)`` tuples,
        one per mismatching word; "" stands for a word with no counterpart.
    """
    from itertools import zip_longest

    # string.punctuation only covers ASCII, so add the Devanagari danda and
    # double danda; otherwise a sentence-final word such as "गया।" would never
    # match "गया".
    strip_table = str.maketrans('', '', string.punctuation + "।॥")
    expected_words = expected.translate(strip_table).split()
    transcribed_words = transcribed.translate(strip_table).split()

    matcher = difflib.SequenceMatcher(None, transcribed_words, expected_words)
    errors = []
    for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
        if opcode == "replace":
            # The two spans may differ in length; pad the shorter one with ""
            # so leftover words are still reported.
            pairs = zip_longest(
                transcribed_words[i1:i2], expected_words[j1:j2], fillvalue=""
            )
            for asr_word, correct_word in pairs:
                error_type = get_error_type(asr_word, correct_word)
                errors.append((asr_word, correct_word, error_type))
        elif opcode == "insert":
            # Present in the reference but absent from the transcription.
            errors.extend(
                ("", word, "Missing word") for word in expected_words[j1:j2]
            )
        elif opcode == "delete":
            # Present in the transcription but absent from the reference.
            errors.extend(
                (word, "", "Extra word") for word in transcribed_words[i1:i2]
            )
    return errors
def calculate_accuracy(expected, transcribed):
    """Return the percentage of reference words that were read correctly.

    Both texts are normalized the same way (punctuation removed, split on
    whitespace) before alignment, so punctuation in the reference text does
    not count against the reader.

    Args:
        expected: The reference text the speaker was supposed to read.
        transcribed: The text produced by speech recognition.

    Returns:
        Accuracy in percent, rounded to two decimals; 0 when the reference
        text contains no words.
    """
    # string.punctuation only covers ASCII, so add the Devanagari danda and
    # double danda that end Hindi sentences.
    strip_table = str.maketrans('', '', string.punctuation + "।॥")
    expected_words = expected.translate(strip_table).split()
    transcribed_words = transcribed.translate(strip_table).split()

    total = len(expected_words)
    if total == 0:
        return 0

    matcher = difflib.SequenceMatcher(None, transcribed_words, expected_words)
    # Only "equal" spans contribute; j1..j2 indexes into the reference words.
    correct = sum(
        j2 - j1
        for tag, _i1, _i2, j1, j2 in matcher.get_opcodes()
        if tag == 'equal'
    )
    return round((correct / total) * 100, 2)
def transcribe_audio(audio_path, original_text):
    """Transcribe a recording and score it against the reference text.

    Args:
        audio_path: Path of the audio file to transcribe; may be empty or
            ``None`` when nothing was uploaded or recorded.
        original_text: The reference text the speaker was supposed to read.

    Returns:
        A ``(results, error_table)`` tuple. ``results`` is a dict with the
        transcription, speaking speed and accuracy, or ``{"error": message}``
        on failure. ``error_table`` is a DataFrame of word-level errors
        (empty on failure).
    """
    error_columns = ["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"]

    # Fail early with a readable message instead of surfacing whatever
    # exception the model raises when it is given no file.
    if not audio_path:
        return (
            {"error": "No audio provided. Please upload or record audio first."},
            pd.DataFrame(columns=error_columns),
        )

    try:
        # Use Whisper for transcription
        result = model.transcribe(audio_path, language='hi')
        transcription = result['text'].strip()

        # Error analysis
        errors = compare_hindi_sentences(original_text, transcription)
        df_errors = pd.DataFrame(errors, columns=error_columns)

        # Speaking speed: words divided by the end time of the last segment;
        # falls back to 1.0 when no segments were returned.
        transcribed_words = transcription.split()
        segments = result.get('segments')
        duration = segments[-1]['end'] if segments else 1.0
        speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0

        # Accuracy
        accuracy = calculate_accuracy(original_text, transcription)

        result_dict = {
            "📝 Transcribed Text": transcription,
            "⏱️ Speaking Speed (words/sec)": speed,
            "✅ Reading Accuracy (%)": accuracy,
        }
        return result_dict, df_errors
    except Exception as e:
        # This is the UI boundary: report the failure in the results panel
        # rather than letting the exception propagate.
        return {"error": str(e)}, pd.DataFrame(columns=error_columns)
# Gradio UI: a text box with text-to-speech playback, followed by an audio
# input whose recording is transcribed and scored against the text.
with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (OpenAI Whisper)")

    # NOTE(review): the original indentation was lost, so which components
    # belong inside this Row is inferred (text box + listen button) — confirm.
    with gr.Row():
        input_text = gr.Textbox(
            label="Paste Hindi Text Here",
            placeholder="यहाँ हिंदी टेक्स्ट लिखें...",
        )
        play_button = gr.Button("🔊 Listen to Text")

    audio_output = gr.Audio(label="Text-to-Speech Output", type="filepath")
    play_button.click(fn=play_text, inputs=input_text, outputs=audio_output)

    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
    submit_button = gr.Button("✅ Submit Recording for Checking")

    output = gr.JSON(label="Results")
    error_table = gr.Dataframe(label="गलती तालिका (Error Table)")

    # The recording and the reference text go in; the results dict and the
    # error table come out.
    submit_button.click(
        fn=transcribe_audio,
        inputs=[audio_input, input_text],
        outputs=[output, error_table],
    )

app.launch()
|