speech_recognize1

Sleeping

File size: 4,661 Bytes

46e19ef
d558c26
 
 
540f4c8
d558c26
e3a58c6
46e19ef
 
 
e3a58c6
8ef9310
0f433ab
e3a58c6
0f433ab
d558c26
0498d1c
3e9568e
8ef9310
 
 
 
 
 
 
 
 
 
 
e3a58c6
91a2ea1
304ed82
 
 
e3a58c6
540f4c8
e3a58c6
91a2ea1
e3a58c6
d558c26
e3a58c6
d558c26
 
e3a58c6
d558c26
 
 
 
e3a58c6
61c9f90
 
 
 
 
e3a58c6
d558c26
3e9568e
d558c26
e3a58c6
61c9f90
e3a58c6
d558c26
e3a58c6
 
0f433ab
 
3e9568e
e3a58c6
61c9f90
e3a58c6
9586c71
91a2ea1
9586c71
 
e3a58c6
91a2ea1
61c9f90
d558c26
e3a58c6
85d956d
e3a58c6
 
85d956d
e3a58c6
8ef9310
 
d558c26
 
 
 
 
8ef9310
58f3405
0498d1c
8ef9310
61c9f90
d558c26
 
 
 
85d956d
 
 
9586c71
85d956d
 
 
 
e3a58c6
85d956d
 
e3a58c6
85d956d

import os
import speech_recognition as sr
import difflib
import gradio as gr
from transformers import pipeline

# Create the audio output directory if it does not exist yet.
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs('audio', exist_ok=True)

# Step 1: Convert the audio into text
def transcribe_audio(audio):
    """Transcribe an audio file with the Google speech recognition backend.

    Args:
        audio: Audio file to transcribe (handed to ``sr.AudioFile``), or
            ``None`` when nothing was provided.

    Returns:
        The recognised text on success. Otherwise a human-readable message:
        no file was given, the speech could not be understood, or the
        recognition request failed.
    """
    # Guard clause: nothing to transcribe.
    if audio is None:
        return "No audio file provided."

    engine = sr.Recognizer()
    clip = sr.AudioFile(audio)

    # Read the entire file into memory before sending it for recognition.
    with clip as stream:
        recording = engine.record(stream)

    try:
        text = engine.recognize_google(recording)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError as err:
        return f"Error with Google Speech Recognition service: {err}"
    else:
        return text

# Step 2: Generate pronunciation audio for a word or phrase
_TTS_PIPELINE = None  # built on first use and reused; loading a model per call is wasteful


def _get_tts_pipeline():
    """Return the shared text-to-speech pipeline, creating it on first use."""
    global _TTS_PIPELINE
    if _TTS_PIPELINE is None:
        # Update the model here if needed.
        # NOTE(review): this id looks like a Coqui-TTS model name rather than a
        # Hugging Face Hub repository id -- confirm it loads with transformers.
        _TTS_PIPELINE = pipeline("text-to-speech", model="tts_models/en/ljspeech/fastspeech2_hifigan")
    return _TTS_PIPELINE


def _audio_file_stem(text):
    """Return a filesystem-safe file stem for *text*."""
    import hashlib

    cleaned = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in text)
    if cleaned == text and 0 < len(text) <= 64:
        # Plain short words keep the same file name as before.
        return text
    # Anything altered or truncated gets a digest suffix so that different
    # inputs cannot collide on the same file.
    digest = hashlib.sha1(text.encode("utf-8")).hexdigest()[:10]
    return f"{cleaned[:64]}_{digest}"


def _write_wav(path, audio, sampling_rate):
    """Write a float waveform to *path* as 16-bit PCM WAV."""
    import wave

    import numpy as np

    # assumes a mono waveform with samples in [-1.0, 1.0] -- TODO confirm for
    # the model in use
    samples = np.asarray(audio, dtype=np.float32).reshape(-1)
    pcm = (np.clip(samples, -1.0, 1.0) * 32767.0).astype("<i2")
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # 2 bytes per sample = 16-bit
        wav_file.setframerate(int(sampling_rate))
        wav_file.writeframes(pcm.tobytes())


def create_pronunciation_audio(word):
    """Synthesize speech for *word* and save it as a WAV file under ``audio/``.

    Args:
        word: Text to speak. Callers in this file pass single words as well
            as whole paragraphs, so the text is never used verbatim as a
            file name unless it is already a short, safe token.

    Returns:
        The path of the generated WAV file (``audio/<stem>.wav``).
    """
    os.makedirs("audio", exist_ok=True)
    audio_file_path = f"audio/{_audio_file_stem(word)}.wav"

    # The transformers text-to-speech pipeline has no ``output_file`` option;
    # it returns the waveform and its sampling rate, which are saved here.
    result = _get_tts_pipeline()(word)
    _write_wav(audio_file_path, result["audio"], result["sampling_rate"])
    return audio_file_path

# Step 3: Compare the transcribed text with the reference text
def compare_texts(reference_text, transcribed_text):
    """Build an HTML report comparing a transcription with its reference.

    The report contains an overall similarity score, the transcription, and
    every reference word coloured by how it matched:

    * green  - same word (case-insensitive) at the same position;
    * yellow - a close match exists somewhere in the transcription;
    * red    - no match, or the transcription ended before this position.

    For red words that have a counterpart position in the transcription, a
    pronunciation clip is generated with ``create_pronunciation_audio`` and
    embedded after the word list.

    Args:
        reference_text: The text the speaker was expected to read.
        transcribed_text: The text produced by speech recognition.

    Returns:
        An HTML fragment as a string. All user-supplied text is HTML-escaped.
    """
    import html  # function-scope import keeps this block self-contained

    reference_words = reference_text.split()
    transcribed_words = transcribed_text.split()
    transcribed_count = len(transcribed_words)

    # Character-level similarity of the two raw strings, as a percentage.
    matcher = difflib.SequenceMatcher(None, reference_text, transcribed_text)
    similarity_score = round(matcher.ratio() * 100, 2)
    fidelity = 'CORRECT' if similarity_score > 50 else 'INCORRECT'

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        f"<strong>Fidelity Class:</strong> {fidelity}<br>",
        f"<strong>Quality Score:</strong> {similarity_score}<br>",
        f"<strong>Transcribed Text:</strong> {html.escape(transcribed_text)}<br>",
        "<strong>Word Score List:</strong><br>",
    ]

    incorrect_words_audios = []  # (escaped word, audio path) for wrong words
    for i, word in enumerate(reference_words):
        safe_word = html.escape(word)
        if i >= transcribed_count:
            # Reference word with no transcribed counterpart: shown as wrong,
            # without a pronunciation clip. An explicit bounds check is used
            # so that errors raised while generating audio are not mistaken
            # for a missing word.
            parts.append(f'<span style="color: red;">{safe_word}</span> ')
        elif word.lower() == transcribed_words[i].lower():
            parts.append(f'<span style="color: green;">{safe_word}</span> ')
        elif difflib.get_close_matches(word, transcribed_words):
            parts.append(f'<span style="color: yellow;">{safe_word}</span> ')
        else:
            parts.append(f'<span style="color: red;">{safe_word}</span> ')
            # Generate a pronunciation clip for the wrong word.
            audio_file_path = create_pronunciation_audio(word)
            incorrect_words_audios.append((safe_word, audio_file_path))

    # Offer the pronunciation clips for the wrong words.
    # NOTE(review): the <audio> src is a relative file path; confirm the
    # hosting Gradio version actually serves it to the browser.
    if incorrect_words_audios:
        parts.append("<br><strong>Pronunciation for Incorrect Words:</strong><br>")
        for safe_word, audio in incorrect_words_audios:
            parts.append(f'{safe_word}: ')
            parts.append(
                f'<audio controls><source src="{html.escape(audio)}" type="audio/wav">'
                'Your browser does not support the audio tag.</audio><br>'
            )

    return "".join(parts)

# Step 4: Text-to-speech
def text_to_speech(paragraph):
    """Synthesize speech for *paragraph* and return the audio file path.

    Delegates to ``create_pronunciation_audio``, passing the whole paragraph
    as the text to speak.
    """
    return create_pronunciation_audio(paragraph)

# Gradio interface callback
def gradio_function(paragraph, audio):
    """Transcribe *audio* and return an HTML comparison against *paragraph*.

    Args:
        paragraph: Reference text the speaker was expected to read.
        audio: The recorded audio, or ``None`` when nothing was recorded.

    Returns:
        The HTML report from ``compare_texts``, or a short message when no
        audio was provided.
    """
    if audio is None:
        # Without a recording there is nothing to score. Passing the
        # placeholder message from transcribe_audio on to compare_texts would
        # grade it as a transcript, mark the reference words as wrong and
        # trigger speech synthesis for them.
        return "No audio file provided."

    # NOTE(review): transcribe_audio also reports recognition failures as
    # plain strings, which are still compared as if they were transcripts.
    transcribed_text = transcribe_audio(audio)
    return compare_texts(paragraph, transcribed_text)

# Tab 1: compare a recording against a reference paragraph.
interface = gr.Interface(
    gradio_function,
    [
        gr.Textbox(label="Input Paragraph", lines=5),
        gr.Audio(label="Record Audio", type="filepath"),
    ],
    "html",
    description="Input a paragraph, record your audio, and compare the transcription to the original text.",
    title="Speech Recognition Comparison",
)

# Tab 2: read a paragraph aloud.
tts_interface = gr.Interface(
    text_to_speech,
    gr.Textbox(label="Input Paragraph to Read Aloud", lines=5),
    gr.Audio(label="Text-to-Speech Output"),
    description="This tool will read your input paragraph aloud.",
    title="Text-to-Speech",
)

# Combine both interfaces into a single tabbed app.
demo = gr.TabbedInterface(
    interface_list=[interface, tts_interface],
    tab_names=["Speech Recognition", "Text-to-Speech"],
)

# Start the Gradio application.
demo.launch()