"""Streamlit app that scores a user's Azaan recitation against a reference.

Pipeline for an uploaded recording:

1. Transcribe the upload and the reference recording to Arabic text.
2. Romanize both transcripts and compare them with a bag-of-words cosine
   similarity; stop if the words do not match well enough.
3. Compare mean-pooled Wav2Vec2 embeddings of both recordings.
4. Ask a Groq-hosted LLM to turn the pronunciation score into feedback.
"""

import os
from io import BytesIO

import librosa
import speech_recognition as sr
import streamlit as st
import torch
import torch.nn.functional as F
from groq import Groq
from pydub import AudioSegment
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Reference ("expert") recording the user's upload is compared against.
EXPERT_AUDIO_PATH = "Hafiz muqeem.wav"
# Hugging Face checkpoint used for both the feature extractor and the model.
WAV2VEC2_CHECKPOINT = "facebook/wav2vec2-base-960h"
# Sample rate every recording is resampled to before feature extraction.
TARGET_SAMPLE_RATE = 16000
# Minimum transcript similarity required before pronunciation is scored.
MIN_TEXT_SIMILARITY = 0.1
# Groq chat model used to phrase the feedback.
DEFAULT_LLM_MODEL = "llama3-8b-8192"

# Word-level Arabic -> Latin mapping for the vocabulary of the Azaan.
ROMANIZED_MAPPING = {
    "الله": "Allahu",
    "اكبر": "akbar",
    "اشهد": "Ashhadu",
    "ان": "an",
    "لا": "la",
    "اله": "ilaha",
    "الا": "illa",
    "محمد": "Muhammad",
    "رسول": "Rasul",
    "حي": "Hayya",
    "على": "'ala",
    "الصلاه": "as-salah",
    "الفلاح": "al-falah",
}

# SECURITY: the API key must come from the environment, never from source.
# A key was previously committed in this file; revoke it and issue a new one.
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Pretrained Wav2Vec2 feature extractor and model used for audio embeddings.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(WAV2VEC2_CHECKPOINT)
model = Wav2Vec2Model.from_pretrained(WAV2VEC2_CHECKPOINT)


def _load_as_wav(file_input):
    """Decode *file_input* with pydub and return it as in-memory WAV data.

    Args:
        file_input: A filesystem path (``str``) or a binary file-like object
            such as a Streamlit upload.

    Returns:
        A ``BytesIO`` positioned at offset 0 that holds WAV-encoded audio.

    Raises:
        ValueError: If the audio cannot be read or decoded.
    """
    try:
        if not isinstance(file_input, str):
            # The same upload object is read more than once per run, so
            # rewind it first; otherwise a later read() returns nothing.
            if hasattr(file_input, "seek"):
                file_input.seek(0)
            file_input = BytesIO(file_input.read())
        segment = AudioSegment.from_file(file_input)
        wav_io = BytesIO()
        segment.export(wav_io, format="wav")
    except Exception as exc:
        raise ValueError(f"Failed to process the audio file: {exc}") from exc
    wav_io.seek(0)
    return wav_io


def transcribe_audio(audio_file):
    """Transcribe an audio recording into Arabic text.

    The input is converted to WAV first because ``sr.AudioFile`` cannot read
    compressed formats such as mp3 or m4a, which the uploader accepts.

    Args:
        audio_file: A filesystem path or a binary file-like object.

    Returns:
        The recognized text, or ``None`` if the audio could not be decoded,
        the speech was unintelligible, or the recognition request failed.
    """
    try:
        wav_io = _load_as_wav(audio_file)
    except ValueError:
        return None

    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(wav_io) as source:
            audio_data = recognizer.record(source)  # read the entire file
        return recognizer.recognize_google(audio_data, language="ar-SA")
    except (sr.UnknownValueError, sr.RequestError):
        return None


def romanize_arabic(text):
    """Replace known Arabic words in *text* with their Romanized form.

    Words are split on whitespace; words missing from ``ROMANIZED_MAPPING``
    are kept unchanged.

    Args:
        text: Whitespace-separated Arabic text.

    Returns:
        The converted words joined by single spaces.
    """
    return " ".join(ROMANIZED_MAPPING.get(word, word) for word in text.split())


def get_audio_embedding(file_input):
    """Compute a Wav2Vec2 embedding for an audio recording.

    Args:
        file_input: A filesystem path or a binary file-like object.

    Returns:
        A tensor holding the model's last hidden state averaged over
        dimension 1.

    Raises:
        ValueError: If the audio cannot be decoded or contains no samples.
    """
    wav_io = _load_as_wav(file_input)
    try:
        audio_data, sample_rate = librosa.load(wav_io, sr=TARGET_SAMPLE_RATE)
    except Exception as exc:
        raise ValueError(f"Failed to process the audio file: {exc}") from exc
    if audio_data.size == 0:
        raise ValueError("Failed to process the audio file: no audio samples found")

    inputs = feature_extractor(
        audio_data,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings


def compare_embeddings(embedding_1, embedding_2):
    """Return the cosine similarity of two embeddings as a Python float.

    The similarity is taken along dimension 1 and converted with ``.item()``,
    so each input must yield exactly one similarity value.
    """
    return F.cosine_similarity(embedding_1, embedding_2, dim=1).item()


def compare_text_similarity(text1, text2):
    """Return the cosine similarity of two texts' word-count vectors.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The similarity as a float, or ``0.0`` when ``CountVectorizer``
        cannot build a vocabulary from the two texts.
    """
    try:
        counts = CountVectorizer().fit_transform([text1, text2])
    except ValueError:
        # Raised when neither text yields a token (empty vocabulary).
        return 0.0
    return float(cosine_similarity(counts)[0][1])


def generate_llm_feedback(similarity_score, model_name=DEFAULT_LLM_MODEL):
    """Ask the Groq LLM for pronunciation feedback based on a score.

    Args:
        similarity_score: Pronunciation similarity score; it is formatted to
            two decimals inside the prompt.
        model_name: Groq chat model to query.

    Returns:
        The text content of the first completion choice.
    """
    feedback_prompt = f"""
    A user has just pronounced part of the Azaan, and the similarity score between their pronunciation and the reference Azaan is {similarity_score:.2f}.

    Based on this score:
    - If the score is above 0.9, the pronunciation is excellent.
    - If the score is between 0.7 and 0.9, the pronunciation is good but may need slight improvement.
    - If the score is below 0.7, the pronunciation requires significant improvement.

    Provide detailed feedback for the user about their pronunciation, considering their score of {similarity_score:.2f}.
    """
    chat_completion = groq_client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": feedback_prompt,
            }
        ],
        model=model_name,
    )
    return chat_completion.choices[0].message.content


# Custom CSS for styling
st.markdown(
    """ """,
    unsafe_allow_html=True,
)


def _evaluate_upload(uploaded_file, expert_audio_path):
    """Run the transcription, text and pronunciation checks for one upload."""
    st.audio(uploaded_file, format="audio/wav")

    if not os.path.exists(expert_audio_path):
        st.error(f"❌ Reference recording not found: {expert_audio_path}")
        return

    # Step 1: transcribe the user's audio and the reference audio.
    st.write("🎤 Step 1: Checking if the words match...")
    user_text = transcribe_audio(uploaded_file)
    expert_text = transcribe_audio(expert_audio_path)

    if not (user_text and expert_text):
        st.error("❌ There was an error transcribing one or both audio files.")
        return

    st.write("✅ Transcription successful!")
    st.write(f"**Expert Azaan Text:** {expert_text}")
    st.write(f"**Your Azaan Text:** {user_text}")

    # Step 2: romanize both transcripts and compare them.
    user_romanized = romanize_arabic(user_text)
    expert_romanized = romanize_arabic(expert_text)
    text_similarity = compare_text_similarity(user_romanized, expert_romanized)
    st.write(f"📝 Text Similarity Score: {text_similarity:.2f}")

    if text_similarity < MIN_TEXT_SIMILARITY:
        st.warning("⚠️ Your words do not match sufficiently. Please try again.")
        return

    st.success("✅ Great! Your words match well enough. Now, let's evaluate your pronunciation.")

    # Step 3: compare the audio embeddings.
    try:
        expert_embedding = get_audio_embedding(expert_audio_path)
        user_embedding = get_audio_embedding(uploaded_file)
    except ValueError as exc:
        st.error(f"❌ {exc}")
        return

    pronunciation_similarity = compare_embeddings(expert_embedding, user_embedding)
    st.write(f"🔊 Pronunciation Similarity Score: {pronunciation_similarity:.2f}")

    # Model-generated text is rendered as plain markdown, without
    # unsafe_allow_html, so any HTML in the response is not interpreted.
    feedback = generate_llm_feedback(pronunciation_similarity)
    st.markdown(f"\n{feedback}\n")


# Streamlit UI
def main(expert_audio_path=EXPERT_AUDIO_PATH):
    """Render the Streamlit page and evaluate an uploaded recording.

    Args:
        expert_audio_path: Path of the reference recording to compare with.
    """
    st.title("🔔 Azaan Pronunciation Evaluation")
    st.markdown(
        "\n\nWelcome to the Azaan Pronunciation Evaluation!\n\n",
        unsafe_allow_html=True,
    )

    st.subheader("Upload Your Audio")
    uploaded_file = st.file_uploader(
        "Choose an audio file...", type=["wav", "mp3", "m4a"]
    )

    if uploaded_file is not None:
        _evaluate_upload(uploaded_file, expert_audio_path)

    st.markdown("", unsafe_allow_html=True)


if __name__ == "__main__":
    main()