"""Gradio front end for Azure Speech-to-Text with language/dialect selection."""

import os
import time

import azure.cognitiveservices.speech as speechsdk
import gradio as gr

# Azure credentials.
# Prefer the AZURE_SPEECH_KEY / AZURE_SPEECH_REGION environment variables.
# NOTE(review): the literal fallback key below is committed to source control
# and should be treated as leaked -- rotate it in the Azure portal and delete
# the fallback once the environment variable is configured everywhere.
SPEECH_KEY = os.environ.get(
    "AZURE_SPEECH_KEY",
    "vkv2VVTi1agTmU74Sz8C62mOymEHmoknwCaQEnEsroK1AE0B7xt9JQQJ99BDACI8hq2XJ3w3AAAYACOGrzMV",
)
SERVICE_REGION = os.environ.get("AZURE_SPEECH_REGION", "switzerlandnorth")

# Locale used when the selected language/dialect pair is not in the mapping.
DEFAULT_LOCALE = "en-US"

# Define the language and dialect mapping:
# language name -> {dialect (country/region) name -> locale code}.
language_dialects = {
    "Arabic": {
        "Egypt": "ar-EG",
        "Saudi Arabia": "ar-SA",
        "United Arab Emirates": "ar-AE",
        "Bahrain": "ar-BH",
        "Algeria": "ar-DZ",
        "Iraq": "ar-IQ",
        "Jordan": "ar-JO",
        "Kuwait": "ar-KW",
        "Lebanon": "ar-LB",
        "Libya": "ar-LY",
        "Morocco": "ar-MA",
        "Oman": "ar-OM",
        "Palestinian Authority": "ar-PS",
        "Qatar": "ar-QA",
        "Syria": "ar-SY",
        "Tunisia": "ar-TN",
        "Yemen": "ar-YE",
    },
    "English": {
        "United States": "en-US",
        "United Kingdom": "en-GB",
        "Australia": "en-AU",
        "Canada": "en-CA",
        "India": "en-IN",
        "Ireland": "en-IE",
        "New Zealand": "en-NZ",
        "South Africa": "en-ZA",
        "Singapore": "en-SG",
        "Philippines": "en-PH",
    },
    "French": {
        "France": "fr-FR",
        "Canada": "fr-CA",
        "Switzerland": "fr-CH",
    },
    "Spanish": {
        "Spain": "es-ES",
        "Mexico": "es-MX",
        "Argentina": "es-AR",
        "Colombia": "es-CO",
        "Chile": "es-CL",
        "Peru": "es-PE",
        "Venezuela": "es-VE",
    },
    "German": {
        "Germany": "de-DE",
        "Austria": "de-AT",
        "Switzerland": "de-CH",
    },
    "Portuguese": {
        "Portugal": "pt-PT",
        "Brazil": "pt-BR",
    },
    "Chinese": {
        "Mainland China": "zh-CN",
        "Hong Kong": "zh-HK",
        "Taiwan": "zh-TW",
    },
    "Italian": {
        "Italy": "it-IT",
    },
    "Japanese": {
        "Japan": "ja-JP",
    },
    "Korean": {
        "Korea": "ko-KR",
    },
    # Add more languages and dialects as needed
}


# Function to get dialects based on selected language
def get_dialects(language):
    """Return a Gradio update that repopulates the dialect dropdown.

    Args:
        language: Language name selected in the UI; expected to be a key of
            ``language_dialects``.

    Returns:
        A ``gr.update`` payload whose ``choices`` are the dialect names for
        ``language`` and whose ``value`` is the first of them. For an unknown
        language the choices are empty and the value is ``None``.
    """
    dialects = list(language_dialects.get(language, {}))
    return gr.update(choices=dialects, value=dialects[0] if dialects else None)


# Function to transcribe audio
def transcribe_audio(audio_file, duration, language, dialect):
    """Transcribe an audio file with Azure Speech-to-Text.

    Args:
        audio_file: Path of the audio file to transcribe, or ``None`` when
            nothing has been uploaded.
        duration: Number of seconds to wait before recognition starts. This
            only simulates a recording delay; it does not trim the audio.
        language: Language name, a key of ``language_dialects``.
        dialect: Dialect name within ``language``.

    Returns:
        The recognized text, or a human-readable message describing why no
        transcription was produced.
    """
    # Without this guard a missing upload reaches AudioConfig(filename=None),
    # which has no audio source to open.
    if not audio_file:
        return "Please upload an audio file before transcribing"

    # Simulate recording duration. Only sleep for a positive value so that a
    # cleared (None) or invalid selection cannot make time.sleep() raise.
    if duration and duration > 0:
        print(f"Recording for {duration} seconds...")
        time.sleep(duration)

    # Get the locale code, falling back to the default for unknown pairs.
    locale_code = language_dialects.get(language, {}).get(dialect, DEFAULT_LOCALE)
    print(f"Selected Locale Code: {locale_code}")

    # Set up speech recognition
    speech_config = speechsdk.SpeechConfig(
        subscription=SPEECH_KEY, region=SERVICE_REGION
    )
    speech_config.speech_recognition_language = locale_code
    # Named audio_config so it does not shadow the module-level Gradio
    # component called audio_input.
    audio_config = speechsdk.audio.AudioConfig(filename=audio_file)
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, audio_config=audio_config
    )

    # NOTE(review): recognize_once() performs single-shot recognition, so a
    # long file is presumably only transcribed up to its first utterance --
    # confirm against the SDK docs if full-file transcription is expected.
    result = speech_recognizer.recognize_once()

    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return result.text
    if result.reason == speechsdk.ResultReason.NoMatch:
        return "No speech could be recognized"
    if result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        # error_details can be empty; fall back to the cancellation reason so
        # the message is never blank after the colon.
        detail = cancellation_details.error_details or cancellation_details.reason
        return f"Speech recognition canceled: {detail}"
    return "Unknown error occurred during speech recognition"


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Azure Speech to Text with Language and Dialect Selection")

    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload Audio")
        duration_input = gr.Dropdown(
            choices=[5, 10], label="Recording Duration", value=5
        )

    with gr.Row():
        language_input = gr.Dropdown(
            choices=list(language_dialects.keys()), label="Select Language"
        )
        dialect_input = gr.Dropdown(choices=[], label="Select Dialect")

    transcribe_button = gr.Button("Transcribe")
    output_text = gr.Textbox(label="Transcription Result")

    # Update dialect options based on selected language
    language_input.change(
        fn=get_dialects, inputs=language_input, outputs=dialect_input
    )

    # Transcribe audio on button click
    transcribe_button.click(
        fn=transcribe_audio,
        inputs=[audio_input, duration_input, language_input, dialect_input],
        outputs=output_text,
    )

# Launch the app
demo.launch()