import asyncio
import tempfile

import edge_tts
import gradio as gr
import numpy as np
import torch
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration

# Sample rate handed to the Whisper processor (the original code hard-coded
# the same value in the processor call).
WHISPER_SAMPLE_RATE = 16000


class FrenchLearningApp:
    """Conversation-practice backend: text generation, TTS and speech analysis.

    Holds a text-generation pipeline that produces the "teacher" utterance,
    a Whisper model used to transcribe and translate the learner's spoken
    reply, the running conversation context, and the list of learning goals.
    """

    def __init__(self):
        # Initialize models
        self.conversation_model = pipeline("text-generation", model="gpt2")

        # Initialize Whisper model
        self.whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
        self.whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")

        self.context = "Start a conversation in French"
        self.learning_goals = []

    def set_learning_goals(self, goals):
        """Store the learning goals given as one goal per line.

        Blank lines and surrounding whitespace are discarded: an empty goal
        would be a substring of every response and always count as "met".

        Args:
            goals: Newline-separated goals; ``None`` or "" clears the list.

        Returns:
            A status message listing the stored goals.
        """
        lines = (goals or "").splitlines()
        self.learning_goals = [line.strip() for line in lines if line.strip()]
        return f"Learning goals set: {self.learning_goals}"

    async def generate_french(self):
        """Generate the next utterance and synthesize it to speech.

        Returns:
            ``(audio_path, text)`` where ``audio_path`` is the path of a
            temporary ``.mp3`` file and ``text`` is the generated text
            (which, as returned by the pipeline, includes the prompt).
        """
        # max_new_tokens rather than max_length: max_length also counts the
        # prompt, and self.context grows with every learner reply, so a fixed
        # max_length eventually leaves no room for any new tokens.
        french_text = self.conversation_model(self.context, max_new_tokens=100)[0]['generated_text']

        # Generate audio using edge-tts
        voice = "fr-FR-HenriNeural"  # You can change this to any available French voice
        communicate = edge_tts.Communicate(french_text, voice)

        # edge-tts streams compressed audio (MP3 by default), not raw PCM, so
        # the bytes cannot be reinterpreted as int16 samples. Write them to a
        # file and let Gradio decode it from the path instead.
        # NOTE: delete=False because Gradio reads the file after we return;
        # the temp file is left for the OS temp-dir cleanup.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    audio_file.write(chunk["data"])

        return audio_file.name, french_text

    @staticmethod
    def _to_whisper_waveform(audio):
        """Convert Gradio audio into a mono float32 waveform at 16 kHz.

        Args:
            audio: ``None``, a ``(sample_rate, samples)`` tuple as produced by
                ``gr.Audio(type="numpy")``, or a bare array that is assumed to
                already be sampled at ``WHISPER_SAMPLE_RATE``.

        Returns:
            A 1-D float32 array, or ``None`` when there is nothing to process.
        """
        if audio is None:
            return None

        if isinstance(audio, tuple):
            sample_rate, samples = audio
        else:
            sample_rate, samples = WHISPER_SAMPLE_RATE, audio

        samples = np.asarray(samples)
        if samples.size == 0:
            return None

        # Integer PCM -> float in [-1, 1); must happen before the channel
        # average below, which would silently change the dtype.
        if np.issubdtype(samples.dtype, np.integer):
            full_scale = float(np.iinfo(samples.dtype).max) + 1.0
            samples = samples.astype(np.float32) / full_scale
        else:
            samples = samples.astype(np.float32)

        # (n_samples, n_channels) -> mono
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        if sample_rate != WHISPER_SAMPLE_RATE:
            # Plain linear interpolation: no anti-aliasing filter, which is a
            # deliberate trade-off to avoid adding a resampling dependency.
            n_out = max(1, int(round(samples.size * WHISPER_SAMPLE_RATE / sample_rate)))
            src_times = np.arange(samples.size) / sample_rate
            dst_times = np.arange(n_out) / WHISPER_SAMPLE_RATE
            samples = np.interp(dst_times, src_times, samples).astype(np.float32)

        return samples

    def _run_whisper(self, input_features, task):
        """Run Whisper on ``input_features`` for French audio with ``task``."""
        # Language/task are passed to generate() directly instead of mutating
        # model.config.forced_decoder_ids, which is the legacy mechanism.
        predicted_ids = self.whisper_model.generate(
            input_features, language="french", task=task
        )
        return self.whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    def process_user_response(self, audio):
        """Transcribe, translate and analyze the learner's spoken reply.

        Args:
            audio: Value delivered by the microphone ``gr.Audio`` component
                (``(sample_rate, samples)`` or ``None`` when cleared).

        Returns:
            ``(french_text, english_text, analysis)``; three empty strings
            when no audio was provided.
        """
        waveform = self._to_whisper_waveform(audio)
        if waveform is None:
            return "", "", ""

        input_features = self.whisper_processor(
            waveform, sampling_rate=WHISPER_SAMPLE_RATE, return_tensors="pt"
        ).input_features

        # Transcribe audio to French text using Whisper
        french_text = self._run_whisper(input_features, task="transcribe")

        # Translate French to English using Whisper
        english_text = self._run_whisper(input_features, task="translate")

        # Analyze response (simplified)
        analysis = self.analyze_response(english_text)

        # Update context
        self.context += f" {french_text}"

        return french_text, english_text, analysis

    def analyze_response(self, english_text):
        """Report, per learning goal, whether it appears in ``english_text``.

        Simplified analysis: a goal counts as met when its text occurs as a
        case-insensitive substring of the English translation.

        Returns:
            One "Goal met: ..." / "Goal not yet met: ..." line per goal,
            joined with newlines (empty string when no goals are set).
        """
        haystack = english_text.lower()
        analysis = [
            f"Goal met: {goal}" if goal.lower() in haystack else f"Goal not yet met: {goal}"
            for goal in self.learning_goals
        ]
        return "\n".join(analysis)


def _microphone_audio(**kwargs):
    """Create a microphone ``gr.Audio`` input across Gradio versions.

    Tries the ``source=`` keyword used by the original code first and falls
    back to ``sources=[...]`` when the installed Gradio rejects it.
    """
    try:
        return gr.Audio(source="microphone", **kwargs)
    except TypeError:
        return gr.Audio(sources=["microphone"], **kwargs)


def launch_app():
    """Build the Gradio interface and start serving it."""
    app = FrenchLearningApp()

    with gr.Blocks() as interface:
        gr.Markdown("# French Learning Application")

        with gr.Tab("Teacher Setup"):
            goals_input = gr.Textbox(label="Enter learning goals (one per line)")
            set_goals_button = gr.Button("Set Learning Goals")
            goals_output = gr.Textbox(label="Goals Status")
            set_goals_button.click(app.set_learning_goals, inputs=goals_input, outputs=goals_output)

        with gr.Tab("Conversation"):
            generate_button = gr.Button("Generate French")
            audio_output = gr.Audio(label="AI Speech")
            french_output = gr.Textbox(label="French Text")
            # Kept as a sync wrapper so the blocking model call runs in
            # Gradio's worker thread rather than on its event loop.
            generate_button.click(
                lambda: asyncio.run(app.generate_french()),
                inputs=None,
                outputs=[audio_output, french_output],
            )

            audio_input = _microphone_audio(type="numpy", label="Your Response")
            transcription_output = gr.Textbox(label="Your Speech (Transcribed)")
            translation_output = gr.Textbox(label="English Translation")
            analysis_output = gr.Textbox(label="Analysis")

            audio_input.change(
                app.process_user_response,
                inputs=audio_input,
                outputs=[transcription_output, translation_output, analysis_output],
            )

    interface.launch()


if __name__ == "__main__":
    launch_app()