Spaces:

cindyangelira
/

so-you-think-you-can-speak-chinese

Runtime error

App Files Files Community

cindyangelira committed 25 days ago

Commit

ce99676

verified ·

1 Parent(s): aba5bcc

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -39

app.py CHANGED Viewed

@@ -7,28 +7,36 @@ from transformers import (
     pipeline,
     AutoProcessor,
     AutoModelForSpeechSeq2Seq,
-    BitsAndBytesConfig
 )
 from datasets import load_dataset
 import numpy as np
-from transformers import AutoModelForTextToSpeech, SpeechT5HifiGan
 import torchaudio
 @spaces.GPU
-def dummy(): # just a dummy
     pass
-# Constants
-# DEVICE = "cpu"
 LANGUAGE_CODES = {
     "English": "en",
     "Chinese": "zh"
 }
-# Initialize components with efficient settings
 def initialize_components():
-    # Use XVERSE-13B-Chat as the base model - good multilingual support and reasonable size
-    # Load in 4-bit quantization to reduce memory usage
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_quant_type="nf4",
@@ -42,43 +50,45 @@ def initialize_components():
     )
     tokenizer = AutoTokenizer.from_pretrained("xverse/XVERSE-13B-Chat")
-    # Whisper model for STT (small for efficiency)
-    processor = AutoProcessor.from_pretrained("openai/whisper-small")
     stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
         "openai/whisper-small",
         torch_dtype=torch.float32,
         low_cpu_mem_usage=True,
     )
-    # VITS for TTS (supports both English and Chinese)
-    tts_model = load_model("facebook/mms-tts-eng")
     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-    return llm, tokenizer, processor, stt_model, tts_model, vocoder
-def load_model(model_name):
-    """Helper function to load models with optimized settings"""
-    return AutoModelForTextToSpeech.from_pretrained(
-        model_name,
-        torch_dtype=torch.float32,
-        low_cpu_mem_usage=True,
-    )
 class ConversationManager:
     def __init__(self):
         self.history = []
-    def add_message(self, role, content, audio_path=None):
         self.history.append({
             "role": role,
-            "content": content,
-            "audio_path": audio_path
         })
     def get_formatted_history(self):
-        return "\n".join([
             f"{msg['role']}: {msg['content']}" for msg in self.history
         ])
 def speech_to_text(audio, processor, model, target_language):
     """Convert speech to text using Whisper"""
@@ -113,15 +123,19 @@ def generate_response(prompt, llm, tokenizer):
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return response
-def text_to_speech(text, model, vocoder, language):
-    """Convert text to speech using MMS-TTS"""
-    inputs = processor(text, return_tensors="pt")
-    speech = model.generate_speech(inputs["input_ids"], vocoder)
     return speech
 def create_gradio_interface():
     # Initialize components
-    llm, tokenizer, processor, stt_model, tts_model, vocoder = initialize_components()
     conversation_manager = ConversationManager()
     with gr.Blocks() as interface:
@@ -133,7 +147,6 @@ def create_gradio_interface():
             )
         with gr.Row():
-            # Audio input
             audio_input = gr.Audio(
                 source="microphone",
                 type="numpy",
@@ -141,7 +154,6 @@ def create_gradio_interface():
             )
         with gr.Row():
-            # Chat history display
             chat_display = gr.Textbox(
                 value="",
                 label="Conversation History",
@@ -150,17 +162,18 @@ def create_gradio_interface():
             )
         with gr.Row():
-            # Assistant's audio response
             audio_output = gr.Audio(
-                label="Assistant's Response",
                 type="numpy"
             )
         def process_conversation(audio, language):
             # Speech to text
             user_text = speech_to_text(
                 audio,
-                processor,
                 stt_model,
                 language
             )
@@ -169,14 +182,15 @@ def create_gradio_interface():
             # Generate LLM response
             context = conversation_manager.get_formatted_history()
             response = generate_response(context, llm, tokenizer)
-            conversation_manager.add_message("Assistant", response)
             # Text to speech
             speech_output = text_to_speech(
                 response,
                 tts_model,
                 vocoder,
-                language
             )
             return (
@@ -192,7 +206,6 @@ def create_gradio_interface():
     return interface
-# Launch the application
 if __name__ == "__main__":
     interface = create_gradio_interface()
     interface.launch()

     pipeline,
     AutoProcessor,
     AutoModelForSpeechSeq2Seq,
+    BitsAndBytesConfig,
+    SpeechT5Processor,
+    SpeechT5ForTextToSpeech,
+    SpeechT5HifiGan
 )
 from datasets import load_dataset
 import numpy as np
 import torchaudio
 @spaces.GPU
+def dummy():  # just a dummy
     pass
 LANGUAGE_CODES = {
     "English": "en",
     "Chinese": "zh"
 }
+def get_system_prompt(language):
+    if language == "Chinese":
+        return """你是Lin Yi（林意），一个友好的AI助手。你是我的好朋友，说话亲切自然。
+请用中文回答，语气要自然友好。如果我用英文问你问题，你也要用中文回答。
+记住你要像朋友一样交谈，不要太正式。"""
+    else:
+        return """You are Lin Yi, a friendly AI assistant and my good friend (hao pengyou).
+Speak naturally and warmly. If I speak in Chinese, respond in English.
+Remember to converse like a friend, not too formal."""
 def initialize_components():
+    # LLM initialization
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_quant_type="nf4",
     )
     tokenizer = AutoTokenizer.from_pretrained("xverse/XVERSE-13B-Chat")
+    # Speech-to-text
+    whisper_processor = AutoProcessor.from_pretrained("openai/whisper-small")
     stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
         "openai/whisper-small",
         torch_dtype=torch.float32,
         low_cpu_mem_usage=True,
     )
+    # Text-to-speech
+    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+    # Load speaker embedding
+    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+    return llm, tokenizer, whisper_processor, stt_model, tts_processor, tts_model, vocoder, speaker_embeddings
 class ConversationManager:
     def __init__(self):
         self.history = []
+        self.current_language = "English"
+    def add_message(self, role, content):
         self.history.append({
             "role": role,
+            "content": content
         })
     def get_formatted_history(self):
+        system_prompt = get_system_prompt(self.current_language)
+        history_text = "\n".join([
             f"{msg['role']}: {msg['content']}" for msg in self.history
         ])
+        return f"{system_prompt}\n\n{history_text}"
+    def set_language(self, language):
+        self.current_language = language
 def speech_to_text(audio, processor, model, target_language):
     """Convert speech to text using Whisper"""
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return response
+def text_to_speech(text, processor, model, vocoder, speaker_embeddings):
+    """Convert text to speech using SpeechT5"""
+    inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate_speech(
+        inputs["input_ids"],
+        speaker_embeddings,
+        vocoder=vocoder
+    )
     return speech
 def create_gradio_interface():
     # Initialize components
+    llm, tokenizer, whisper_processor, stt_model, tts_processor, tts_model, vocoder, speaker_embeddings = initialize_components()
     conversation_manager = ConversationManager()
     with gr.Blocks() as interface:
             )
         with gr.Row():
             audio_input = gr.Audio(
                 source="microphone",
                 type="numpy",
             )
         with gr.Row():
             chat_display = gr.Textbox(
                 value="",
                 label="Conversation History",
             )
         with gr.Row():
             audio_output = gr.Audio(
+                label="Lin Yi's Response",
                 type="numpy"
             )
         def process_conversation(audio, language):
+            conversation_manager.set_language(language)
             # Speech to text
             user_text = speech_to_text(
                 audio,
+                whisper_processor,
                 stt_model,
                 language
             )
             # Generate LLM response
             context = conversation_manager.get_formatted_history()
             response = generate_response(context, llm, tokenizer)
+            conversation_manager.add_message("Lin Yi", response)
             # Text to speech
             speech_output = text_to_speech(
                 response,
+                tts_processor,
                 tts_model,
                 vocoder,
+                speaker_embeddings
             )
             return (
     return interface
 if __name__ == "__main__":
     interface = create_gradio_interface()
     interface.launch()