import os
import tempfile

import streamlit as st
from streamlit_mic_recorder import mic_recorder
from pydub import AudioSegment
from faster_whisper import WhisperModel

import outlines
from outlines.models import openai


# Load the Whisper model once and cache it across Streamlit reruns.
@st.cache_resource
def load_model():
    return WhisperModel("large-v3", device="auto", compute_type="int8")


whisper_model = load_model()

st.title("🗣 Chinese Speech Recognition (Whisper + Mic Recorder)")

mode = st.radio(
    "Choose an input method",
    ["🎤 Record with microphone", "📁 Upload a local audio file", "✍️ Manual text input"],
    horizontal=True,
)


def prompt_switch(model_name, input_text):
    """Return the intent-classification prompt for the selected model."""
    # The JSON examples use "unrelated" so they match the outlines choice
    # labels used in classify_intent() below.
    numbered_prompt = """
You are an assistant for intent classification.
Your task is to classify a given user input into one of the following two categories:
1. "Reservation": the user input is related to or implies a restaurant reservation.
2. "unrelated": the user input is anything else.
Your response should be in JSON format, either {{"result": "Reservation"}} or {{"result": "unrelated"}}.
If the user input is related to a restaurant reservation, return {{"result": "Reservation"}};
if the user input is anything else, return {{"result": "unrelated"}}.
Here is the user input: {input}
""".strip()

    plain_prompt = """
You are an assistant for intent classification.
Your task is to classify a given user input into one of the following two categories:
"Reservation": the user input is related to or implies a restaurant reservation.
"unrelated": the user input is anything else.
Your response should be in JSON format, either {{"result": "Reservation"}} or {{"result": "unrelated"}}.
Here is the user input: {input}
""".strip()

    # Four of the five models share the same numbered prompt; only the
    # Coder model uses the shorter variant.
    prompts = {
        "Qwen/Qwen2.5-7B-Instruct-Turbo": numbered_prompt,
        "Qwen/Qwen2.5-Coder-32B-Instruct": plain_prompt,
        "google/gemma-2b-it": numbered_prompt,
        "google/gemma-2-9b-it": numbered_prompt,
        "google/gemma-2-27b-it": numbered_prompt,
    }
    return prompts[model_name].format(input=input_text)


model_option = st.selectbox(
    "Which model do you want to use?",
    (
        "Qwen/Qwen2.5-7B-Instruct-Turbo",
        "Qwen/Qwen2.5-Coder-32B-Instruct",
        "google/gemma-2b-it",
        "google/gemma-2-9b-it",
        "google/gemma-2-27b-it",
    ),
)
st.write("Selected model:", model_option)


def classify_intent(input_text):
    """Classify the input as a reservation request or unrelated text."""
    st.write("🧠 Classifying intent with the LLM...")
    labels = ["Reservation", "unrelated"]
    model = openai(
        model_option,
        api_key=os.environ["TOGETHER_API_KEY"],
        base_url="https://api.together.xyz/v1",
    )
    # Constrain generation so the model can only return one of the labels.
    generator = outlines.generate.choice(model, labels)
    prompt_message = prompt_switch(model_option, input_text)
    st.write(prompt_message)
    answer = generator(prompt_message)
    return answer


def convert_audio_to_wav(audio_bytes, target_sample_rate=16000):
    """Write raw audio bytes to disk and convert them to mono 16 kHz WAV."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_input:
        temp_input.write(audio_bytes)
        temp_input_path = temp_input.name
    audio = AudioSegment.from_file(temp_input_path)
    audio = audio.set_channels(1).set_frame_rate(target_sample_rate)
    converted_path = temp_input_path.replace(".wav", "_converted.wav")
    audio.export(converted_path, format="wav")
    os.remove(temp_input_path)
    return converted_path


def transcribe_audio(wav_path):
    """Transcribe the WAV file with Whisper, then run intent classification."""
    st.write("🧠 Whisper is transcribing the audio...")
    segments, info = whisper_model.transcribe(wav_path, language="zh")
    result_text = "".join(seg.text for seg in segments)
    st.text_area("📜 Transcription", result_text, height=200)
    os.remove(wav_path)
    intent_classification(result_text)


def intent_classification(input_text):
    st.write("🧠 Intent classification")
    intent = classify_intent(input_text)
    st.write(intent)


# --- Mode: Microphone ---
if mode == "🎤 Record with microphone":
    audio_data = mic_recorder(
        start_prompt="🎤 Click to start recording",
        stop_prompt="⏹️ Stop recording",
        just_once=True,
        use_container_width=True,
    )
    if audio_data:
        st.audio(audio_data["bytes"], format="audio/wav")
        wav_path = convert_audio_to_wav(audio_data["bytes"])
        transcribe_audio(wav_path)

# --- Mode: File Upload ---
elif mode == "📁 Upload a local audio file":
    uploaded_file = st.file_uploader(
        "Upload an audio file (wav, mp3, m4a, etc. supported)",
        type=["wav", "mp3", "m4a", "ogg", "flac"],
    )
    if uploaded_file is not None:
        st.audio(uploaded_file, format="audio/wav")
        wav_path = convert_audio_to_wav(uploaded_file.read())
        transcribe_audio(wav_path)

# --- Mode: Manual Text Input ---
elif mode == "✍️ Manual text input":
    manual_text = st.text_area("Enter your text", height=200, key="manual_input")
    if st.button("Confirm input"):
        st.success("✅ Input received!")
        st.text_area("📜 Input text", manual_text, height=200, key="manual_output")
        intent_classification(manual_text)
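
# --- Running the app ---
# A minimal sketch of how to launch this script, assuming it is saved as
# app.py (the filename is an assumption, not from the source) and that
# ffmpeg is available on PATH for pydub's audio decoding. The openai
# package is listed because outlines' OpenAI-compatible client typically
# depends on it:
#
#   pip install streamlit streamlit-mic-recorder pydub faster-whisper outlines openai
#   export TOGETHER_API_KEY=...   # Together AI key read by classify_intent()
#   streamlit run app.py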