import torch
import os
import subprocess
import sys
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import spacy
import gradio as gr
from pydub import AudioSegment  # pydub handles audio format conversion

# Make sure NumPy is available (required by the audio / transformers stack)
try:
    import numpy as np
except ImportError:
    print("NumPy not found, attempting to install it automatically...")
    try:
        subprocess.run([sys.executable, "-m", "pip", "install", "numpy"], check=True)
        import numpy as np
    except Exception as e:
        print(f"Automatic NumPy installation failed: {e}")

# Select device and dtype
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Whisper model initialization (speech-to-text)
whisper_model_id = "openai/whisper-large-v3"
whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    whisper_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)
whisper_model.to(device)
whisper_processor = AutoProcessor.from_pretrained(whisper_model_id)

whisper_pipe = None
try:
    whisper_pipe = pipeline(
        "automatic-speech-recognition",
        model=whisper_model,
        tokenizer=whisper_processor.tokenizer,
        feature_extractor=whisper_processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device)
except Exception as e:
    print(f"Error while initializing the Whisper pipeline: {e}")

# spaCy initialization (entity extraction / tagging)
nlp = None
try:
    nlp = spacy.load("en_core_web_sm")
except Exception as e:
    print(f"Error while loading the spaCy model: {e}")


def process_audio(audio_file):
    # Convert the uploaded audio (e.g. MP3) to WAV so Whisper can read it
    try:
        audio_segment = AudioSegment.from_file(audio_file)
        base_name = os.path.splitext(os.path.basename(audio_file))[0]
        wav_path = os.path.join("/tmp", base_name + ".wav")  # store the converted file in /tmp
        audio_segment.export(wav_path, format="wav")
    except Exception as e:
        return {"Error": f"Audio-to-WAV conversion failed: {e}"}

    # Speech-to-text
    try:
        result = whisper_pipe(wav_path)["text"]
        try:
            # Use T5 as the response model (T5 is an encoder-decoder,
            # so it runs under the text2text-generation task)
            text_pipe = pipeline("text2text-generation", model="t5-base")
            t5_response = text_pipe(result)[0]["generated_text"]

            # Analyse the generated text with spaCy
            doc = nlp(t5_response) if nlp is not None else None
            entities = [(ent.text, ent.label_) for ent in doc.ents] if doc is not None else []

            return {
                "Transcription (Whisper)": result,
                "AI Response (T5)": t5_response,
                "Extracted Entities (spaCy)": entities}
        except Exception:
            # Keep at least the raw transcription if the downstream steps fail
            return {"Transcription (Whisper)": result}
    except Exception as e:
        return {"Error": f"Speech-to-text failed: {e}"}


def clear_input():
    # Reset both the audio input and the JSON output
    return None, None


with gr.Blocks() as app:
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload audio")
        output_text = gr.JSON(label="Results")
    submit_button = gr.Button("Submit")
    submit_button.click(fn=process_audio, inputs=[audio_input], outputs=[output_text])
    clear_button = gr.Button("Clear")
    clear_button.click(fn=clear_input, inputs=[], outputs=[audio_input, output_text])

if __name__ == "__main__":
    app.launch()
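
# Optional: a quick smoke test of the pipeline without launching the Gradio UI.
# This is a minimal sketch; "sample.mp3" is a hypothetical local file and is not
# part of the original script.
#
#   result = process_audio("sample.mp3")
#   print(result)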