tzzte committed on
Commit
0e29aed
·
verified ·
1 Parent(s): 22bedd1

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +147 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import subprocess
import sys

# Runtime bootstrap: install build-sensitive dependencies with the running
# interpreter's pip before the heavier imports further down the file.
# Order matters: pip itself is pinned first, then omegaconf, then fairseq.
# NOTE(review): presumably pip is pinned because newer pip releases refuse
# the metadata of the old omegaconf release that fairseq needs -- confirm.
for _requirement in (
    "pip==24.0",
    "omegaconf==2.0.6",
    "git+https://github.com/facebookresearch/fairseq.git@v0.12.2",
):
    # check_call raises CalledProcessError if an install fails, aborting startup.
    subprocess.check_call([sys.executable, "-m", "pip", "install", _requirement])
del _requirement  # do not leave the loop variable in the module namespace
7
+ import gradio as gr
8
+ import os
9
+ import torch
10
+ import librosa
11
+ import soundfile as sf
12
+ import tempfile
13
+ import spaces # ZeroGPU requirement
14
+
15
+ # 导入你的模块
16
+ import Echox_copy_stream as Echox
17
+
18
# Set before any model code runs; the variable is read by the Hugging Face
# tokenizers library (presumably to silence its fork/parallelism warning).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Module-level state shared by init_model() and process_audio_text().
inference_model = None  # lazily created assistant instance
_MODEL_ON_CUDA = False  # flipped to True once the model has been moved to the GPU
23
+
24
def init_model():
    """Return the shared assistant, constructing it on the first call.

    The instance is stored in the module-level ``inference_model`` so later
    calls reuse it. No device is selected here; the original comment states
    the model is initialised on the CPU (presumably the default of
    ``Echox.EchoxAssistant()`` -- confirm).

    Returns:
        The ``Echox.EchoxAssistant`` instance held in ``inference_model``.
    """
    global inference_model
    if inference_model is not None:
        return inference_model
    inference_model = Echox.EchoxAssistant()
    return inference_model
30
+
31
def process_audio_input(audio):
    """Normalise an audio value into a path to an audio file on disk.

    Args:
        audio: One of ``None``, a file path (``str``), a
            ``(sample_rate, samples)`` tuple, or a bare array of samples
            (written with an assumed sample rate of 16000 Hz).

    Returns:
        ``None`` when ``audio`` is ``None`` or conversion fails; the path
        unchanged when ``audio`` is already a string; otherwise the path of
        a newly written temporary ``.wav`` file, which the caller is
        responsible for deleting.
    """
    if audio is None:
        return None

    # Already a file path: nothing to convert.
    if isinstance(audio, str):
        return audio

    tmp_path = None
    try:
        if isinstance(audio, tuple):
            sr, y = audio
        else:
            # Bare array of samples: fall back to the default sample rate.
            y, sr = audio, 16000

        # Keep only the first channel, for either input form.
        if getattr(y, "ndim", 1) > 1:
            y = y[:, 0]

        # Reserve a unique file name, then close the handle before writing:
        # writing by name to a still-open temporary file fails on Windows.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
            tmp_path = tmp_file.name
        sf.write(tmp_path, y, sr)
        return tmp_path

    except Exception as e:
        # Best-effort contract: report the problem and return None.
        print(f"Error processing audio: {e}")
        # Do not leak the temporary file when the write itself failed.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
        return None
59
+
60
@spaces.GPU(duration=180)  # run on ZeroGPU with a 3-minute limit
def process_audio_text(text, audio):
    """Stream the assistant's text and audio reply for one request.

    Generator used as the Gradio handler. On the first call it moves the
    shared model to CUDA, then feeds a single-turn conversation to
    ``inference_model._inference`` and relays what it produces.

    Args:
        text: Text instruction from the UI. Currently ignored: it is
            replaced by an empty string before the request is built.
        audio: Audio input in any form accepted by ``process_audio_input``.

    Yields:
        ``(text, audio)`` pairs, where ``text`` is the latest non-empty text
        response and ``audio`` is a ``(sample_rate, samples)`` tuple or
        ``None``. If inference raises, a single ``("Error: ...", None)``
        pair is yielded instead of propagating the exception.
    """
    global _MODEL_ON_CUDA, inference_model

    # Create the model if it has not been initialised yet.
    if inference_model is None:
        init_model()

    # Move the model to the GPU the first time a GPU is available.
    if not _MODEL_ON_CUDA:
        try:
            if hasattr(inference_model, 'model'):
                inference_model.model = inference_model.model.to("cuda")
            if hasattr(inference_model, 'unit_translator'):
                inference_model.unit_translator = inference_model.unit_translator.to("cuda")

            inference_model.device = "cuda"
            _MODEL_ON_CUDA = True
            print("Model moved to GPU")
        except Exception as e:
            # Best effort: inference is still attempted where the model is.
            print(f"Error moving model to GPU: {e}")

    audio_path = process_audio_input(audio)
    # A path that did not come in as a string was created by
    # process_audio_input and must be removed afterwards. (Comparing the
    # path against a NumPy array with ``!=`` would raise on truth-testing.)
    created_temp_file = audio_path is not None and not isinstance(audio, str)

    # NOTE(review): the user's text instruction is discarded here, so the
    # model only ever receives the audio. Kept as-is because it may be
    # deliberate -- confirm, or pass ``text`` through.
    text = ""

    tmp = [{
        "conversations": [
            {
                "from": "user",
                "value": text,
                "audio": audio_path,
            }
        ]
    }]

    accumulated_text = ""

    try:
        for text_response, audio_data in inference_model._inference(tmp):
            # Keep showing the previous text while only audio is arriving.
            if text_response:
                accumulated_text = text_response

            if audio_data is not None:
                sr, audio_array = audio_data
                yield accumulated_text, (sr, audio_array)
            else:
                yield accumulated_text, None
    except Exception as e:
        yield f"Error: {str(e)}", None
    finally:
        # Remove the temporary file, if one was created for this request.
        if created_temp_file:
            try:
                os.unlink(audio_path)
            except FileNotFoundError:
                pass  # already removed
            except OSError as e:
                print(f"Error removing temporary file {audio_path}: {e}")
120
+
121
# Build the model once at import time (intended to stay on the CPU; the
# request handler moves it to the GPU on its first call).
init_model()

if __name__ == "__main__":
    # Each example row is [text instruction, audio file path].
    examples = [
        ["", "./show_case/1.wav"],
        ["", "./show_case/2.wav"],
    ]

    iface = gr.Interface(
        fn=process_audio_text,
        inputs=[
            gr.Textbox(label="Enter text instruction", value=""),
            gr.Audio(type="filepath", label="Upload Audio"),
        ],
        outputs=[
            gr.Textbox(label="Model output"),
            gr.Audio(label="Streamed Audio", streaming=True, autoplay=True),
        ],
        examples=examples,
        title="🔊 EchoX Assistant",
        description="A multimodal AI assistant that understands speech and responds with both text and audio",
        live=False,
        # Gradio 5 renamed ``allow_flagging`` to ``flagging_mode``; the old
        # keyword is deprecated.
        flagging_mode="never",
    )

    iface.launch(server_name="0.0.0.0", server_port=7860, share=True)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio==5.44.1
2
+ librosa==0.10.2.post1
3
+ numpy==1.24.4
4
+ peft==0.5.0
5
+ sentencepiece==0.2.0
6
+ soundfile==0.12.1
7
+ torch==2.3.0
8
+ tqdm==4.66.5
9
+ transformers==4.49.0