txh17 committed on
Commit
3320745
·
verified ·
1 Parent(s): 5957b86

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +211 -0
app.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ import torch
4
+ from diffusers import StableDiffusionPipeline
5
+ import soundfile as sf
6
+ import speech_recognition as sr
7
+ import numpy as np
8
+ import os
9
+
10
# Initialise components
# Use a smaller open-source LLM for prompt enhancement
llm_pipe = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1")

# Initialise Stable Diffusion.
# Half precision is only requested when a CUDA device is present; on CPU the
# pipeline is loaded in float32 because float16 inference is not usable there.
_sd_device = "cuda" if torch.cuda.is_available() else "cpu"
sd_pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if _sd_device == "cuda" else torch.float32,
).to(_sd_device)

# Speech recognition initialisation
recognizer = sr.Recognizer()
22
+
23
def enhance_prompt(basic_prompt, style, detail_level, artist_style):
    """Expand a short description into a detailed Stable Diffusion prompt using the LLM.

    Args:
        basic_prompt: Short description of the desired image.
        style: Style name interpolated into the instruction sent to the LLM.
        detail_level: Detail level interpolated into the instruction.
        artist_style: Artist style interpolated into the instruction.

    Returns:
        The text generated by ``llm_pipe`` with surrounding whitespace
        stripped, or ``basic_prompt`` when the model returns no text.
    """
    prompt_template = f"""
    根据以下简短描述创建一个详细的Stable Diffusion提示:
    原始描述: {basic_prompt}
    风格: {style}
    细节级别: {detail_level}
    艺术家风格: {artist_style}

    请生成一个包含以下元素的详细提示:
    - 主体描述
    - 环境/背景
    - 光照条件
    - 色彩风格
    - 艺术媒介(如数字绘画、油画等)
    - 质量描述(如4K、超详细等)

    生成的提示:
    """

    outputs = llm_pipe(
        prompt_template,
        # max_new_tokens limits only the continuation; max_length would also
        # count the (long) instruction prompt and could leave no room to generate.
        max_new_tokens=200,
        num_return_sequences=1,
        # temperature only takes effect when sampling is enabled.
        do_sample=True,
        temperature=0.7,
        # Ask the pipeline for the continuation only instead of stripping the
        # prompt back out of the result with str.replace.
        return_full_text=False,
    )

    enhanced_prompt = outputs[0]['generated_text'].strip()

    # An empty generation would leave the image model without any prompt.
    return enhanced_prompt or basic_prompt
53
+
54
def generate_image(enhanced_prompt, steps, guidance_scale, seed):
    """Generate one image with Stable Diffusion.

    Args:
        enhanced_prompt: Text prompt passed to ``sd_pipe``.
        steps: Number of inference steps; coerced to ``int``.
        guidance_scale: Guidance scale passed through to ``sd_pipe``.
        seed: Seed for the random generator. ``-1`` or ``None`` selects a
            random seed; any other value is coerced to ``int``.

    Returns:
        A ``(image, seed)`` tuple: the first generated image and the integer
        seed that was actually used.
    """
    # UI number widgets may hand over floats (or None when the field is
    # cleared), while torch.Generator.manual_seed only accepts integers.
    if seed is None or seed == -1:
        seed = torch.randint(0, 2**32, (1,)).item()
    seed = int(seed)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    generator = torch.Generator(device=device).manual_seed(seed)

    image = sd_pipe(
        enhanced_prompt,
        num_inference_steps=int(steps),
        guidance_scale=guidance_scale,
        generator=generator,
    ).images[0]

    return image, seed
69
+
70
def process_audio(audio):
    """Transcribe a recording to text with the Google speech recognizer.

    Args:
        audio: A ``(sample_rate, samples)`` pair.

    Returns:
        The recognised text, or a string starting with "语音识别错误: "
        describing the failure when recognition raises.
    """
    import tempfile  # local import: only this function needs it

    # The sample rate must not be called ``sr``: that name would shadow the
    # speech_recognition module used below and break ``sr.AudioFile``.
    sample_rate, samples = audio
    samples = np.asarray(samples)
    # int16/int32 PCM is written unchanged; casting it to float32 without
    # rescaling would produce out-of-range float samples.
    # NOTE(review): presumably the UI delivers int16 PCM -- confirm.
    if samples.dtype not in (np.int16, np.int32):
        samples = samples.astype(np.float32)

    # Unique temporary file so concurrent requests do not overwrite each other.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        temp_file = tmp.name

    try:
        sf.write(temp_file, samples, sample_rate)

        with sr.AudioFile(temp_file) as source:
            recording = recognizer.record(source)

        try:
            return recognizer.recognize_google(recording, language='en-US')
        except Exception as e:
            # Best effort: report the failure as text, as callers expect a string.
            return f"语音识别错误: {str(e)}"
    finally:
        # Always clean up, including when writing or reading the file fails.
        os.remove(temp_file)
88
+
89
def full_process(basic_prompt, style, detail_level, artist_style, steps, guidance_scale, seed, use_audio, audio_input):
    """Run the complete pipeline: optional speech input, prompt enhancement, image generation.

    When ``use_audio`` is truthy and ``audio_input`` is not ``None``, the
    description is taken from ``process_audio(audio_input)`` instead of
    ``basic_prompt``.

    Returns:
        A ``(enhanced_prompt, image, used_seed)`` tuple.
    """
    # Pick the source of the description: transcribed speech or typed text.
    description = basic_prompt
    if use_audio:
        if audio_input is not None:
            description = process_audio(audio_input)

    # Expand the description into a detailed prompt.
    detailed_prompt = enhance_prompt(description, style, detail_level, artist_style)

    # Render the image from the detailed prompt.
    result = generate_image(detailed_prompt, steps, guidance_scale, seed)
    picture, seed_in_use = result

    return detailed_prompt, picture, seed_in_use
102
+
103
# Gradio interface
with gr.Blocks(title="魔法树屋图像生成器") as demo:
    gr.Markdown("# 🎨 魔法树屋图像生成器")
    gr.Markdown("输入简短描述或使用语音输入,生成精美图像!")

    with gr.Row():
        with gr.Column():
            # Input section
            use_audio = gr.Checkbox(label="使用语音输入")
            audio_input = gr.Audio(label="录音", visible=False)

            basic_prompt = gr.Textbox(
                label="简短描述",
                placeholder="例如: 天空中的魔法树屋",
                visible=True
            )

            # Switch between text and audio input when the checkbox changes
            def toggle_input(use_audio):
                """Show the audio recorder and hide the textbox when checked, and vice versa."""
                return {
                    basic_prompt: gr.update(visible=not use_audio),
                    audio_input: gr.update(visible=use_audio)
                }

            use_audio.change(
                toggle_input,
                inputs=use_audio,
                outputs=[basic_prompt, audio_input]
            )

            # Style options
            style = gr.Dropdown(
                label="风格",
                choices=["现实主义", "幻想艺术", "赛博朋克", "水墨画", "卡通", "极简主义"],
                value="幻想艺术"
            )

            detail_level = gr.Slider(
                label="细节级别",
                minimum=1,
                maximum=5,
                step=1,
                value=3
            )

            artist_style = gr.Dropdown(
                label="艺术家风格",
                choices=["无", "梵高", "毕加索", "莫奈", "达利", "宫崎骏"],
                value="无"
            )

            # Advanced options
            with gr.Accordion("高级选项", open=False):
                steps = gr.Slider(
                    label="生成步数",
                    minimum=20,
                    maximum=100,
                    step=5,
                    value=50
                )

                guidance_scale = gr.Slider(
                    label="引导尺度",
                    minimum=1.0,
                    maximum=20.0,
                    step=0.5,
                    value=7.5
                )

                # precision=0 makes the component deliver an int; the seed is
                # passed to torch.Generator.manual_seed, which rejects floats.
                seed = gr.Number(
                    label="随机种子 (-1 表示随机)",
                    value=-1,
                    precision=0
                )

            submit_btn = gr.Button("生成图像", variant="primary")

        with gr.Column():
            # Output section
            enhanced_prompt = gr.Textbox(
                label="生成的提示",
                interactive=False
            )

            image_output = gr.Image(
                label="生成的图像",
                height=512
            )

            # precision=0 so the seed is shown as a whole number.
            used_seed = gr.Number(
                label="使用的种子",
                interactive=False,
                precision=0
            )

    # Wire the button to the full pipeline
    submit_btn.click(
        fn=full_process,
        inputs=[
            basic_prompt, style, detail_level, artist_style,
            steps, guidance_scale, seed, use_audio, audio_input
        ],
        outputs=[enhanced_prompt, image_output, used_seed]
    )

# For Hugging Face Spaces we need to set up the queue
demo.queue()

# Launch the app
if __name__ == "__main__":
    demo.launch()