# app.py (modified for Hugging Face ZeroGPU)
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import spaces
# --------------------------------------------------------------------------
# 1. Configuration (executed at app startup)
# --------------------------------------------------------------------------

# !! Important: model ID (loaded from the HF Hub)
model_id = "AIDC-AI/Marco-MT-Algharb"

# --- ZeroGPU change 1:
# At startup, *only* define the globals as None.
# The large model is loaded when the first request arrives.
# ---
model = None
tokenizer = None
generation_config = None

print("ZeroGPU 启动脚本开始...")
print(f"准备从 {model_id} 加载 Tokenizer...")

# The tokenizer is small, so it can be loaded at startup
# ★★★ Reminder: this still requires the HF_TOKEN secret to be set in the Space settings ★★★
try:
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, 
        trust_remote_code=True
    )
    print("Tokenizer 加载成功!")

    # --- ZeroGPU change 2:
    # Define the GenerationConfig *immediately* after the tokenizer loads
    # (this resolves the earlier issue with Qwen3 stop tokens)
    # ---
    im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
    eot_id = tokenizer.eos_token_id
    
    print(f"设置停止 IDs: <|im_end|_id={im_end_id}, <|endoftext|_id={eot_id}")
    
    generation_config = GenerationConfig(
        do_sample=False,
        max_new_tokens=512,
        eos_token_id=[im_end_id, eot_id], 
        pad_token_id=eot_id
    )
    print("GenerationConfig 配置成功。")
    
except Exception as e:
    print(f"Tokenizer 加载失败: {e}")
    print("!! 严重错误: 如果这是 Gated Repo 问题, 请确保 HF_TOKEN 密钥已设置并重启 Space。")

# Language code to full-name maps (unchanged)
source_lang_name_map = {
    "en": "english",
    "ja": "japanese",
    "cs": "czech",
    "de": "german",
}

target_lang_name_map = {
    "zh": "chinese",
    "ko": "korean",
    "ja": "japanese",
    "ar": "arabic",
    "cs": "czech",
    "ru": "russian",
    "uk": "ukraine",
    "et": "estonian",
    "bho": "bhojpuri",
    "sr_latin": "serbian",
    "de": "german",
}

# --------------------------------------------------------------------------
# 2. Core translation function (modified)
# --------------------------------------------------------------------------
@spaces.GPU
def translate(source_text, source_lang_code, target_lang_code):
    """
    Takes the user input and returns the translation.
    (ZeroGPU: loads the model on the first call)
    """
    global model # ★★★ Key: refer to the global 'model' variable
    
    # --- ZeroGPU change 3: load the model on the first call ---
    if model is None:
        if tokenizer is None:
            return "错误:Tokenizer 未能成功加载,无法继续。请检查启动日志。"
        
        print("--- 首次请求 ---")
        print("检测到模型未加载。正在加载模型到 ZeroGPU (Nvidia H200)...")
        try:
            # This step triggers ZeroGPU to allocate the H200
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype="auto",
                device_map="auto",    # 'auto' 将会检测到 H200
                trust_remote_code=True
            )
            model.eval()
            print("模型已成功加载到 GPU!")
        except Exception as e:
            print(f"在首次加载时模型失败: {e}")
            return f"错误:模型在加载到 GPU 时失败: {e}"
    # -----------------------------------------
    
    # (From here on, the code is identical to the previous version)
    
    # Simple input validation
    if not source_text or not source_text.strip():
        return ""

    source_language_name = source_lang_name_map.get(source_lang_code, "the source language")
    target_language_name = target_lang_name_map.get(target_lang_code, "the target language")

    prompt = (
        f"Human: Please translate the following text into {target_language_name}: \n"
        f"{source_text}<|im_end|>\n"
        f"Assistant:"
    )
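    # For illustration, with source_text="Hello world" and target_lang_code="zh",
    # the rendered prompt is:
    #   Human: Please translate the following text into chinese: 
    #   Hello world<|im_end|>
    #   Assistant:
    # (the model is expected to emit the translation and then stop at <|im_end|>)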
    
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                generation_config=generation_config
            )
        
        input_length = inputs.input_ids.shape[1]
        generated_ids = outputs[0][input_length:]
        generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
        
        return generated_text
    
    except Exception as e:
        print(f"翻译过程中出错: {e}")
        return f"翻译时发生错误: {e}"

# --------------------------------------------------------------------------
# 3. Build and configure the Gradio UI (this part is unchanged)
# --------------------------------------------------------------------------

# <--- Custom CSS styles --->
css = """
/* ... all of your CSS styles ... */
.gradio-textbox {
    min-height: 300px !important;
}
"""

# <--- Fixed: choices definitions --->
source_lang_choices = [(name.capitalize(), code) for code, name in source_lang_name_map.items()]
target_lang_choices = [(name.capitalize(), code) for code, name in target_lang_name_map.items()]
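
# For example, source_lang_choices evaluates to
# [("English", "en"), ("Japanese", "ja"), ("Czech", "cs"), ("German", "de")];
# gr.Dropdown treats each (label, value) tuple as a display label plus the value
# that gets passed to the callback.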


# <--- Use gr.Blocks and keep the theme --->
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="amber", secondary_hue="amber"), 
    css=css,
) as demo:

    gr.HTML(
        """
        <div align="center" style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif, 'Microsoft YaHei', sans-serif; padding: 20px 0;">
            <h1 style="font-weight: 700; color: #2C3E50; margin-bottom: 0.25rem; font-size: 2.5rem;">
                Marco-MT-Algharb
            </h1>
            <p style="margin-top: 0; margin-bottom: 1.5rem;">
                <a href="https://www.aidc-ai.com/marcomt" style="font-size: 1.25rem; color: #E67E22; text-decoration: none; font-weight: 500;">
                    Alibaba International Digital Commerce
                </a>
            </p>
    
            <div style="display: flex; justify-content: center; gap: 8px;">
                <a href="https://github.com/AIDC-AI/Marco-MT">
                    <img src="https://img.shields.io/badge/GitHub-Repository-181717?logo=github&style=for-the-badge" alt="GitHub">
                </a>
                <a href="https://huggingface.co/AIDC-AI/Marco-MT-Algharb">
                    <img src="https://img.shields.io/badge/Hugging%20Face-Model-FFC107?logo=huggingface&style=for-the-badge" alt="Hugging Face Model">
                </a>
                <a href="https://www2.statmt.org/wmt25/pdf/2025.wmt-1.33.pdf">
                    <img src="https://img.shields.io/badge/Paper-WMT%202025-C0392B?logo=arxiv&style=for-the-badge" alt="Paper WMT 2025">
                </a>
                <a href="https://huggingface.co/spaces/AIDC-AI/Marco-MT-Algharb">
                    <img src="https://img.shields.io/badge/Demo-HF%20Space-E67E22?logo=huggingface&style=for-the-badge" alt="Demo HF Space">
                </a>
            </div>
        </div>
        """
    )
    # --- Title ---
    gr.HTML(f"""
    
    """)
    
    # --- Main translator UI (two-column layout) ---
    with gr.Row(variant="panel", equal_height=True):
        
        # --- Left: input card ---
        with gr.Group():
            source_lang_dd = gr.Dropdown(
                choices=source_lang_choices,
                value="en", 
                label="源语言 (Source Language)"
            )
            source_text_tb = gr.Textbox(
                lines=10, 
                label="源文本 (Source Text)", 
                placeholder="Enter text to translate here...",
                elem_classes=["gradio-textbox"]
            )

        # --- Right: output card ---
        with gr.Group():
            target_lang_dd = gr.Dropdown(
                choices=target_lang_choices,
                value="zh", 
                label="目标语言 (Target Language)"
            )
            output_text_tb = gr.Textbox(
                lines=10, 
                label="翻译结果 (Translation)", 
                interactive=False,
                elem_classes=["gradio-textbox"]
            )

    # --- Button row ---
    with gr.Row():
        clear_btn = gr.ClearButton(
            value="清除 (Clear)",
            components=[source_text_tb, output_text_tb, source_lang_dd, target_lang_dd]
        )
        submit_btn = gr.Button("翻译 (Submit)", variant="primary", scale=1)

    # --- Examples ---
    example_list = [
        ["The quick brown fox jumps over the lazy dog.", "en", "zh"],
        ["The sunset painted the sky with brilliant shades of orange and purple.", "en", "ko"],
        ["The ancient ruins stand as a silent testament to the rise and fall of a great civilization.", "en", "ja"],
    ]
    gr.Examples(
        examples=example_list,
        inputs=[source_text_tb, source_lang_dd, target_lang_dd]
    )

    # --- Supported language pairs card ---
    gr.HTML(f"""
    <div style="color: #444; font-size: 16px; margin-top: 30px; padding: 20px 25px; background-color: #FFFFFF; border-radius: 15px; max-width: 900px; margin-left: auto; margin-right: auto; box-shadow: 0 4px 20px rgba(0,0,0,0.05);">
        
        <h3 style="text-align: center; margin-top: 5px; margin-bottom: 20px; color: #444444; font-weight: 600;">Supported Language Pairs</h3>
        
        <div style="display: flex; justify-content: space-around; text-align: left; line-height: 1.8;">
            
            <div>
                <strong>From English (en):</strong>
                <ul style="list-style-type: '» '; margin: 5px 0 0 20px; padding: 0;">
                    <li>en2zh</li>
                    <li>en2ja</li>
                    <li>en2ko</li>
                    <li>en2ar</li>
                    <li>en2et</li>
                    <li>en2sr_latin</li>
                    <li>en2ru</li>
                    <li>en2uk</li>
                    <li>en2cs</li>
                    <li>en2bho</li>
                </ul>
            </div>
            
            <div style="margin-left: 20px;">
                <strong>From Czech (cs):</strong>
                <ul style="list-style-type: '» '; margin: 5px 0 15px 20px; padding: 0;">
                    <li>cs2uk</li>
                    <li>cs2de</li>
                </ul>
                
                <strong>From Japanese (ja):</strong>
                <ul style="list-style-type: '» '; margin: 5px 0 0 20px; padding: 0;">
                    <li>ja2zh</li>
                </ul>
            </div>
        </div>
    </div>
    """)

    # --- Submit button click handler ---
    submit_btn.click(
        fn=translate,
        inputs=[source_text_tb, source_lang_dd, target_lang_dd],
        outputs=[output_text_tb],
        api_name="translate"
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
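
# --------------------------------------------------------------------------
# Optional: querying the deployed Space from another machine
# --------------------------------------------------------------------------
# A minimal sketch (not part of this app) using gradio_client, assuming the public
# Space id is AIDC-AI/Marco-MT-Algharb (as linked in the header badges) and that the
# endpoint registered above is exposed under api_name="translate":
#
#   from gradio_client import Client
#
#   client = Client("AIDC-AI/Marco-MT-Algharb")
#   result = client.predict(
#       "The quick brown fox jumps over the lazy dog.",  # source_text
#       "en",                                            # source_lang_code
#       "zh",                                            # target_lang_code
#       api_name="/translate",
#   )
#   print(result)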