MGGroup committed on
Commit
44dbc68
·
verified ·
1 Parent(s): e255484

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -70
app.py CHANGED
@@ -3,7 +3,6 @@ import requests
3
  import os
4
  import fitz
5
  import re
6
- import time
7
  from huggingface_hub import HfApi, hf_hub_download
8
 
9
  # --- 核心配置 ---
@@ -11,16 +10,18 @@ GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
11
  HF_TOKEN = os.environ.get("HF_TOKEN")
12
  MODEL_ID = "gemini-3-flash-preview"
13
 
14
- # 包含您新加的 TaxGuides
15
  DATASETS = ["MGGroup/Treaties", "MGGroup/InvestmentGuide"]
16
 
 
17
  DESCRIPTION = """
18
  <div style="text-align: left; border-left: 4px solid #2196F3; padding-left: 15px; margin-bottom: 20px;">
19
  <h3>MG TaxAI | 跨境财税合规实验室 (Beta)</h3>
20
- <p>本系统依托 <b>MG 核心智库</b> 构建,已深度整合 2025/2026 全球税务指南。支持解析结果实时溯源至官方文本。</p>
 
21
  <hr style="border: 0; border-top: 1px solid #eee; margin: 10px 0;">
22
  <p style="font-size: 0.85em; color: #666;">
23
- <b>⚠️ AI 免责声明:</b> 仅供专业参考,决策前请咨询 MG Consult 专业团队。
 
24
  </p>
25
  </div>
26
  """
@@ -29,79 +30,46 @@ def fetch_dataset_context(query):
29
  if not HF_TOKEN: return ""
30
  api = HfApi(token=HF_TOKEN)
31
  combined_text = ""
32
-
33
- # 提取国家和业务关键词
34
  keywords = [k for k in re.findall(r'[\u4e00-\u9fa5]{2,}|[a-zA-Z]{3,}', query) if k not in ["资料", "关于", "查询", "政策"]]
35
- # 针对 Worldwide 指南的主题映射
36
- topic_keywords = []
37
- if "个人" in query or "人所得" in query: topic_keywords.append("Personal")
38
- if "所得税" in query: topic_keywords.append("Corporate")
39
- if "定价" in query: topic_keywords.append("Transfer")
40
-
41
  for repo in DATASETS:
42
  try:
43
  files = api.list_repo_files(repo_id=repo, repo_type="dataset")
44
- # 匹配文件名:包含国家关键词 包含主题词
45
- matched = sorted([f for f in files if f.lower().endswith(".pdf") and any(k.lower() in f.lower() for k in keywords + topic_keywords)])
46
-
47
- for f_path in matched[:5]: # 减少到5个文件以降低负荷
48
- try:
49
- temp_path = hf_hub_download(repo_id=repo, filename=f_path, repo_type="dataset", token=HF_TOKEN)
50
- doc = fitz.open(temp_path)
51
-
52
- # --- 核心优化:页码精确定位 ---
53
- pages_to_read = []
54
- country_key = next((k for k in keywords if len(k) >= 2), "")
55
-
56
- if country_key: # 如果提到了国家,只读含国家名的页
57
- for i in range(len(doc)):
58
- if country_key in doc[i].get_text():
59
- pages_to_read.append(i)
60
- if len(pages_to_read) >= 10: break
61
-
62
- if not pages_to_read: pages_to_read = range(min(15, len(doc)))
63
-
64
- combined_text += f"\n[参考文件: {f_path}]\n"
65
- for p in pages_to_read:
66
- combined_text += doc[p].get_text()
67
- doc.close()
68
- os.remove(temp_path)
69
- except: continue
70
  except: continue
71
- return combined_text[:8000] # 压缩上下文到 8000,显著提升 API 成功率
72
 
73
- def call_gemini_with_retry(messages):
74
- url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL_ID}:generateContent?key={GEMINI_API_KEY}"
75
- payload = {
76
- "contents": [{"role": "user" if m["role"] in ["user", "system"] else "model",
77
- "parts": [{"text": m["content"]}]} for m in messages],
78
- "tools": [{"google_search_retrieval": {}}]
79
- }
80
-
81
- for attempt in range(3):
82
- try:
83
- res = requests.post(url, json=payload, timeout=90)
84
- if res.status_code == 200:
85
- return res.json()['candidates'][0]['content']['parts'][0]['text']
86
- elif res.status_code == 429:
87
- time.sleep(10 * (attempt + 1)) # 遭遇 429 自动阶梯式等待
88
- continue
89
- else:
90
- return f"⚠️ 引擎响应异常 (Code: {res.status_code})"
91
- except:
92
- time.sleep(5)
93
- continue
94
- return "⚠️ 系统目前过于繁忙,请 30 秒后重试。"
95
 
96
  def respond(message, history, system_message, max_tokens, temperature, top_p):
97
- # 获取压缩后的知识
98
  knowledge = fetch_dataset_context(message)
 
99
 
 
100
  full_system_prompt = (
101
  f"{system_message}\n\n"
102
  "【重要约束】:\n"
103
- "1. 智库包含 Worldwide 指南,请准确定位该国章节并结合 Google Search 验证 2026 最新政策。\n"
104
- "2. 严禁自我介绍,直接进入分析。\n\n"
 
105
  f"参考资料:\n{knowledge}"
106
  )
107
 
@@ -111,21 +79,28 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
111
  if a: messages.append({"role": "assistant", "content": a})
112
  messages.append({"role": "user", "content": message})
113
 
114
- reply = call_gemini_with_retry(messages)
 
 
 
 
 
115
 
 
116
  header = "您好,我是 **MG Consult** 的国际税收 AI 专家。很高兴为您提供专业咨询。\n\n---\n\n"
117
  yield header + reply
118
 
 
119
  demo = gr.ChatInterface(
120
  fn=respond,
121
  description=DESCRIPTION,
122
  theme="soft",
123
- css=".gradio-container {max-width: 950px !important}",
124
  additional_inputs=[
125
- gr.Textbox(value="你代表 MG Consult 专家。直接进入正文,严禁自我介绍。", label="系统指令"),
126
- gr.Slider(512, 4096, 2048, label="回复长度"),
127
- gr.Slider(0, 1, 0.05, label="严谨度"),
128
- gr.Slider(0, 1, 0.95, label="采样率"),
129
  ],
130
  )
131
 
 
3
  import os
4
  import fitz
5
  import re
 
6
  from huggingface_hub import HfApi, hf_hub_download
7
 
8
  # --- 核心配置 ---
 
10
  HF_TOKEN = os.environ.get("HF_TOKEN")
11
  MODEL_ID = "gemini-3-flash-preview"
12
 
 
13
  DATASETS = ["MGGroup/Treaties", "MGGroup/InvestmentGuide"]
14
 
15
+ # 顶部的专业描述(保持原样,符合您的要求)
16
  DESCRIPTION = """
17
  <div style="text-align: left; border-left: 4px solid #2196F3; padding-left: 15px; margin-bottom: 20px;">
18
  <h3>MG TaxAI | 跨境财税合规实验室 (Beta)</h3>
19
+ <p>本系统依托 <b>MG 核心智库</b> 构建,旨在实现解析结果实时溯源至各国官方税收协定与法律文本。目前系统正处于<b>知识库全量装载阶段</b>,已优先上线核心业务国家的官方协定库。</p>
20
+ <p>我们正持续同步全球各主要经济体的国别投资税收指南及多税种年度税收报告。受限于测试版的数据填充进度,相关解析结果仅供专业参考。MG 团队正加速完善每一条咨询建议的合规证据链,以确保交付专家级的数字化合规支持。</p>
21
  <hr style="border: 0; border-top: 1px solid #eee; margin: 10px 0;">
22
  <p style="font-size: 0.85em; color: #666;">
23
+ <b>⚠️ AI 免责声明:</b><br>
24
+ 本系统生成的内容由人工智能根据现有库文件分析得出,不构成正式的法律或税务建议。在使用本系统结果进行任何商业决策前,请务必咨询 MG Consult 专业团队。
25
  </p>
26
  </div>
27
  """
 
30
  if not HF_TOKEN: return ""
31
  api = HfApi(token=HF_TOKEN)
32
  combined_text = ""
 
 
33
  keywords = [k for k in re.findall(r'[\u4e00-\u9fa5]{2,}|[a-zA-Z]{3,}', query) if k not in ["资料", "关于", "查询", "政策"]]
34
+ if not keywords: keywords = re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', query)
35
+
 
 
 
 
36
  for repo in DATASETS:
37
  try:
38
  files = api.list_repo_files(repo_id=repo, repo_type="dataset")
39
+ matched = sorted([f for f in files if f.lower().endswith(".pdf") and any(k.lower() in f.lower() for k in keywords)])
40
+ for f_path in matched[:8]:
41
+ temp_path = hf_hub_download(repo_id=repo, filename=f_path, repo_type="dataset", token=HF_TOKEN)
42
+ doc = fitz.open(temp_path)
43
+ combined_text += f"\n[Ref: {f_path}]\n" + "".join([page.get_text() for page in doc[:15]])
44
+ doc.close()
45
+ os.remove(temp_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  except: continue
47
+ return combined_text[:12000]
48
 
49
def find_local_context(query, base_dir="./treaties", max_files=5, max_pages=15, max_chars=8000):
    """Collect reference text from local PDFs whose folder name matches the query.

    The query is split into runs of CJK characters or ASCII letters. The first
    sub-folder of ``base_dir`` (in sorted order) whose name contains any of
    those keywords, case-insensitively, is selected. Text is then extracted
    from the leading pages of the first PDFs in that folder.

    Args:
        query: Free-text user question used to derive the keywords.
        base_dir: Directory holding one sub-folder per topic.
        max_files: Maximum number of PDFs read from the selected folder.
        max_pages: Maximum number of leading pages read from each PDF.
        max_chars: Maximum length of the returned string.

    Returns:
        The concatenated text, each file prefixed with a ``[Local: <name>]``
        marker and truncated to ``max_chars``. Returns ``""`` when the
        directory is missing, no folder matches, or nothing could be read.
    """
    if not query or not os.path.isdir(base_dir):
        return ""

    keywords = [k.lower() for k in re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', query)]
    if not keywords:
        return ""

    try:
        # os.listdir returns entries in arbitrary order; sort so the chosen
        # folder is the same on every run.
        folders = sorted(
            d for d in os.listdir(base_dir)
            if os.path.isdir(os.path.join(base_dir, d))
        )
    except OSError:
        return ""

    selected = next(
        (d for d in folders if any(k in d.lower() for k in keywords)),
        None,
    )
    if selected is None:
        return ""

    folder = os.path.join(base_dir, selected)
    try:
        # Case-insensitive suffix test, so "X.PDF" is picked up as well.
        pdf_names = sorted(f for f in os.listdir(folder) if f.lower().endswith(".pdf"))
    except OSError:
        return ""

    sections = []
    for pdf in pdf_names[:max_files]:
        try:
            doc = fitz.open(os.path.join(folder, pdf))
            try:
                body = "".join(
                    doc[i].get_text() for i in range(min(max_pages, len(doc)))
                )
            finally:
                # Always release the document handle, even if extraction fails.
                doc.close()
        except Exception:
            # Best-effort lookup: skip an unreadable PDF instead of discarding
            # the text already gathered from the other files.
            continue
        sections.append(f"\n[Local: {pdf}]\n" + body)

    return "\n".join(sections)[:max_chars]
 
 
 
 
 
 
 
 
 
 
61
 
62
  def respond(message, history, system_message, max_tokens, temperature, top_p):
 
63
  knowledge = fetch_dataset_context(message)
64
+ if not knowledge: knowledge = find_local_context(message)
65
 
66
+ # --- 核心修改:在系统提示词中强制要求 AI 保持静默,不要自我介绍 ---
67
  full_system_prompt = (
68
  f"{system_message}\n\n"
69
  "【重要约束】:\n"
70
+ "1. 严禁进行任何形式的自我介绍或身份说明(例如:不要说'我是MG的专'、'你好'等)。\n"
71
+ "2. 严禁包含任何开场白,直接针对用户问题进入专业分析。\n"
72
+ "3. 必须严格基于以下参考资料进行回答。\n\n"
73
  f"参考资料:\n{knowledge}"
74
  )
75
 
 
79
  if a: messages.append({"role": "assistant", "content": a})
80
  messages.append({"role": "user", "content": message})
81
 
82
+ url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL_ID}:generateContent?key={GEMINI_API_KEY}"
83
+ try:
84
+ res = requests.post(url, json={"contents": [{"role":"user" if m["role"] in ["user","system"] else "model", "parts":[{"text":m["content"]}]} for m in messages]}, timeout=60)
85
+ reply = res.json()['candidates'][0]['content']['parts'][0]['text']
86
+ except:
87
+ reply = "系统繁忙,请稍后再试。"
88
 
89
+ # --- 唯一的身份开场白在这里定义,由代码控制,不给 AI 发挥空间 ---
90
  header = "您好,我是 **MG Consult** 的国际税收 AI 专家。很高兴为您提供专业咨询。\n\n---\n\n"
91
  yield header + reply
92
 
93
+ # 界面
94
  demo = gr.ChatInterface(
95
  fn=respond,
96
  description=DESCRIPTION,
97
  theme="soft",
98
+ css=".gradio-container {max-width: 950px !important} .description {margin-bottom: 20px}",
99
  additional_inputs=[
100
+ gr.Textbox(value="你代表 MG Consult,是国际税收专家。请严格基于参考资料提供深度分析。请直接进入正文,严禁自我介绍。", label="系统指令"),
101
+ gr.Slider(512, 4096, 2048, label="回复长度限制"),
102
+ gr.Slider(0, 1, 0.05, label="严谨度 (Temperature)"),
103
+ gr.Slider(0, 1, 0.95, label="采样率 (Top-p)"),
104
  ],
105
  )
106