Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,7 +3,6 @@ import requests
|
|
| 3 |
import os
|
| 4 |
import fitz
|
| 5 |
import re
|
| 6 |
-
import time
|
| 7 |
from huggingface_hub import HfApi, hf_hub_download
|
| 8 |
|
| 9 |
# --- 核心配置 ---
|
|
@@ -11,16 +10,18 @@ GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
|
|
| 11 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 12 |
MODEL_ID = "gemini-3-flash-preview"
|
| 13 |
|
| 14 |
-
# 包含您新加的 TaxGuides
|
| 15 |
DATASETS = ["MGGroup/Treaties", "MGGroup/InvestmentGuide"]
|
| 16 |
|
|
|
|
| 17 |
DESCRIPTION = """
|
| 18 |
<div style="text-align: left; border-left: 4px solid #2196F3; padding-left: 15px; margin-bottom: 20px;">
|
| 19 |
<h3>MG TaxAI | 跨境财税合规实验室 (Beta)</h3>
|
| 20 |
-
<p>本系统依托 <b>MG 核心智库</b> 构建,
|
|
|
|
| 21 |
<hr style="border: 0; border-top: 1px solid #eee; margin: 10px 0;">
|
| 22 |
<p style="font-size: 0.85em; color: #666;">
|
| 23 |
-
<b>⚠️ AI 免责声明:</b>
|
|
|
|
| 24 |
</p>
|
| 25 |
</div>
|
| 26 |
"""
|
|
@@ -29,79 +30,46 @@ def fetch_dataset_context(query):
|
|
| 29 |
if not HF_TOKEN: return ""
|
| 30 |
api = HfApi(token=HF_TOKEN)
|
| 31 |
combined_text = ""
|
| 32 |
-
|
| 33 |
-
# 提取国家和业务关键词
|
| 34 |
keywords = [k for k in re.findall(r'[\u4e00-\u9fa5]{2,}|[a-zA-Z]{3,}', query) if k not in ["资料", "关于", "查询", "政策"]]
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
if "个人" in query or "人所得" in query: topic_keywords.append("Personal")
|
| 38 |
-
if "所得税" in query: topic_keywords.append("Corporate")
|
| 39 |
-
if "定价" in query: topic_keywords.append("Transfer")
|
| 40 |
-
|
| 41 |
for repo in DATASETS:
|
| 42 |
try:
|
| 43 |
files = api.list_repo_files(repo_id=repo, repo_type="dataset")
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
# --- 核心优化:页码精确定位 ---
|
| 53 |
-
pages_to_read = []
|
| 54 |
-
country_key = next((k for k in keywords if len(k) >= 2), "")
|
| 55 |
-
|
| 56 |
-
if country_key: # 如果提到了国家,只读含国家名的页
|
| 57 |
-
for i in range(len(doc)):
|
| 58 |
-
if country_key in doc[i].get_text():
|
| 59 |
-
pages_to_read.append(i)
|
| 60 |
-
if len(pages_to_read) >= 10: break
|
| 61 |
-
|
| 62 |
-
if not pages_to_read: pages_to_read = range(min(15, len(doc)))
|
| 63 |
-
|
| 64 |
-
combined_text += f"\n[参考文件: {f_path}]\n"
|
| 65 |
-
for p in pages_to_read:
|
| 66 |
-
combined_text += doc[p].get_text()
|
| 67 |
-
doc.close()
|
| 68 |
-
os.remove(temp_path)
|
| 69 |
-
except: continue
|
| 70 |
except: continue
|
| 71 |
-
return combined_text[:
|
| 72 |
|
| 73 |
-
def
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
return res.json()['candidates'][0]['content']['parts'][0]['text']
|
| 86 |
-
elif res.status_code == 429:
|
| 87 |
-
time.sleep(10 * (attempt + 1)) # 遭遇 429 自动阶梯式等待
|
| 88 |
-
continue
|
| 89 |
-
else:
|
| 90 |
-
return f"⚠️ 引擎响应异常 (Code: {res.status_code})"
|
| 91 |
-
except:
|
| 92 |
-
time.sleep(5)
|
| 93 |
-
continue
|
| 94 |
-
return "⚠️ 系统目前过于繁忙,请 30 秒后重试。"
|
| 95 |
|
| 96 |
def respond(message, history, system_message, max_tokens, temperature, top_p):
|
| 97 |
-
# 获取压缩后的知识
|
| 98 |
knowledge = fetch_dataset_context(message)
|
|
|
|
| 99 |
|
|
|
|
| 100 |
full_system_prompt = (
|
| 101 |
f"{system_message}\n\n"
|
| 102 |
"【重要约束】:\n"
|
| 103 |
-
"1.
|
| 104 |
-
"2. 严禁
|
|
|
|
| 105 |
f"参考资料:\n{knowledge}"
|
| 106 |
)
|
| 107 |
|
|
@@ -111,21 +79,28 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
|
|
| 111 |
if a: messages.append({"role": "assistant", "content": a})
|
| 112 |
messages.append({"role": "user", "content": message})
|
| 113 |
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
|
|
|
| 116 |
header = "您好,我是 **MG Consult** 的国际税收 AI 专家。很高兴为您提供专业咨询。\n\n---\n\n"
|
| 117 |
yield header + reply
|
| 118 |
|
|
|
|
| 119 |
demo = gr.ChatInterface(
|
| 120 |
fn=respond,
|
| 121 |
description=DESCRIPTION,
|
| 122 |
theme="soft",
|
| 123 |
-
css=".gradio-container {max-width: 950px !important}",
|
| 124 |
additional_inputs=[
|
| 125 |
-
gr.Textbox(value="你代表 MG Consult
|
| 126 |
-
gr.Slider(512, 4096, 2048, label="回复长度"),
|
| 127 |
-
gr.Slider(0, 1, 0.05, label="严谨度"),
|
| 128 |
-
gr.Slider(0, 1, 0.95, label="采样率"),
|
| 129 |
],
|
| 130 |
)
|
| 131 |
|
|
|
|
| 3 |
import os
|
| 4 |
import fitz
|
| 5 |
import re
|
|
|
|
| 6 |
from huggingface_hub import HfApi, hf_hub_download
|
| 7 |
|
| 8 |
# --- 核心配置 ---
|
|
|
|
| 10 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 11 |
MODEL_ID = "gemini-3-flash-preview"
|
| 12 |
|
|
|
|
| 13 |
DATASETS = ["MGGroup/Treaties", "MGGroup/InvestmentGuide"]
|
| 14 |
|
| 15 |
+
# 顶部的专业描述(保持原样,符合您的要求)
|
| 16 |
DESCRIPTION = """
|
| 17 |
<div style="text-align: left; border-left: 4px solid #2196F3; padding-left: 15px; margin-bottom: 20px;">
|
| 18 |
<h3>MG TaxAI | 跨境财税合规实验室 (Beta)</h3>
|
| 19 |
+
<p>本系统依托 <b>MG 核心智库</b> 构建,旨在实现解析结果实时溯源至各国官方税收协定与法律文本。目前系统正处于<b>知识库全量装载阶段</b>,已优先上线核心业务国家的官方协定库。</p>
|
| 20 |
+
<p>我们正持续同步全球各主要经济体的国别投资税收指南及多税种年度税收报告。受限于测试版的数据填充进度,相关解析结果仅供专业参考。MG 团队正加速完善每一条咨询建议的合规证据链,以确保交付专家级的数字化合规支持。</p>
|
| 21 |
<hr style="border: 0; border-top: 1px solid #eee; margin: 10px 0;">
|
| 22 |
<p style="font-size: 0.85em; color: #666;">
|
| 23 |
+
<b>⚠️ AI 免责声明:</b><br>
|
| 24 |
+
本系统生成的内容由人工智能根据现有库文件分析得出,不构成正式的法律或税务建议。在使用本系统结果进行任何商业决策前,请务必咨询 MG Consult 专业团队。
|
| 25 |
</p>
|
| 26 |
</div>
|
| 27 |
"""
|
|
|
|
| 30 |
if not HF_TOKEN: return ""
|
| 31 |
api = HfApi(token=HF_TOKEN)
|
| 32 |
combined_text = ""
|
|
|
|
|
|
|
| 33 |
keywords = [k for k in re.findall(r'[\u4e00-\u9fa5]{2,}|[a-zA-Z]{3,}', query) if k not in ["资料", "关于", "查询", "政策"]]
|
| 34 |
+
if not keywords: keywords = re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', query)
|
| 35 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
for repo in DATASETS:
|
| 37 |
try:
|
| 38 |
files = api.list_repo_files(repo_id=repo, repo_type="dataset")
|
| 39 |
+
matched = sorted([f for f in files if f.lower().endswith(".pdf") and any(k.lower() in f.lower() for k in keywords)])
|
| 40 |
+
for f_path in matched[:8]:
|
| 41 |
+
temp_path = hf_hub_download(repo_id=repo, filename=f_path, repo_type="dataset", token=HF_TOKEN)
|
| 42 |
+
doc = fitz.open(temp_path)
|
| 43 |
+
combined_text += f"\n[Ref: {f_path}]\n" + "".join([page.get_text() for page in doc[:15]])
|
| 44 |
+
doc.close()
|
| 45 |
+
os.remove(temp_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
except: continue
|
| 47 |
+
return combined_text[:12000]
|
| 48 |
|
| 49 |
+
def find_local_context(query):
    """Build fallback reference text from PDFs stored under ``./treaties``.

    The query is split into runs of CJK characters and runs of ASCII letters.
    The first sub-folder of ``./treaties`` (in sorted order) whose name
    contains any of those keywords, case-insensitively, is selected.  Up to
    five PDFs from that folder (sorted by name) are read, at most fifteen
    pages each, and each file's text is prefixed with a ``[Local: <name>]``
    marker.

    Args:
        query: The user's message.  Anything that is not a non-empty string
            yields an empty result.

    Returns:
        The concatenated text, truncated to 8000 characters, or ``""`` when
        the base folder is missing, no folder matches, or nothing could be
        read.  This function is best-effort and does not raise for
        filesystem or PDF parsing errors.
    """
    base_dir = "./treaties"
    max_chars = 8000   # cap on the size of the returned context
    max_pdfs = 5       # PDFs read from the selected folder
    max_pages = 15     # leading pages read from each PDF

    if not isinstance(query, str) or not os.path.isdir(base_dir):
        return ""

    keywords = [k.lower() for k in re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', query)]
    if not keywords:
        return ""

    try:
        # Sorted so the chosen folder does not depend on os.listdir() order.
        folders = sorted(
            d for d in os.listdir(base_dir)
            if os.path.isdir(os.path.join(base_dir, d))
        )
        selected = next(
            (f for f in folders if any(k in f.lower() for k in keywords)),
            None,
        )
        if selected is None:
            return ""
        path = os.path.join(base_dir, selected)
        # Case-insensitive suffix check, matching the dataset lookup.
        pdf_names = sorted(
            f for f in os.listdir(path) if f.lower().endswith(".pdf")
        )[:max_pdfs]
    except OSError:
        return ""

    sections = []
    for pdf in pdf_names:
        # Broad catch is deliberate: this is a best-effort fallback, and one
        # unreadable PDF should not discard the text of the others.
        try:
            doc = fitz.open(os.path.join(path, pdf))
        except Exception:
            continue
        try:
            body = "".join(
                doc[i].get_text() for i in range(min(max_pages, len(doc)))
            )
        except Exception:
            continue
        finally:
            # Always release the document handle.
            doc.close()

        sections.append(f"\n[Local: {pdf}]\n" + body)

        # Stop parsing once the joined text already fills the cap; the
        # result is truncated to max_chars below, so output is unaffected.
        joined_len = sum(len(s) for s in sections) + len(sections) - 1
        if joined_len >= max_chars:
            break

    return "\n".join(sections)[:max_chars]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
def respond(message, history, system_message, max_tokens, temperature, top_p):
|
|
|
|
| 63 |
knowledge = fetch_dataset_context(message)
|
| 64 |
+
if not knowledge: knowledge = find_local_context(message)
|
| 65 |
|
| 66 |
+
# --- 核心修改:在系统提示词中强制要求 AI 保持静默,不要自我介绍 ---
|
| 67 |
full_system_prompt = (
|
| 68 |
f"{system_message}\n\n"
|
| 69 |
"【重要约束】:\n"
|
| 70 |
+
"1. 严禁进行任何形式的自我介绍或身份说明(例如:不要说'我是MG的专家'、'你好'等)。\n"
|
| 71 |
+
"2. 严禁包含任何开场白,直接针对用户问题进入专业分析。\n"
|
| 72 |
+
"3. 必须严格基于以下参考资料进行回答。\n\n"
|
| 73 |
f"参考资料:\n{knowledge}"
|
| 74 |
)
|
| 75 |
|
|
|
|
| 79 |
if a: messages.append({"role": "assistant", "content": a})
|
| 80 |
messages.append({"role": "user", "content": message})
|
| 81 |
|
| 82 |
+
url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL_ID}:generateContent?key={GEMINI_API_KEY}"
|
| 83 |
+
try:
|
| 84 |
+
res = requests.post(url, json={"contents": [{"role":"user" if m["role"] in ["user","system"] else "model", "parts":[{"text":m["content"]}]} for m in messages]}, timeout=60)
|
| 85 |
+
reply = res.json()['candidates'][0]['content']['parts'][0]['text']
|
| 86 |
+
except:
|
| 87 |
+
reply = "系统繁忙,请稍后再试。"
|
| 88 |
|
| 89 |
+
# --- 唯一的身份开场白在这里定义,由代码控制,不给 AI 发挥空间 ---
|
| 90 |
header = "您好,我是 **MG Consult** 的国际税收 AI 专家。很高兴为您提供专业咨询。\n\n---\n\n"
|
| 91 |
yield header + reply
|
| 92 |
|
| 93 |
+
# 界面
|
| 94 |
demo = gr.ChatInterface(
|
| 95 |
fn=respond,
|
| 96 |
description=DESCRIPTION,
|
| 97 |
theme="soft",
|
| 98 |
+
css=".gradio-container {max-width: 950px !important} .description {margin-bottom: 20px}",
|
| 99 |
additional_inputs=[
|
| 100 |
+
gr.Textbox(value="你代表 MG Consult,是国际税收专家。请严格基于参考资料提供深度分析。请直接进入正文,严禁自我介绍。", label="系统指令"),
|
| 101 |
+
gr.Slider(512, 4096, 2048, label="回复长度限制"),
|
| 102 |
+
gr.Slider(0, 1, 0.05, label="严谨度 (Temperature)"),
|
| 103 |
+
gr.Slider(0, 1, 0.95, label="采样率 (Top-p)"),
|
| 104 |
],
|
| 105 |
)
|
| 106 |
|