Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -225,25 +225,32 @@ def ner(text, model_type="bert"):
|
|
| 225 |
|
| 226 |
|
| 227 |
# ======================== 关系抽取(RE) ========================
|
| 228 |
-
def re_extract(entities, text):
|
| 229 |
# 参数校验
|
| 230 |
if not entities or not text:
|
|
|
|
| 231 |
return []
|
| 232 |
|
| 233 |
# 实体类型过滤(根据业务需求调整)
|
| 234 |
valid_entity_types = {"PER", "LOC", "ORG", "TITLE"}
|
| 235 |
filtered_entities = [e for e in entities if e.get("type") in valid_entity_types]
|
| 236 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
# --------------------- 处理单实体场景 ---------------------
|
| 238 |
if len(filtered_entities) == 1:
|
| 239 |
single_relations = []
|
| 240 |
ent = filtered_entities[0]
|
|
|
|
| 241 |
|
| 242 |
# 规则1:人物职位检测
|
| 243 |
if ent["type"] == "PER":
|
| 244 |
position_keywords = ["CEO", "经理", "总监", "工程师", "教授"]
|
| 245 |
for keyword in position_keywords:
|
| 246 |
if keyword in text:
|
|
|
|
| 247 |
single_relations.append({
|
| 248 |
"head": ent["text"],
|
| 249 |
"tail": keyword,
|
|
@@ -258,6 +265,7 @@ def re_extract(entities, text):
|
|
| 258 |
if verb in text:
|
| 259 |
match = re.search(fr"{ent['text']}{verb}(.*?)[,。]", text)
|
| 260 |
if match:
|
|
|
|
| 261 |
single_relations.append({
|
| 262 |
"head": ent["text"],
|
| 263 |
"tail": match.group(1).strip(),
|
|
@@ -269,9 +277,10 @@ def re_extract(entities, text):
|
|
| 269 |
# --------------------- 多实体关系抽取 ---------------------
|
| 270 |
relations = []
|
| 271 |
|
| 272 |
-
# 方案1:使用
|
| 273 |
-
if
|
| 274 |
try:
|
|
|
|
| 275 |
entity_list = [e["text"] for e in filtered_entities]
|
| 276 |
prompt = f"""请分析以下文本中的实体关系,严格按照JSON列表格式返回:
|
| 277 |
文本内容:{text}
|
|
@@ -282,9 +291,8 @@ def re_extract(entities, text):
|
|
| 282 |
3. 示例格式:[{{"head":"实体1", "tail":"实体2", "relation":"关系类型"}}]
|
| 283 |
请直接返回JSON,不要多余内容:"""
|
| 284 |
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
response = response[0]
|
| 288 |
|
| 289 |
# 增强JSON解析
|
| 290 |
try:
|
|
@@ -305,11 +313,13 @@ def re_extract(entities, text):
|
|
| 305 |
rel.get("relation") in valid_rel_types):
|
| 306 |
valid_relations.append(rel)
|
| 307 |
relations = valid_relations
|
|
|
|
|
|
|
| 308 |
except Exception as e:
|
| 309 |
print(f"[DEBUG] 关系解析失败: {str(e)}")
|
| 310 |
|
| 311 |
except Exception as e:
|
| 312 |
-
print(f"
|
| 313 |
|
| 314 |
# 方案2:规则兜底(当模型不可用或未抽取出关系时)
|
| 315 |
if len(relations) == 0:
|
|
@@ -317,18 +327,21 @@ def re_extract(entities, text):
|
|
| 317 |
location_matches = re.finditer(r'([^\s,。]+)[位于|坐落于|地处]([^\s,。]+)', text)
|
| 318 |
for match in location_matches:
|
| 319 |
head, tail = match.groups()
|
|
|
|
| 320 |
relations.append({"head": head, "tail": tail, "relation": "位于"})
|
| 321 |
|
| 322 |
# 规则2:A属于B
|
| 323 |
belong_matches = re.finditer(r'([^\s,。]+)(属于|隶属于)([^\s,。]+)', text)
|
| 324 |
for match in belong_matches:
|
| 325 |
head, _, tail = match.groups()
|
|
|
|
| 326 |
relations.append({"head": head, "tail": tail, "relation": "属于"})
|
| 327 |
|
| 328 |
# 规则3:人物-机构关系
|
| 329 |
person_org_pattern = r'([\u4e00-\u9fa5]{2,4})(现任|担任|就职于)([\u4e00-\u9fa5]+?公司|[\u4e00-\u9fa5]+?大学)'
|
| 330 |
for match in re.finditer(person_org_pattern, text):
|
| 331 |
head, _, tail = match.groups()
|
|
|
|
| 332 |
relations.append({"head": head, "tail": tail, "relation": "任职于"})
|
| 333 |
|
| 334 |
# 后处理:去重和验证
|
|
@@ -341,8 +354,11 @@ def re_extract(entities, text):
|
|
| 341 |
head_exists = any(e["text"] == rel["head"] for e in filtered_entities)
|
| 342 |
tail_exists = any(e["text"] == rel["tail"] for e in filtered_entities)
|
| 343 |
if head_exists and tail_exists:
|
|
|
|
| 344 |
final_relations.append(rel)
|
| 345 |
seen.add(key)
|
|
|
|
|
|
|
| 346 |
|
| 347 |
return final_relations
|
| 348 |
|
|
|
|
| 225 |
|
| 226 |
|
| 227 |
# ======================== 关系抽取(RE) ========================
|
| 228 |
+
def re_extract(entities, text, use_bert_model=True, bert_model=None):
|
| 229 |
# 参数校验
|
| 230 |
if not entities or not text:
|
| 231 |
+
print("[DEBUG] 参数校验失败,实体或文本为空")
|
| 232 |
return []
|
| 233 |
|
| 234 |
# 实体类型过滤(根据业务需求调整)
|
| 235 |
valid_entity_types = {"PER", "LOC", "ORG", "TITLE"}
|
| 236 |
filtered_entities = [e for e in entities if e.get("type") in valid_entity_types]
|
| 237 |
|
| 238 |
+
if not filtered_entities:
|
| 239 |
+
print("[DEBUG] 未找到有效的实体")
|
| 240 |
+
return []
|
| 241 |
+
|
| 242 |
# --------------------- 处理单实体场景 ---------------------
|
| 243 |
if len(filtered_entities) == 1:
|
| 244 |
single_relations = []
|
| 245 |
ent = filtered_entities[0]
|
| 246 |
+
print(f"[DEBUG] 处理单实体:{ent['text']},类型:{ent['type']}")
|
| 247 |
|
| 248 |
# 规则1:人物职位检测
|
| 249 |
if ent["type"] == "PER":
|
| 250 |
position_keywords = ["CEO", "经理", "总监", "工程师", "教授"]
|
| 251 |
for keyword in position_keywords:
|
| 252 |
if keyword in text:
|
| 253 |
+
print(f"[DEBUG] 发现职位关键词:{keyword}")
|
| 254 |
single_relations.append({
|
| 255 |
"head": ent["text"],
|
| 256 |
"tail": keyword,
|
|
|
|
| 265 |
if verb in text:
|
| 266 |
match = re.search(fr"{ent['text']}{verb}(.*?)[,。]", text)
|
| 267 |
if match:
|
| 268 |
+
print(f"[DEBUG] 发现位置关系:{ent['text']} {verb} {match.group(1)}")
|
| 269 |
single_relations.append({
|
| 270 |
"head": ent["text"],
|
| 271 |
"tail": match.group(1).strip(),
|
|
|
|
| 277 |
# --------------------- 多实体关系抽取 ---------------------
|
| 278 |
relations = []
|
| 279 |
|
| 280 |
+
# 方案1:使用BERT模型进行关系抽取
|
| 281 |
+
if use_bert_model and len(filtered_entities) >= 2:
|
| 282 |
try:
|
| 283 |
+
# 假设 BERT 模型是基于你自己训练的模型进行关系抽取
|
| 284 |
entity_list = [e["text"] for e in filtered_entities]
|
| 285 |
prompt = f"""请分析以下文本中的实体关系,严格按照JSON列表格式返回:
|
| 286 |
文本内容:{text}
|
|
|
|
| 291 |
3. 示例格式:[{{"head":"实体1", "tail":"实体2", "relation":"关系类型"}}]
|
| 292 |
请直接返回JSON,不要多余内容:"""
|
| 293 |
|
| 294 |
+
# 使用BERT模型进行关系抽取(这里假设模型函数是 `bert_model.predict`,具体调用方式按你模型接口调整)
|
| 295 |
+
response = bert_model.predict(prompt)
|
|
|
|
| 296 |
|
| 297 |
# 增强JSON解析
|
| 298 |
try:
|
|
|
|
| 313 |
rel.get("relation") in valid_rel_types):
|
| 314 |
valid_relations.append(rel)
|
| 315 |
relations = valid_relations
|
| 316 |
+
else:
|
| 317 |
+
print("[DEBUG] 未能解析出关系JSON")
|
| 318 |
except Exception as e:
|
| 319 |
print(f"[DEBUG] 关系解析失败: {str(e)}")
|
| 320 |
|
| 321 |
except Exception as e:
|
| 322 |
+
print(f"[DEBUG] BERT模型关系抽取异常: {str(e)}")
|
| 323 |
|
| 324 |
# 方案2:规则兜底(当模型不可用或未抽取出关系时)
|
| 325 |
if len(relations) == 0:
|
|
|
|
| 327 |
location_matches = re.finditer(r'([^\s,。]+)[位于|坐落于|地处]([^\s,。]+)', text)
|
| 328 |
for match in location_matches:
|
| 329 |
head, tail = match.groups()
|
| 330 |
+
print(f"[DEBUG] 发现位于关系:{head} 位于 {tail}")
|
| 331 |
relations.append({"head": head, "tail": tail, "relation": "位于"})
|
| 332 |
|
| 333 |
# 规则2:A属于B
|
| 334 |
belong_matches = re.finditer(r'([^\s,。]+)(属于|隶属于)([^\s,。]+)', text)
|
| 335 |
for match in belong_matches:
|
| 336 |
head, _, tail = match.groups()
|
| 337 |
+
print(f"[DEBUG] 发现属于关系:{head} 属于 {tail}")
|
| 338 |
relations.append({"head": head, "tail": tail, "relation": "属于"})
|
| 339 |
|
| 340 |
# 规则3:人物-机构关系
|
| 341 |
person_org_pattern = r'([\u4e00-\u9fa5]{2,4})(现任|担任|就职于)([\u4e00-\u9fa5]+?公司|[\u4e00-\u9fa5]+?大学)'
|
| 342 |
for match in re.finditer(person_org_pattern, text):
|
| 343 |
head, _, tail = match.groups()
|
| 344 |
+
print(f"[DEBUG] 发现人物职位关系:{head} {tail}")
|
| 345 |
relations.append({"head": head, "tail": tail, "relation": "任职于"})
|
| 346 |
|
| 347 |
# 后处理:去重和验证
|
|
|
|
| 354 |
head_exists = any(e["text"] == rel["head"] for e in filtered_entities)
|
| 355 |
tail_exists = any(e["text"] == rel["tail"] for e in filtered_entities)
|
| 356 |
if head_exists and tail_exists:
|
| 357 |
+
print(f"[DEBUG] 添加有效关系:{rel}")
|
| 358 |
final_relations.append(rel)
|
| 359 |
seen.add(key)
|
| 360 |
+
else:
|
| 361 |
+
print(f"[DEBUG] 无效关系:{rel}")
|
| 362 |
|
| 363 |
return final_relations
|
| 364 |
|