# Source: Hugging Face Space chen666-666/wechat-ner-re (status: Sleeping)
# File size: 23,184 bytes

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoModel
import gradio as gr
import re
import os
import json
import chardet
from sklearn.metrics import precision_score, recall_score, f1_score
import time
# ======================== 数据库模块 ========================
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from contextlib import contextmanager
import logging

# Configure root logging: INFO level, "timestamp - level - message" format.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Database connections are managed through SQLAlchemy's connection pool.
DATABASE_URL = "mysql+pymysql://user:password@host/dbname"  # Placeholder: replace with the real connection string

# Create the engine (connection pool): pool_size=10, max_overflow=20.
# NOTE(review): echo=True is SQLAlchemy's statement-logging flag; confirm it is wanted outside debugging.
engine = create_engine(DATABASE_URL, pool_size=10, max_overflow=20, echo=True)

# Session factory bound to the engine above; get_db_connection() calls it.
Session = sessionmaker(bind=engine)

@contextmanager
def get_db_connection():
    """Yield a database session wrapped in one transaction.

    The transaction is committed only when the ``with`` body finishes
    without raising. If the body (or the commit itself) raises, the
    transaction is rolled back and the exception is re-raised so the caller
    can observe the failure. The session is closed in every case.

    :return: generator-based context manager yielding a session created by
        the module-level ``Session`` factory
    :raises Exception: whatever session creation, the ``with`` body or the
        commit raised, after it has been logged
    """
    session = None
    try:
        session = Session()  # obtain a session from the factory
        logging.info("✅ 数据库连接已建立")
        yield session
    except Exception as e:
        logging.error(f"❌ 数据库操作时发生错误: {e}")
        if session is not None:
            session.rollback()
        # Re-raise: swallowing here would make the `with` block look
        # successful to the caller even though nothing was persisted.
        raise
    else:
        # Success path only: never commit after a failed body.
        try:
            session.commit()
            logging.info("✅ 数据库事务已提交")
        except Exception as e:
            logging.error(f"❌ 提交事务时发生错误: {e}")
            session.rollback()
            raise
    finally:
        if session is not None:
            session.close()  # release the connection
            logging.info("✅ 数据库连接已关闭")

def save_to_db(table, data):
    """Insert one record into an allow-listed table.

    :param table: table name; only "entities" and "relations" are accepted
    :param data: dict of column values, passed as keyword arguments to the
        ORM model returned by ``get_table_model``
    :return: True only when the record was added and committed; False when
        the table is not allowed or any step failed (the error is logged)
    """
    valid_tables = ("entities", "relations")  # only these tables may be written
    saved = False
    try:
        if table not in valid_tables:
            raise ValueError(f"Invalid table: {table}")

        with get_db_connection() as conn:
            table_model = get_table_model(table)
            conn.add(table_model(**data))
            conn.commit()
            # A context manager may suppress an exception raised in its body,
            # in which case the `with` statement exits normally. Track success
            # explicitly instead of inferring it from reaching the end.
            saved = True
    except Exception as e:
        logging.error(f"❌ 保存数据时发生错误: {e}")
        return False
    return saved

def get_table_model(table_name):
    """Resolve a table name to its ORM model class.

    The model is imported lazily from the project's ``models`` module, so
    that module is only required when a supported table is requested.

    :param table_name: "entities" or "relations"
    :return: the ``Entity`` or ``Relation`` class from ``models``
    :raises ValueError: for any other table name
    """
    if table_name == "entities":
        from models import Entity as model
    elif table_name == "relations":
        from models import Relation as model
    else:
        raise ValueError(f"Unknown table: {table_name}")
    return model


# ======================== Model loading ========================
# Tokenizer and token-classification model are loaded once at import time and
# wrapped in a "ner" pipeline that ner() calls for every text chunk.
NER_MODEL_NAME = "uer/roberta-base-finetuned-cluener2020-chinese"
bert_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
bert_ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)
bert_ner_pipeline = pipeline(
    "ner",
    model=bert_ner_model,
    tokenizer=bert_tokenizer,
    aggregation_strategy="first"
)

# ChatGLM is disabled: ner() only takes its ChatGLM branch when this is True.
use_chatglm = False


# chatglm_model, chatglm_tokenizer = None, None
# use_chatglm = False
# try:
#     chatglm_model_name = "THUDM/chatglm-6b-int4"
#     chatglm_tokenizer = AutoTokenizer.from_pretrained(chatglm_model_name, trust_remote_code=True)
#     chatglm_model = AutoModel.from_pretrained(
#         chatglm_model_name,
#         trust_remote_code=True,
#         device_map="cpu",
#         torch_dtype=torch.float32
#     ).eval()
#     use_chatglm = True
#     print("✅ 4-bit量化版ChatGLM加载成功")
# except Exception as e:
#     print(f"❌ ChatGLM加载失败: {e}")

# ======================== Knowledge graph structure ========================
# In-memory graph. visualize_kg_text() reads each entity as an indexable pair
# (ent[0], ent[1]) and unpacks each relation as a 3-tuple (head, tail, relation).
knowledge_graph = {"entities": set(), "relations": set()}


def update_knowledge_graph(entities, relations):
    """Persist entities and relations and mirror them into the in-memory graph.

    Each well-formed item is written through ``save_to_db`` and also added to
    the module-level ``knowledge_graph`` sets, which is what
    ``visualize_kg_text`` renders. Malformed items are skipped.

    :param entities: iterable of dicts; items need 'text' and 'type'
        ('start' / 'end' are optional and default to -1 in the stored row)
    :param relations: iterable of dicts; items need 'head', 'tail' and
        'relation'
    """
    # Entities
    for e in entities:
        if isinstance(e, dict) and 'text' in e and 'type' in e:
            save_to_db('entities', {
                'text': e['text'],
                'type': e['type'],
                'start_pos': e.get('start', -1),
                'end_pos': e.get('end', -1),
                'source': 'user_input'
            })
            # Stored as (text, type): the pair shape visualize_kg_text indexes.
            # Added regardless of the DB result so the on-screen graph still
            # reflects what was extracted when persistence is unavailable.
            knowledge_graph["entities"].add((e['text'], e['type']))

    # Relations
    for r in relations:
        if isinstance(r, dict) and all(k in r for k in ("head", "tail", "relation")):
            save_to_db('relations', {
                'head_entity': r['head'],
                'tail_entity': r['tail'],
                'relation_type': r['relation'],
                'source_text': ''  # the originating text could be linked here
            })
            # Stored as (head, tail, relation): the order visualize_kg_text unpacks.
            knowledge_graph["relations"].add((r['head'], r['tail'], r['relation']))


def visualize_kg_text():
    """Render the in-memory knowledge graph as plain text.

    :return: an entity section followed by a blank line and a relation
        section, one item per line
    """
    lines = ["📌 实体:"]
    lines.extend(f"{ent[0]} ({ent[1]})" for ent in knowledge_graph["entities"])
    lines.append("")
    lines.append("📎 关系:")
    lines.extend(
        f"{h} --[{r}]-> {t}" for h, t, r in knowledge_graph["relations"]
    )
    return "\n".join(lines)

# ======================== 实体识别（NER） ========================
def merge_adjacent_entities(entities):
    """Merge consecutive entities of the same type that touch each other.

    Two neighbours are merged when they share a 'type' and the second one's
    'start' equals the first one's 'end'; the merged entity concatenates the
    texts and extends 'end'. The input dicts are copied, so the caller's
    objects are never modified.

    :param entities: list of dicts with 'text', 'type', 'start' and 'end'
    :return: a new list of merged entity dicts (the input itself when empty)
    """
    if not entities:
        return entities

    merged = []
    for entity in entities:
        last = merged[-1] if merged else None
        if (last is not None
                and entity["type"] == last["type"]
                and entity["start"] == last["end"]):
            last["text"] += entity["text"]
            last["end"] = entity["end"]
        else:
            # Copy so that later in-place merging does not leak into the input.
            merged.append(dict(entity))

    return merged


def ner(text, model_type="bert"):
    """Extract entities from ``text`` and report how long it took.

    When ``model_type`` is "chatglm" and the module flag ``use_chatglm`` is
    set, the ChatGLM model is prompted for a JSON list of entities; any
    failure on that path yields an empty list. Otherwise the BERT pipeline is
    run over 510-character slices of the text, offsets are shifted back to
    positions in the full text, labels are mapped through ``LABEL_MAPPING``
    and adjacent same-type entities are merged.

    :param text: input text
    :param model_type: "bert" (default) or "chatglm"
    :return: tuple ``(entities, elapsed_seconds)`` where each entity is a
        dict with 'text', 'type', 'start' and 'end'
    """
    started_at = time.time()

    def elapsed():
        return time.time() - started_at

    # ChatGLM path (only when requested and enabled)
    if model_type == "chatglm" and use_chatglm:
        try:
            prompt = f"""请从以下文本中识别所有实体，严格按照JSON列表格式返回，每个实体包含text、type、start、end字段：

示例：[{{"text": "北京", "type": "LOC", "start": 0, "end": 2}}]

文本：{text}"""
            reply = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
            if isinstance(reply, tuple):
                reply = reply[0]

            try:
                payload = re.search(r'\[.*\]', reply, re.DOTALL).group()
                candidates = json.loads(payload)
                required_keys = ("text", "type", "start", "end")
                kept = [
                    cand for cand in candidates
                    if all(key in cand for key in required_keys)
                ]
                return kept, elapsed()
            except Exception as e:
                print(f"JSON解析失败: {e}")
                return [], elapsed()
        except Exception as e:
            print(f"ChatGLM调用失败: {e}")
            return [], elapsed()

    # BERT path: run the pipeline slice by slice
    chunk_size = 510
    raw_hits = []
    for offset in range(0, len(text), chunk_size):
        hits = bert_ner_pipeline(text[offset:offset + chunk_size])
        for hit in hits:
            # Shift slice-relative offsets to positions in the full text.
            hit["start"] += offset
            hit["end"] += offset
        raw_hits.extend(hits)

    entities = []
    for hit in raw_hits:
        surface = hit['word'].replace(' ', '')
        begin = hit['start']
        finish = hit['end']
        label = hit.get('entity_group') or hit.get('entity')
        entities.append({
            "text": surface,
            "start": begin,
            "end": finish,
            "type": LABEL_MAPPING.get(label, label),
        })

    return merge_adjacent_entities(entities), elapsed()


# ------------------ Entity type normalization ------------------
# Maps lower-case model labels to the upper-case type codes used by the rest
# of this file. ner() looks up the pipeline label here and keeps the raw
# label when it has no entry.
LABEL_MAPPING = {
    "address": "LOC",
    "company": "ORG",
    "name": "PER",
    "organization": "ORG",
    "position": "TITLE",
    "government": "ORG",
    "scene": "LOC",
    "book": "WORK",
    "movie": "WORK",
    "game": "WORK"
}

# Smoke test: these statements sit at module level, so they run the NER
# pipeline once on a fixed sample sentence every time the module is loaded.
entities, processing_time = ner("Google in New York met Alice")

# Normalize entity types. ner() already maps labels on its BERT path; this
# second lookup leaves mapped values unchanged because none of the mapped
# values is itself a key of LABEL_MAPPING.
for e in entities:
    e["type"] = LABEL_MAPPING.get(e.get("type"), e.get("type"))

# Print the normalized entities.
print(f"[DEBUG] 标准化后实体列表: {[{'text': e['text'], 'type': e['type']} for e in entities]}")

# Print the processing time.
print(f"处理时间: {processing_time:.2f}秒")


# ======================== 关系抽取（RE） ========================
import re
import json

def _single_entity_relations(ent, text):
    """Rule-based relations for a text that contains exactly one usable entity."""
    found = []
    # The entity text is interpolated into regexes below; escape it so that
    # names containing metacharacters ("C++", "(" ...) are matched literally.
    name = re.escape(ent["text"])
    print(f"[DEBUG] 处理单实体：{ent['text']}，类型：{ent['type']}")

    if ent["type"] == "PER":
        position_keywords = ["CEO", "经理", "总监", "工程师", "教授", "首席"]
        # Keyword rule: the first position keyword that occurs anywhere in the text.
        for keyword in position_keywords:
            if keyword in text:
                print(f"[DEBUG] 发现职位关键词：{keyword}")
                found.append({
                    "head": ent["text"],
                    "tail": keyword,
                    "relation": "担任职位"
                })
                break
        # Sentence rule: "<name>是<title>" up to the next delimiter. The
        # delimiter (or end of text) is required; with an optional delimiter
        # the lazy group would always stop after a single character.
        match = re.search(rf"{name}是(.{{1,10}}?)(?:，|。|的|$)", text)
        if match:
            title = match.group(1)
            if any(t in title for t in position_keywords):
                relation = {
                    "head": ent["text"],
                    "tail": title,
                    "relation": "担任职位"
                }
                if relation not in found:  # keyword rule may have found the same pair
                    print(f"[DEBUG] 句式识别职位：{title}")
                    found.append(relation)

    elif ent["type"] in ("ORG", "LOC"):
        for verb in ("位于", "坐落于", "地处"):
            # Accept end of text as well as punctuation after the location.
            match = re.search(rf"{name}{verb}(.+?)(?:[，。]|$)", text)
            if match:
                print(f"[DEBUG] 发现位置关系：{ent['text']} {verb} {match.group(1)}")
                found.append({
                    "head": ent["text"],
                    "tail": match.group(1).strip(),
                    "relation": "位置"
                })
                break
    return found


def _model_relations(filtered_entities, text, bert_model):
    """Ask ``bert_model.predict`` for relations and keep the well-formed ones.

    Any exception is printed and whatever was collected so far is returned.
    """
    found = []
    try:
        entity_list = [e["text"] for e in filtered_entities]
        prompt = f"""请分析以下文本中的实体关系，严格按照JSON列表格式返回：

文本内容：{text}

候选实体：{entity_list}

要求：

1. 只返回存在明确关系的实体对

2. 关系类型使用：属于、位于、任职于、合作、其他

3. 示例格式：[{{"head":"实体1", "tail":"实体2", "relation":"关系类型"}}]

请直接返回JSON，不要多余内容："""

        response = bert_model.predict(prompt)  # custom model interface
        json_match = re.search(r'(\[.*?\])', response, re.DOTALL)
        if not json_match:
            print("[DEBUG] 未能解析出关系JSON")
            return found

        json_str = json_match.group(1)
        json_str = re.sub(r'[\u201c\u201d]', '"', json_str)  # curly -> straight quotes
        json_str = re.sub(r'(?<!,)\n', '', json_str)
        parsed = json.loads(json_str)

        valid_types = {"属于", "位于", "任职于", "合作", "其他"}
        entity_texts = {e["text"] for e in filtered_entities}
        for rel in parsed:
            if (
                isinstance(rel, dict)
                and rel.get("head") in entity_texts
                and rel.get("tail") in entity_texts
                and rel.get("relation") in valid_types
            ):
                print(f"[DEBUG] 模型抽取关系：{rel}")
                found.append(rel)
    except Exception as e:
        print(f"[DEBUG] BERT模型关系抽取异常: {str(e)}")
    return found


def _rule_relations(text):
    """Regex fallback: "A located in B", "A belongs to B" and person-organization patterns."""
    found = []

    # A 位于 B
    for match in re.finditer(r'([^\s，。]+?)(?:位于|坐落于|地处)([^\s，。]+)', text):
        head, tail = match.groups()
        print(f"[DEBUG] 发现位于关系：{head} 位于 {tail}")
        found.append({"head": head, "tail": tail, "relation": "位于"})

    # A 属于 B
    for match in re.finditer(r'([^\s，。]+?)(?:属于|隶属于)([^\s，。]+)', text):
        head, tail = match.groups()
        print(f"[DEBUG] 发现属于关系：{head} 属于 {tail}")
        found.append({"head": head, "tail": tail, "relation": "属于"})

    # Person - organization
    person_org_pattern = r'([\u4e00-\u9fa5]{2,4})(?:现任|担任|就职于)([\u4e00-\u9fa5]+?公司|[\u4e00-\u9fa5]+?大学)'
    for match in re.finditer(person_org_pattern, text):
        head, tail = match.groups()
        print(f"[DEBUG] 发现任职关系：{head} 任职于 {tail}")
        found.append({"head": head, "tail": tail, "relation": "任职于"})

    return found


def re_extract(entities, text, use_bert_model=True, bert_model=None):
    """Extract relations between the entities found in ``text``.

    Only entities of type PER, LOC, ORG or TITLE are considered. With exactly
    one such entity, keyword/sentence rules produce relations whose tail is
    taken from the text. With two or more, ``bert_model.predict`` is consulted
    when a model is supplied; if that yields nothing, regex rules are used.
    Multi-entity results are de-duplicated and restricted to pairs whose head
    and tail are both known entity texts.

    :param entities: list of entity dicts with at least 'text' and 'type'
    :param text: the source text
    :param use_bert_model: whether to try the model before the regex rules
    :param bert_model: object exposing ``predict(prompt) -> str``; when None
        the model step is skipped
    :return: list of dicts with 'head', 'tail' and 'relation'
    """
    # ------------------ Argument validation ------------------
    if not entities or not text:
        print("[DEBUG] 参数校验失败，实体或文本为空")
        return []

    valid_entity_types = {"PER", "LOC", "ORG", "TITLE"}
    filtered_entities = [e for e in entities if e.get("type") in valid_entity_types]

    if not filtered_entities:
        print("[DEBUG] 未找到有效的实体")
        return []

    # ------------------ Single entity ------------------
    if len(filtered_entities) == 1:
        return _single_entity_relations(filtered_entities[0], text)

    # ------------------ Multiple entities ------------------
    relations = []
    # Skip the model step when no model was supplied instead of relying on
    # the AttributeError that calling None.predict would raise.
    if use_bert_model and bert_model is not None:
        relations = _model_relations(filtered_entities, text, bert_model)

    # ------------------ Rule fallback ------------------
    if not relations:
        print("[DEBUG] 启用规则兜底抽取关系")
        relations = _rule_relations(text)

    # ------------------ De-duplication and validation ------------------
    seen = set()
    final_relations = []
    entity_text_set = {e["text"] for e in filtered_entities}
    for rel in relations:
        key = (rel["head"], rel["tail"], rel["relation"])
        if key not in seen and rel["head"] in entity_text_set and rel["tail"] in entity_text_set:
            final_relations.append(rel)
            seen.add(key)
            print(f"[DEBUG] 添加有效关系：{rel}")
        else:
            print(f"[DEBUG] 忽略无效关系：{rel}")

    return final_relations


# ======================== 文本分析主流程 ========================
def process_text(text, model_type="bert"):
    """Run NER and relation extraction on ``text`` and format the results.

    The extracted entities and relations are also passed to
    ``update_knowledge_graph``.

    :param text: input text
    :param model_type: model selector forwarded to ``ner``
    :return: tuple of four strings: entity lines, relation lines, the
        rendered knowledge graph and the elapsed time
    """
    entities, duration = ner(text, model_type)
    relations = re_extract(entities, text)
    update_knowledge_graph(entities, relations)

    # Human-readable names for the type codes; unknown codes are shown as-is.
    display_labels = {
        "PER": "人名",
        "LOC": "地点",
        "ORG": "机构",
        "TIME": "时间",
        "TITLE": "职位",
        "PRODUCT": "产品"
    }

    entity_lines = []
    for ent in entities:
        surface = ent['text']
        label = display_labels.get(ent['type'], ent['type'])
        entity_lines.append(f"{surface} ({label}) [{ent['start']}-{ent['end']}]")
    ent_text = "\n".join(entity_lines)

    relation_lines = [
        f"{rel['head']} --[{rel['relation']}]-> {rel['tail']}" for rel in relations
    ]
    rel_text = "\n".join(relation_lines)

    return ent_text, rel_text, visualize_kg_text(), f"{duration:.2f} 秒"


def process_file(file, model_type="bert"):
    """Read an uploaded file, decode it and run ``process_text`` on it.

    At most 5 MiB (+1 byte, to detect oversize) is read, so an oversized
    upload is rejected without loading it fully. The encoding reported by
    ``chardet`` is tried first, then gb18030, utf-16 and big5.

    :param file: object with a ``name`` attribute holding the file path
    :param model_type: model selector forwarded to ``process_text``
    :return: the 4-tuple from ``process_text``, or an error message followed
        by three empty strings
    """
    max_bytes = 5 * 1024 * 1024
    fallback_encodings = ('gb18030', 'utf-16', 'big5')  # common Chinese encodings
    try:
        with open(file.name, 'rb') as f:
            content = f.read(max_bytes + 1)

        if len(content) > max_bytes:
            return "❌ 文件太大", "", "", ""

        detected = chardet.detect(content)['encoding'] or 'utf-8'
        text = None
        for enc in (detected, *fallback_encodings):
            try:
                text = content.decode(enc)
                break
            except (UnicodeDecodeError, LookupError):
                # LookupError: the detector reported a codec name Python does
                # not know; treat it like a failed decode and try the next one.
                continue
        if text is None:
            return "❌ 编码解析失败", "", "", ""

        return process_text(text, model_type)
    except Exception as e:
        return f"❌ 文件处理错误: {str(e)}", "", "", ""



# ======================== 模型评估与自动标注 ========================
def convert_telegram_json_to_eval_format(path):
    """Load a JSON file and convert it to a list of ``{"text", "entities"}`` items.

    Three input shapes are recognised, checked in this order:

    * a dict with a "text" key: one item whose entities are the substrings
      of the text selected by each entity's "start"/"end";
    * a list: returned unchanged;
    * a dict with a "messages" key: one item per message whose "text" is a
      string or a list of strings / dicts with a "text" key (the parts are
      concatenated); entities are left empty.

    Anything else yields an empty list.

    :param path: path to a UTF-8 encoded JSON file
    :return: list of items as described above
    """
    with open(path, encoding="utf-8") as fh:
        payload = json.load(fh)

    is_mapping = isinstance(payload, dict)

    if is_mapping and "text" in payload:
        full_text = payload["text"]
        spans = []
        for span in payload.get("entities", []):
            spans.append({"text": full_text[span["start"]:span["end"]]})
        return [{"text": full_text, "entities": spans}]

    if isinstance(payload, list):
        return payload

    if not (is_mapping and "messages" in payload):
        return []

    converted = []
    for message in payload.get("messages", []):
        body = message.get("text")
        if isinstance(body, str):
            converted.append({"text": body, "entities": []})
        elif isinstance(body, list):
            joined = ''.join(
                part["text"] if isinstance(part, dict) else part for part in body
            )
            converted.append({"text": joined, "entities": []})
    return converted


def _offset_matches(pred_pos, gold_pos, tolerance):
    """Return True when the gold offset is unknown or within ``tolerance`` of the prediction."""
    if gold_pos is None or gold_pos < 0:
        # Gold annotation carries no usable offset (-1 is the "missing"
        # default below); comparing against it numerically would reject
        # every prediction, so only text and type decide the match.
        return True
    return abs(pred_pos - gold_pos) <= tolerance


def evaluate_ner_model(data, model_type):
    """Score ``ner`` against annotated items and format precision/recall/F1.

    A prediction matches a not-yet-matched gold entity when text and
    (normalized) type are equal and, for every offset the gold entity
    provides, the predicted offset is within one character. Gold entities
    lacking 'text' or 'type' are ignored.

    :param data: iterable of dicts with 'text' and an optional 'entities'
        list of dicts ('text', 'type', optional 'start' / 'end')
    :param model_type: model selector forwarded to ``ner``
    :return: three-line string "Precision: ..", "Recall: ..", "F1: .."
        with two decimals each
    """
    tp, fp, fn = 0, 0, 0
    POS_TOLERANCE = 1

    for item in data:
        text = item["text"]

        # Gold annotations, with types normalized like the predictions.
        gold_entities = [
            {
                "text": e["text"],
                "type": LABEL_MAPPING.get(e["type"], e["type"]),
                "start": e.get("start", -1),
                "end": e.get("end", -1),
            }
            for e in item.get("entities", [])
            if "text" in e and "type" in e
        ]

        pred_entities, _ = ner(text, model_type)

        matched_gold = [False] * len(gold_entities)
        matched = 0

        # Greedy one-to-one matching: each gold entity is used at most once.
        for p in pred_entities:
            for g_idx, g in enumerate(gold_entities):
                if matched_gold[g_idx]:
                    continue
                if p["text"] != g["text"] or p["type"] != g["type"]:
                    continue
                if (_offset_matches(p["start"], g["start"], POS_TOLERANCE)
                        and _offset_matches(p["end"], g["end"], POS_TOLERANCE)):
                    matched_gold[g_idx] = True
                    matched += 1
                    break

        tp += matched
        fp += len(pred_entities) - matched
        fn += len(gold_entities) - matched

    # Guard against division by zero.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return (f"Precision: {precision:.2f}\n"
            f"Recall: {recall:.2f}\n"
            f"F1: {f1:.2f}")


def auto_annotate(file, model_type):
    """Annotate every item of an uploaded JSON file with predicted entities.

    :param file: object with a ``name`` attribute holding the file path
    :param model_type: model selector forwarded to ``ner``
    :return: the annotated items as pretty-printed JSON (non-ASCII kept as-is)
    """
    records = convert_telegram_json_to_eval_format(file.name)
    for record in records:
        predicted, _elapsed = ner(record["text"], model_type)
        record["entities"] = predicted
    return json.dumps(records, ensure_ascii=False, indent=2)


def save_json(json_text):
    """Write ``json_text`` to a timestamped file in the working directory.

    :param json_text: text to write (written verbatim, UTF-8)
    :return: the generated file name, ``auto_labeled_<unix time>.json``
    """
    stamp = int(time.time())
    fname = f"auto_labeled_{stamp}.json"
    with open(fname, "w", encoding="utf-8") as out:
        out.write(json_text)
    return fname


# ======================== 数据集导入 ========================
def import_dataset(path="D:/云边智算/暗语识别/filtered_results"):
    """Run ``process_text`` on the 'text' field of every ``.json`` file in a directory.

    Files are handled in sorted name order. A file that cannot be read or
    parsed, or that has no 'text' field, is skipped with a log line instead
    of aborting the whole import.

    :param path: directory containing the dataset files
    :return: the log (one line per ``.json`` file) as a single string, so it
        can be shown to the user; empty when no ``.json`` file was found
    :raises OSError: when ``path`` cannot be listed
    """
    log_lines = []
    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.json'):
            continue

        filepath = os.path.join(path, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both malformed JSON and undecodable bytes.
            message = f"跳过文件 {filename}: {e}"
        else:
            if isinstance(data, dict) and 'text' in data:
                # Reuse the regular analysis flow.
                process_text(data['text'])
                message = f"已处理文件: {filename}"
            else:
                message = f"跳过文件 {filename}: 缺少 text 字段"

        print(message)
        log_lines.append(message)

    return "\n".join(log_lines)


# ======================== Gradio 界面 ========================
# Build the five-tab UI and start the server. This runs at module level.
with gr.Blocks(css="""

    .kg-graph {height: 500px; overflow-y: auto;}

    .warning {color: #ff6b6b;}

""") as demo:
    gr.Markdown("# 🤖 聊天记录实体关系识别系统")

    # Tab 1: analyse text typed by the user.
    with gr.Tab("📄 文本分析"):
        input_text = gr.Textbox(lines=6, label="输入文本")
        model_type = gr.Radio(["bert", "chatglm"], value="bert", label="选择模型")
        btn = gr.Button("开始分析")
        out1 = gr.Textbox(label="识别实体")
        out2 = gr.Textbox(label="识别关系")
        out3 = gr.Textbox(label="知识图谱")
        out4 = gr.Textbox(label="耗时")
        btn.click(fn=process_text, inputs=[input_text, model_type], outputs=[out1, out2, out3, out4])

    # Tab 2: analyse an uploaded file (reuses the model selector of tab 1).
    with gr.Tab("🗂 文件分析"):
        file_input = gr.File(file_types=[".txt", ".json"])
        file_btn = gr.Button("上传并分析")
        fout1, fout2, fout3, fout4 = gr.Textbox(), gr.Textbox(), gr.Textbox(), gr.Textbox()
        file_btn.click(fn=process_file, inputs=[file_input, model_type], outputs=[fout1, fout2, fout3, fout4])

    # Tab 3: score the model against an annotated JSON file.
    with gr.Tab("📊 模型评估"):
        eval_file = gr.File(label="上传标注 JSON")
        eval_model = gr.Radio(["bert", "chatglm"], value="bert")
        eval_btn = gr.Button("开始评估")
        eval_output = gr.Textbox(label="评估结果", lines=5)
        eval_btn.click(lambda f, m: evaluate_ner_model(convert_telegram_json_to_eval_format(f.name), m),
                       [eval_file, eval_model], eval_output)

    # Tab 4: auto-annotate a raw JSON export and offer the result for download.
    with gr.Tab("✏️ 自动标注"):
        raw_file = gr.File(label="上传 Telegram 原始 JSON")
        auto_model = gr.Radio(["bert", "chatglm"], value="bert")
        auto_btn = gr.Button("自动标注")
        marked_texts = gr.Textbox(label="标注结果", lines=20)
        download_btn = gr.Button("💾 下载标注文件")
        auto_btn.click(fn=auto_annotate, inputs=[raw_file, auto_model], outputs=marked_texts)
        download_btn.click(fn=save_json, inputs=marked_texts, outputs=gr.File())

    # Tab 5: bulk-import a dataset directory.
    with gr.Tab("📂 数据管理"):
        gr.Markdown("### 数据集导入")
        dataset_path = gr.Textbox(
            value="D:/云边智算/暗语识别/filtered_results",
            label="数据集路径"
        )
        import_btn = gr.Button("导入数据集到数据库")
        import_output = gr.Textbox(label="导入日志")
        # Pass the textbox as an input so the handler receives what the user
        # typed; reading `dataset_path.value` inside a no-input callback only
        # ever yields the component's initial value.
        import_btn.click(fn=import_dataset, inputs=dataset_path, outputs=import_output)

# Listen on all interfaces, port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)