han liu committed on
Commit
ff78ef7
1 Parent(s): ca2a245
app.py ADDED
@@ -0,0 +1,182 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ ## @Author: liuhan(liuhan@idea.edu.cn)
+ ## @Created: 2022/12/28 11:24:43
+ # coding=utf-8
+ # Copyright 2021 The IDEA Authors. All rights reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from typing import List, Dict
+ from logging import basicConfig
+ import json
+ import os
+ import numpy as np
+ from transformers import AutoTokenizer
+ import argparse
+ import copy
+ import streamlit as st
+ import time
+
+
+
+ from models import BagualuIEModel, BagualuIEExtractModel
+
+
+ class BagualuIEPipelines:
+     def __init__(self, args: argparse.Namespace) -> None:
+         self.args = args
+         # load model
+         self.model = BagualuIEModel.from_pretrained(args.pretrained_model_root)
+
+
+         # get tokenizer
+         added_token = [f"[unused{i + 1}]" for i in range(99)]
+         self.tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_root,
+                                                        additional_special_tokens=added_token)
+
+     def predict(self, test_data: List[dict], cuda: bool = True) -> List[dict]:
+         """ predict
+
+         Args:
+             test_data (List[dict]): test data
+             cuda (bool, optional): cuda. Defaults to True.
+
+         Returns:
+             List[dict]: result
+         """
+         result = []
+         if cuda:
+             self.model = self.model.cuda()
+         self.model.eval()
+
+         batch_size = self.args.batch_size
+         extract_model = BagualuIEExtractModel(self.tokenizer, self.args)
+
+         for i in range(0, len(test_data), batch_size):
+             batch_data = test_data[i: i + batch_size]
+             batch_result = extract_model.extract(batch_data, self.model, cuda)
+             result.extend(batch_result)
+         return result
+
+
+ @st.experimental_memo()
+ def load_model(model_path):
+     parser = argparse.ArgumentParser()
+
+     # pipeline arguments
+     group_parser = parser.add_argument_group("pipelines args")
+     group_parser.add_argument("--pretrained_model_root", default="", type=str)
+     group_parser.add_argument("--load_checkpoints_path", default="", type=str)
+
+     group_parser.add_argument("--threshold_ent", default=0.3, type=float)
+     group_parser.add_argument("--threshold_rel", default=0.3, type=float)
+     group_parser.add_argument("--entity_multi_label", action="store_true", default=True)
+     group_parser.add_argument("--relation_multi_label", action="store_true", default=True)
+
+
+     # data model arguments
+     group_parser = parser.add_argument_group("data_model")
+     group_parser.add_argument("--batch_size", default=4, type=int)
+     group_parser.add_argument("--max_length", default=512, type=int)
+     # pytorch_lightning.Trainer arguments
+     args = parser.parse_args()
+     args.pretrained_model_root = model_path
+
+     model = BagualuIEPipelines(args)
+     return model
+
+ def main():
+
+     # model = load_model('/cognitive_comp/liuhan/pretrained/uniex_macbert_base_v7.1/')
+     model = load_model('IDEA-CCNL/Erlangshen-BERT-120M-IE-Chinese')
+
+     #
+
+     st.subheader("Erlangshen-BERT-120M-IE-Chinese Zero-shot 体验")
+
+
+
+     st.markdown("""
+     Erlangshen-BERT-120M-IE-Chinese是以110M参数的base模型为底座,基于大规模信息抽取数据进行预训练后的模型,
+     通过统一的抽取架构设计,可支持few-shot、zero-shot场景下的实体识别、关系三元组抽取任务。
+     更多信息见https://github.com/IDEA-CCNL/GTS-Engine/tree/main
+     模型效果见https://huggingface.co/IDEA-CCNL/Erlangshen-BERT-120M-IE-Chinese
+     """)
+
+     st.info("Please input the following information to experience Bagualu-IE「请输入以下信息开始体验 Bagualu-IE...」")
+     model_type = st.selectbox('Select task type「选择任务类型」',['Named Entity Recognition「命名实体识别」','Relation Extraction「关系抽取」'])
+     if '命名实体识别' in model_type:
+         example = st.selectbox('Example', ['Example: 人物信息', 'Example: 财经新闻'])
+     else:
+         example = st.selectbox('Example', ['Example: 雇佣关系', 'Example: 影视关系'])
+     form = st.form("参数设置")
+     if '命名实体识别' in model_type:
+         if '人物信息' in example:
+             sentences = form.text_area(
+                 "Please input the context「请输入句子」",
+                 "姚明,男,汉族,无党派人士,前中国职业篮球运动员。")
+             choice = form.text_input("Please input the choice「请输入抽取实体名称,用中文;分割」", "姓名;性别;民族;运动项目;政治面貌")
+         else:
+             sentences = form.text_area(
+                 "Please input the context「请输入句子」",
+                 "寒流吹响华尔街,摩根士丹利、高盛、瑞信三大银行裁员合计超过8千人")
+             choice = form.text_input("Please input the choice「请输入抽取实体名称,用中文;分割」", "裁员单位;裁员人数")
+
+     else:
+         if '雇佣关系' in example:
+             sentences = form.text_area(
+                 "Please input the context「请输入句子」",
+                 "东阳市企业家协会六届一次会员大会上,横店集团董事长、总裁徐永安当选为东阳市企业家协会会长。")
+             choice = form.text_input("Please input the choice「请输入抽取关系名称,用中文;分割(头实体类型|关系|尾实体类型)」", "企业|董事长|人物")
+         else:
+             sentences = form.text_area(
+                 "Please input the context「请输入句子」",
+                 "《傲骨贤妻第六季》是一套美国法律剧情电视连续剧,2014年9月29日在CBS上首播。")
+             choice = form.text_input("Please input the choice「请输入抽取关系名称,用中文;分割(头实体类型|关系|尾实体类型)」", "影视作品|上映时间|时间")
+
+     form.form_submit_button("Submit「点击一下,开始预测!」")
+
+
+     if '命名实体识别' in model_type:
+         data = [{"task": '实体识别',
+                  "text": sentences,
+                  "entity_list": [],
+                  "choice": choice.split(';'),
+                  }]
+     else:
+         choice = [one.split('|') for one in choice.split(';')]
+         data = [{"task": '关系抽取',
+                  "text": sentences,
+                  "entity_list": [],
+                  "choice": choice,
+                  }]
+
+
+     start = time.time()
+     # is_cuda= True if torch.cuda.is_available() else False
+     # result = model.predict(data, cuda=is_cuda)
+
+     # st.success(f"Prediction is successful, consumes {str(time.time()-start)} seconds")
+     # st.json(result[0])
+
+     rs = model.predict(data, False)
+     st.success(f"Prediction is successful, consumes {str(time.time() - start)} seconds")
+     st.json(rs[0])
+
+
+
+
+
+ if __name__ == "__main__":
+     main()
+
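For reference, predict() consumes and returns plain dicts. Below is a minimal standalone sketch of the I/O format that main() builds (not part of the commit; the score value is illustrative, and the output keys follow models/extract_model.py and dataloaders/item_decoder.py):

# Illustrative sketch only: the NER input item built in main() and the result shape.
data = [{
    "task": "实体识别",                # or "关系抽取"
    "text": "姚明,男,汉族,无党派人士,前中国职业篮球运动员。",
    "entity_list": [],
    "choice": ["姓名", "性别", "民族"],  # for 关系抽取: [["企业", "董事长", "人物"], ...]
}]
# result = model.predict(data, cuda=False)
# result[0]["entity_list"] -> [{"entity_text": "姚明", "entity_type": "姓名",
#                               "score": 0.97, "entity_index": [0, 2]}, ...]
# result[0]["spo_list"]    -> [] for NER; for 关系抽取 each item holds
#                             {"predicate", "score", "subject", "object"}.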
dataloaders/__init__.py ADDED
File without changes
dataloaders/dataset_utils.py ADDED
@@ -0,0 +1,57 @@
+ # coding=utf-8
+ # Copyright 2021 The IDEA Authors. All rights reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import collections
+ from typing import List, Dict, Tuple
+
+
+ def get_choice(spo_choice: list) -> tuple:
+     """ Extract the relations and entity types from the relation schema
+
+     Args:
+         spo_choice (list): relation schema
+
+     Returns:
+         tuple:
+             choice_ent (list)
+             choice_rel (list)
+             choice_head (list)
+             choice_tail (list)
+             entity2rel (dict)
+     """
+     choice_head = []
+     choice_tail = []
+     choice_ent = []
+     choice_rel = []
+     entity2rel = collections.defaultdict(list)  # "subject|object" -> [relation]
+
+     for head, rel, tail in spo_choice:
+
+         if head not in choice_head:
+             choice_head.append(head)
+         if tail not in choice_tail:
+             choice_tail.append(tail)
+
+         if head not in choice_ent:
+             choice_ent.append(head)
+         if tail not in choice_ent:
+             choice_ent.append(tail)
+
+         if rel not in choice_rel:
+             choice_rel.append(rel)
+
+         entity2rel[head, tail].append(rel)
+
+     return choice_ent, choice_rel, choice_head, choice_tail, entity2rel
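As a quick illustration of how this flattening behaves, here is a standalone sketch (not part of the commit; the schema literals reuse the relation examples from app.py above):

# Standalone sketch of get_choice on an app.py-style relation schema.
from dataloaders.dataset_utils import get_choice

schema = [["企业", "董事长", "人物"], ["影视作品", "上映时间", "时间"]]
ents, rels, heads, tails, e2r = get_choice(schema)
print(ents)       # ['企业', '人物', '影视作品', '时间']
print(rels)       # ['董事长', '上映时间']
print(dict(e2r))  # {('企业', '人物'): ['董事长'], ('影视作品', '时间'): ['上映时间']}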
dataloaders/item_decoder.py ADDED
@@ -0,0 +1,320 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The IDEA Authors. All rights reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # from collections import defaultdict
17
+ from typing import List, Tuple, Dict
18
+ import argparse
19
+ import numpy as np
20
+ from transformers import PreTrainedTokenizer
21
+
22
+ from .item_encoder import entity_based_tokenize, get_entity_indices
23
+ from .dataset_utils import get_choice
24
+
25
+
26
+ class ItemDecoder(object):
27
+ """ Decoder
28
+
29
+ Args:
30
+ tokenizer (PreTrainedTokenizer): tokenizer
31
+ args (TrainingArgumentsIEStd): arguments
32
+ """
33
+ def __init__(self,
34
+ tokenizer: PreTrainedTokenizer,
35
+ args: argparse.Namespace) -> None:
36
+ self.tokenizer = tokenizer
37
+ self.max_length = args.max_length
38
+ self.threshold_entity = args.threshold_ent
39
+ self.threshold_rel = args.threshold_rel
40
+ self.entity_multi_label = args.entity_multi_label
41
+ self.relation_multi_label = args.relation_multi_label
42
+
43
+ def extract_entity_index(self,
44
+ entity_logits: np.ndarray,
45
+ ) -> List[Tuple[int, int]]:
46
+ """ extract entity index
47
+
48
+ Args:
49
+ entity_logits (np.ndarray): entity_logits
50
+
51
+ Returns:
52
+ List[Tuple[int, int]]: result
53
+ """
54
+
55
+ l, _, d = entity_logits.shape
56
+ result = []
57
+ for i in range(l):
58
+ for j in range(i, l):
59
+ if self.entity_multi_label:
60
+ for k in range(d):
61
+ entity_score = float(entity_logits[i, j, k])
62
+ if entity_score > self.threshold_entity:
63
+ result.append((i, j, k, entity_score))
64
+
65
+ else:
66
+ k = np.argmax(entity_logits[i, j])
67
+ entity_score = float(entity_logits[i, j, k])
68
+ if entity_score > self.threshold_entity:
69
+ result.append((i, j, k, entity_score))
70
+
71
+ return result
72
+
73
+ @staticmethod
74
+ def extract_entity(text: str,
75
+ entity_idx: List[int],
76
+ entity_type: str,
77
+ entity_score: float,
78
+ text_start_id: int,
79
+ offset_mapping: List[List[int]]) -> dict:
80
+ """ extract entity
81
+
82
+ Args:
83
+ text (str): text
84
+ entity_idx (List[int]): entity indices
85
+ entity_type (str): entity type
86
+ entity_score (float): entity score
87
+ text_start_id (int): text_start_id
88
+ offset_mapping (List[List[int]]): offset mapping
89
+
90
+ Returns:
91
+ dict: entity
92
+ """
93
+ entity_start, entity_end = entity_idx[0] - text_start_id, entity_idx[1] - text_start_id
94
+
95
+ start_split = offset_mapping[entity_start] if 0 <= entity_start < len(offset_mapping) else []
96
+ end_split = offset_mapping[entity_end] if 0 <= entity_end < len(offset_mapping) else []
97
+
98
+ if not start_split or not end_split:
99
+ return None
100
+
101
+ start_idx, end_idx = start_split[0], end_split[-1]
102
+ entity_text = text[start_idx: end_idx]
103
+
104
+ if not entity_text:
105
+ return None
106
+
107
+ entity = {
108
+ "entity_text": entity_text,
109
+ "entity_type": entity_type,
110
+ "score": entity_score,
111
+ "entity_index": [start_idx, end_idx]
112
+ }
113
+
114
+ return entity
115
+
116
+ def decode_ner(self,
117
+ text: str,
118
+ choice: List[str],
119
+ sample_span_logits: np.ndarray,
120
+ offset_mapping: List[List[int]]
121
+ ) -> List[dict]:
122
+ """ NER decode
123
+
124
+ Args:
125
+ text (str): text
126
+ choice (List[str]): choice
127
+ sample_span_logits (np.ndarray): sample span_logits
128
+ offset_mapping (List[List[int]]): offset mapping
129
+
130
+
131
+ Returns:
132
+ List[dict]: decoded entity list
133
+ """
134
+ entity_list = []
135
+
136
+ entity_idx_list = self.extract_entity_index(sample_span_logits)
137
+
138
+ for entity_start, entity_end, entity_type_idx, entity_score in entity_idx_list:
139
+
140
+ entity = self.extract_entity(text,
141
+ [entity_start, entity_end],
142
+ choice[entity_type_idx],
143
+ entity_score,
144
+ text_start_id=1,
145
+ offset_mapping=offset_mapping)
146
+
147
+ if entity is None:
148
+ continue
149
+
150
+ if entity not in entity_list:
151
+ entity_list.append(entity)
152
+
153
+ return entity_list
154
+
155
+ def decode_spo(self,
156
+ text: str,
157
+ choice: List[List[str]],
158
+ sample_span_logits: np.ndarray,
159
+ offset_mapping: List[List[int]]) -> tuple:
160
+ """ SPO decode
161
+
162
+ Args:
163
+ text (str): text
164
+ choice (List[List[str]]): choice
165
+ sample_span_logits (np.ndarray): sample span_logits
166
+ offset_mapping (List[List[int]): offset mapping
167
+
168
+ Returns:
169
+ List[dict]: decoded spo list
170
+ List[dict]: decoded entity list
171
+ """
172
+ spo_list = []
173
+ entity_list = []
174
+
175
+ choice_ent, choice_rel, choice_head, choice_tail, entity2rel = get_choice(choice)
176
+
177
+ entity_logits = sample_span_logits[:, :, : len(choice_ent)] # (seq_len, seq_len, num_entity)
178
+ relation_logits = sample_span_logits[:, :, len(choice_ent): ] # (seq_len, seq_len, num_relation)
179
+
180
+ entity_idx_list = self.extract_entity_index(entity_logits)
181
+
182
+ head_list = []
183
+ tail_list = []
184
+ for entity_start, entity_end, entity_type_idx, entity_score in entity_idx_list:
185
+
186
+ entity_type = choice_ent[entity_type_idx]
187
+
188
+ entity = self.extract_entity(text,
189
+ [entity_start, entity_end],
190
+ entity_type,
191
+ entity_score,
192
+ text_start_id=1,
193
+ offset_mapping=offset_mapping)
194
+
195
+ if entity is None:
196
+ continue
197
+
198
+ if entity_type in choice_head:
199
+ head_list.append((entity_start, entity_end, entity_type, entity))
200
+ if entity_type in choice_tail:
201
+ tail_list.append((entity_start, entity_end, entity_type, entity))
202
+
203
+ for head_start, head_end, subject_type, subject_dict in head_list:
204
+ for tail_start, tail_end, object_type, object_dict in tail_list:
205
+
206
+ if subject_dict == object_dict:
207
+ continue
208
+
209
+ if (subject_type, object_type) not in entity2rel.keys():
210
+ continue
211
+
212
+ relation_candidates = entity2rel[subject_type, object_type]
213
+ rel_idx = [choice_rel.index(r) for r in relation_candidates]
214
+
215
+ so_rel_logits = relation_logits[:, :, rel_idx]
216
+
217
+ if self.relation_multi_label:
218
+ for idx, predicate in enumerate(relation_candidates):
219
+ rel_score = so_rel_logits[head_start, tail_start, idx] + \
220
+ so_rel_logits[head_end, tail_end, idx]
221
+ predicate_score = float(rel_score / 2)
222
+
223
+ if predicate_score <= self.threshold_rel:
224
+ continue
225
+
226
+ if subject_dict not in entity_list:
227
+ entity_list.append(subject_dict)
228
+ if object_dict not in entity_list:
229
+ entity_list.append(object_dict)
230
+
231
+ spo = {
232
+ "predicate": predicate,
233
+ "score": predicate_score,
234
+ "subject": subject_dict,
235
+ "object": object_dict,
236
+ }
237
+
238
+ if spo not in spo_list:
239
+ spo_list.append(spo)
240
+
241
+ else:
242
+
243
+ hh_idx = np.argmax(so_rel_logits[head_start, head_end])
244
+ tt_idx = np.argmax(so_rel_logits[tail_start, tail_end])
245
+ hh_score = so_rel_logits[head_start, tail_start, hh_idx] + so_rel_logits[head_end, tail_end, hh_idx]
246
+ tt_score = so_rel_logits[head_start, tail_start, tt_idx] + so_rel_logits[head_end, tail_end, tt_idx]
247
+
248
+ predicate = relation_candidates[hh_idx] if hh_score > tt_score else relation_candidates[tt_idx]
249
+
250
+ predicate_score = float(max(hh_score, tt_score) / 2)
251
+
252
+ if predicate_score <= self.threshold_rel:
253
+ continue
254
+
255
+ if subject_dict not in entity_list:
256
+ entity_list.append(subject_dict)
257
+ if object_dict not in entity_list:
258
+ entity_list.append(object_dict)
259
+
260
+ spo = {
261
+ "predicate": predicate,
262
+ "score": predicate_score,
263
+ "subject": subject_dict,
264
+ "object": object_dict,
265
+ }
266
+
267
+ if spo not in spo_list:
268
+ spo_list.append(spo)
269
+
270
+ return spo_list, entity_list
271
+
272
+ def decode(self,
273
+ item: Dict,
274
+ span_logits: np.ndarray,
275
+ label_mask: np.ndarray,
276
+ ):
277
+ """ decode
278
+
279
+ Args:
280
+ task (str): task name
281
+ choice (list): choice
282
+ text (str): text
283
+ span_logits (np.ndarray): sample span_logits
284
+ label_mask (np.ndarray): label_mask
285
+
286
+ Raises:
287
+ NotImplementedError: raised if task name is not supported
288
+
289
+ Returns:
290
+ List[dict]: decoded entity list
291
+ List[dict]: decoded spo list
292
+ """
293
+ task, choice, text = item["task"], item["choice"], item["text"]
294
+ entity_indices = get_entity_indices(item.get("entity_list", []), item.get("spo_list", []))
295
+ _, offset_mapping = entity_based_tokenize(text, self.tokenizer, entity_indices,
296
+ return_offsets_mapping=True)
297
+
298
+ assert span_logits.shape == label_mask.shape
299
+
300
+ span_logits = span_logits + (label_mask - 1) * 100000
301
+
302
+ spo_list = []
303
+ entity_list = []
304
+
305
+ if task in {"实体识别", "抽取任务"}:
306
+ entity_list = self.decode_ner(text,
307
+ choice,
308
+ span_logits,
309
+ offset_mapping)
310
+
311
+ elif task in {"关系抽取"}:
312
+ spo_list, entity_list = self.decode_spo(text,
313
+ choice,
314
+ span_logits,
315
+ offset_mapping)
316
+
317
+ else:
318
+ raise NotImplementedError
319
+
320
+ return entity_list, spo_list
dataloaders/item_encoder.py ADDED
@@ -0,0 +1,534 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The IDEA Authors. All rights reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # pylint: disable=no-member
17
+
18
+ from typing import List, Tuple, Dict, Union
19
+
20
+ import numpy as np
21
+ import torch
22
+ import torch.nn as nn
23
+ from transformers import PreTrainedTokenizer
24
+
25
+ from .dataset_utils import get_choice
26
+
27
+
28
+ def get_entity_indices(entity_list: List[dict], spo_list: List[dict]) -> List[List[int]]:
29
+ """ 获取样本中包含的实体位置信息
30
+
31
+ Args:
32
+ entity_list (List[dict]): 实体列表
33
+ spo_list (List[dict]): 三元组列表
34
+
35
+ Returns:
36
+ List[List[int]]: 实体位置信息
37
+ """
38
+ entity_indices = []
39
+
40
+ # 实体中的实体位置
41
+ for entity in entity_list:
42
+ entity_index = entity["entity_index"]
43
+ entity_indices.append(entity_index)
44
+
45
+ # 三元组中的实体位置
46
+ for spo in spo_list:
47
+ sub_idx = spo["subject"]["entity_index"]
48
+ obj_idx = spo["object"]["entity_index"]
49
+ entity_indices.append(sub_idx)
50
+ entity_indices.append(obj_idx)
51
+
52
+ return entity_indices
53
+
54
+
55
+ def entity_based_tokenize(text: str,
56
+ tokenizer: PreTrainedTokenizer,
57
+ enitity_indices: List[Tuple[int, int]],
58
+ max_len: int = -1,
59
+ return_offsets_mapping: bool = False) \
60
+ -> Union[List[int], Tuple[List[int], List[Tuple[int, int]]]]:
61
+ """ 基于实体位置信息的编码,确保实体为连续1到多个token的合并,同时利用预训练模型词根信息
62
+
63
+ Args:
64
+ text (str): 文本
65
+ tokenizer (PreTrainedTokenizer): tokenizer
66
+ enitity_indices (List[Tuple[int, int]]): 实体位置信息
67
+ max_len (int, optional): 长度限制. Defaults to -1.
68
+ return_offsets_mapping (bool, optional): 是否返回offsets_mapping. Defaults to False.
69
+
70
+ Returns:
71
+ Union[List[int], Tuple[List[int], List[Tuple[int, int]]]]: 编码id
72
+ """
73
+ # 根据实体位置遍历出需要对文本进行切割的点
74
+ split_points = sorted(list({i for idx in enitity_indices for i in idx} | {0, len(text)}))
75
+ # 对文本进行切割
76
+ text_parts = []
77
+ for i in range(0, len(split_points) - 1):
78
+ text_parts.append(text[split_points[i]: split_points[i + 1]])
79
+
80
+ # 对切割后的文本进行编码
81
+ bias = 0
82
+ text_ids = []
83
+ offset_mapping = []
84
+ for part in text_parts:
85
+
86
+ part_encoded = tokenizer(part, add_special_tokens=False, return_offsets_mapping=True)
87
+ part_ids, part_mapping = part_encoded["input_ids"], part_encoded["offset_mapping"]
88
+
89
+ text_ids.extend(part_ids)
90
+ for start, end in part_mapping:
91
+ offset_mapping.append((start + bias, end + bias))
92
+
93
+ bias += len(part)
94
+
95
+ if max_len > 0:
96
+ text_ids = text_ids[: max_len]
97
+
98
+ # 是否返回offsets_mapping
99
+ if return_offsets_mapping:
100
+ return text_ids, offset_mapping
101
+ return text_ids
102
+
103
+
104
+ class ItemEncoder(object):
105
+ """ Item Encoder
106
+
107
+ Args:
108
+ tokenizer (PreTrainedTokenizer): tokenizer
109
+ max_length (int): max length
110
+ """
111
+ def __init__(self, tokenizer: PreTrainedTokenizer, max_length: int) -> None:
112
+ self.tokenizer = tokenizer
113
+ self.max_length = max_length
114
+
115
+ def search_index(self,
116
+ entity_idx: List[int],
117
+ offset_mapping: List[Tuple[int, int]],
118
+ bias: int = 0) -> Tuple[int, int]:
119
+ """ 查找实体在tokens中的索引
120
+
121
+ Args:
122
+ entity_idx (List[int]): entity index
123
+ offset_mapping (List[Tuple[int, int]]): text
124
+ bias (int): bias
125
+
126
+ Returns:
127
+ Tuple[int]: (start_idx, end_idx)
128
+ """
129
+ entity_start, entity_end = entity_idx
130
+ start_idx, end_idx = -1, -1
131
+
132
+ for token_idx, (start, end) in enumerate(offset_mapping):
133
+ if start == entity_start:
134
+ start_idx = token_idx
135
+ if end == entity_end:
136
+ end_idx = token_idx
137
+ assert start_idx >= 0 and end_idx >= 0
138
+
139
+ return start_idx + bias, end_idx + bias
140
+
141
+ @staticmethod
142
+ def get_position_ids(text_len: int,
143
+ ent_ranges: List,
144
+ rel_ranges: List) -> np.ndarray:
145
+ """ 获取position_ids
146
+
147
+ Args:
148
+ text_len (int): input length
149
+ ent_ranges (List[List[int, int]]): each entity ranges idx
150
+ rel_ranges (List[List[int, int]]): each relation ranges idx.
151
+
152
+ Returns:
153
+ np.ndarray: position_ids
154
+ """
155
+ # 一切从0开始算position,@liuhan
156
+ text_pos_ids = list(range(text_len))
157
+
158
+ ent_pos_ids, rel_pos_ids = [], []
159
+ for s, e in ent_ranges:
160
+ ent_pos_ids.extend(list(range(e - s)))
161
+ for s, e in rel_ranges:
162
+ rel_pos_ids.extend(list(range(e - s)))
163
+ position_ids = text_pos_ids + ent_pos_ids + rel_pos_ids
164
+
165
+ return position_ids
166
+
167
+ @staticmethod
168
+ def get_att_mask(input_len: int,
169
+ ent_ranges: List,
170
+ rel_ranges: List= None,
171
+ choice_ent: List[str] = None,
172
+ choice_rel: List[str] = None,
173
+ entity2rel: dict = None,
174
+ full_attent: bool = False) -> np.ndarray:
175
+ """ 获取att_mask,不同choice之间的attention_mask置零
176
+
177
+ Args:
178
+ input_len (int): input length
179
+ ent_ranges (List[List[int, int]]): each entity ranges idx
180
+ rel_ranges (List[List[int, int]]): each relation ranges idx. Defaults to None.
181
+ choice_ent (List[str], optional): choice entity. Defaults to None.
182
+ choice_rel (List[str], optional): choice relation. Defaults to None.
183
+ entity2rel (dict, optional): entity to relations. Defaults to None.
184
+ full_attent (bool, optional): is full attention or not. Defaults to None.
185
+ Returns:
186
+ np.ndarray: attention mask
187
+ """
188
+
189
+ # attention_mask.shape = (input_len, input_len)
190
+ attention_mask = np.ones((input_len, input_len))
191
+ if full_attent and not rel_ranges: # full-attention且没有关系情况下,返回全1
192
+ return attention_mask
193
+
194
+ # input_ids: [CLS] text [SEP] [unused1] ent1 [unused2] rel1 [unused3] event1
195
+ text_len = ent_ranges[0][0] # text长度
196
+ # 将text-实体之间的attention置零,text看不到实体,不受传入的entity个数、顺序影响 @liuhan
197
+ attention_mask[:text_len, text_len:] = 0
198
+
199
+ # 将实体-实体、实体关系之间的attention_mask置零
200
+ attention_mask[text_len:, text_len: ] = 0
201
+
202
+ # 将每个实体与自己的attention_mask置一
203
+ for s, e in ent_ranges:
204
+ attention_mask[s: e, s: e] = 1
205
+
206
+ # 没有关系的话,直接返回
207
+ if not rel_ranges:
208
+ return attention_mask
209
+
210
+ # 处理有关系情况
211
+
212
+ # 关系自身attention_mask置1
213
+ for s, e in rel_ranges:
214
+ attention_mask[s: e, s: e] = 1
215
+
216
+ # 将有关联的实体-关系置一
217
+ for head_tail, relations in entity2rel.items():
218
+ for entity_type in head_tail:
219
+ ent_idx = choice_ent.index(entity_type)
220
+ ent_s, _ = ent_ranges[ent_idx] # ent_s, ent_e
221
+ for relation_type in relations:
222
+ rel_idx = choice_rel.index(relation_type)
223
+ rel_s, rel_e = rel_ranges[rel_idx]
224
+ attention_mask[rel_s: rel_e, ent_s] = 1 # 关系只看实体第一个的[unused1]
225
+
226
+ if full_attent: # full-attention且有关系情况下,让文本能看见关系
227
+ for s, e in rel_ranges:
228
+ attention_mask[: text_len, s: e] = 1
229
+
230
+ return attention_mask
231
+
232
+ def encode(self,
233
+ text: str,
234
+ task_name: str,
235
+ choice: List[str],
236
+ entity_list: List[dict],
237
+ spo_list: List[dict],
238
+ full_attent: bool = False,
239
+ with_label: bool = True) -> Dict[str, torch.Tensor]:
240
+ """ encode
241
+
242
+ Args:
243
+ text (str): text
244
+ task_name (str): task name
245
+ choice (List[str]): choice
246
+ entity_list (List[dict]): entity list
247
+ spo_list (List[dict]): spo list
248
+ full_attent (bool): full attention
249
+ with_label (bool): encoded with label. Defaults to True.
250
+
251
+ Returns:
252
+ Dict[str, torch.Tensor]: encoded
253
+ """
254
+ choice_ent, choice_rel, entity2rel = choice, [], {}
255
+ if isinstance(choice, list):
256
+ if isinstance(choice[0], list): # 关系抽取 & 实体识别
257
+ choice_ent, choice_rel, _, _, entity2rel = get_choice(choice)
258
+ elif isinstance(choice, dict):
259
+ # 事件类型
260
+ raise ValueError('event extract not supported now!')
261
+ else:
262
+ raise NotImplementedError
263
+
264
+ input_ids = []
265
+ text_ids = [] # text部分id
266
+ ent_ids = [] # entity部分id
267
+ rel_ids = [] # relation部分id
268
+ entity_labels_idx = []
269
+ relation_labels_idx = []
270
+
271
+ sep_ids = self.tokenizer.encode("[SEP]", add_special_tokens=False) # [SEP]的编码
272
+ cls_ids = self.tokenizer.encode("[CLS]", add_special_tokens=False) # [CLS]的编码
273
+ entity_op_ids = self.tokenizer.encode("[unused1]", add_special_tokens=False) # [unused1]的编码
274
+ relation_op_ids = self.tokenizer.encode("[unused2]", add_special_tokens=False) # [unused2]的编码
275
+
276
+ # 任务名称的编码
277
+ task_ids = self.tokenizer.encode(task_name, add_special_tokens=False)
278
+
279
+ # 实体标签的编码
280
+ for c in choice_ent:
281
+ c_ids = self.tokenizer.encode(c, add_special_tokens=False)[: self.max_length]
282
+ ent_ids += entity_op_ids + c_ids
283
+
284
+ # 关系标签的编码
285
+ for c in choice_rel:
286
+ c_ids = self.tokenizer.encode(c, add_special_tokens=False)[: self.max_length]
287
+ rel_ids += relation_op_ids + c_ids
288
+
289
+ # text的编码
290
+ entity_indices = get_entity_indices(entity_list, spo_list)
291
+ text_max_len = self.max_length - len(task_ids) - 3
292
+ text_ids, offset_mapping = entity_based_tokenize(text, self.tokenizer, entity_indices,
293
+ max_len=text_max_len,
294
+ return_offsets_mapping=True)
295
+ text_ids = cls_ids + text_ids + sep_ids
296
+
297
+ input_ids = text_ids + task_ids + sep_ids + ent_ids + rel_ids
298
+
299
+ token_type_ids = [0] * len(text_ids) + [0] * (len(task_ids) + 1) + \
300
+ [1] * len(ent_ids) + [1] * len(rel_ids)
301
+
302
+ entity_labels_idx = [i for i, id_ in enumerate(input_ids) if id_ == entity_op_ids[0]]
303
+ relation_labels_idx = [i for i, id_ in enumerate(input_ids) if id_ == relation_op_ids[0]]
304
+
305
+ ent_ranges = [] # 每个实体的起始范围
306
+ for i in range(len(entity_labels_idx) - 1):
307
+ ent_ranges.append([entity_labels_idx[i], entity_labels_idx[i + 1]])
308
+ if not relation_labels_idx:
309
+ ent_ranges.append([entity_labels_idx[-1], len(input_ids)])
310
+ else:
311
+ ent_ranges.append([entity_labels_idx[-1], relation_labels_idx[0]])
312
+ assert len(ent_ranges) == len(choice_ent)
313
+
314
+ rel_ranges = [] # 每个关系的起始范围
315
+ for i in range(len(relation_labels_idx) - 1):
316
+ rel_ranges.append([relation_labels_idx[i], relation_labels_idx[i + 1]])
317
+ if relation_labels_idx:
318
+ rel_ranges.append([relation_labels_idx[-1], len(input_ids)])
319
+ assert len(rel_ranges) == len(choice_rel)
320
+
321
+ # 所有unused的位置
322
+ label_token_idx = entity_labels_idx + relation_labels_idx
323
+ task_num_labels = len(label_token_idx)
324
+ input_len = len(input_ids)
325
+ text_len = len(text_ids)
326
+
327
+ # 计算mask
328
+ attention_mask = self.get_att_mask(input_len,
329
+ ent_ranges,
330
+ rel_ranges,
331
+ choice_ent,
332
+ choice_rel,
333
+ entity2rel,
334
+ full_attent)
335
+ # 计算label-mask
336
+ label_mask = np.ones((text_len, text_len, task_num_labels))
337
+ for i in range(text_len):
338
+ for j in range(text_len):
339
+ if j < i:
340
+ for l in range(len(entity_labels_idx)):
341
+ # entity部分的下三角可mask
342
+ label_mask[i, j, l] = 0
343
+
344
+ # 计算position_ids
345
+ position_ids = self.get_position_ids(len(text_ids) + len(task_ids) + 1,
346
+ ent_ranges,
347
+ rel_ranges)
348
+
349
+ assert len(input_ids) == len(position_ids) == len(token_type_ids)
350
+
351
+ if not with_label:
352
+ return {
353
+ "input_ids": torch.tensor(input_ids).long(),
354
+ "attention_mask": torch.tensor(attention_mask).float(),
355
+ "position_ids": torch.tensor(position_ids).long(),
356
+ "token_type_ids": torch.tensor(token_type_ids).long(),
357
+ "label_token_idx": torch.tensor(label_token_idx).long(),
358
+ "label_mask": torch.tensor(label_mask).float(),
359
+ "text_len": torch.tensor(text_len).long(),
360
+ "ent_ranges": ent_ranges,
361
+ "rel_ranges": rel_ranges,
362
+ }
363
+
364
+ # 输入的span_labels,只保留text部分
365
+ span_labels = np.zeros((text_len, text_len, task_num_labels))
366
+
367
+ # 将实体转成span
368
+ for entity in entity_list:
369
+
370
+ entity_type = entity["entity_type"]
371
+ entity_index = entity["entity_index"]
372
+
373
+ start_idx, end_idx = self.search_index(entity_index, offset_mapping, 1)
374
+
375
+ if start_idx < text_len and end_idx < text_len:
376
+ ent_label = choice_ent.index(entity_type)
377
+ span_labels[start_idx, end_idx, ent_label] = 1
378
+
379
+ # 将三元组转成span
380
+ for spo in spo_list:
381
+
382
+ sub_idx = spo["subject"]["entity_index"]
383
+ obj_idx = spo["object"]["entity_index"]
384
+
385
+ # 获取头实体、尾实体的开始、结束index
386
+ sub_start_idx, sub_end_idx = self.search_index(sub_idx, offset_mapping, 1)
387
+ obj_start_idx, obj_end_idx = self.search_index(obj_idx, offset_mapping, 1)
388
+ # 实体label置1
389
+ if sub_start_idx < text_len and sub_end_idx < text_len:
390
+ sub_label = choice_ent.index(spo["subject"]["entity_type"])
391
+ span_labels[sub_start_idx, sub_end_idx, sub_label] = 1
392
+
393
+ if obj_start_idx < text_len and obj_end_idx < text_len:
394
+ obj_label = choice_ent.index(spo["object"]["entity_type"])
395
+ span_labels[obj_start_idx, obj_end_idx, obj_label] = 1
396
+
397
+ # 有关系的sub/obj实体的start/end在realtion对应的label置1
398
+ if spo["predicate"] in choice_rel:
399
+ pre_label = choice_rel.index(spo["predicate"]) + len(choice_ent)
400
+ if sub_start_idx < text_len and obj_start_idx < text_len:
401
+ span_labels[sub_start_idx, obj_start_idx, pre_label] = 1
402
+ if sub_end_idx < text_len and obj_end_idx < text_len:
403
+ span_labels[sub_end_idx, obj_end_idx, pre_label] = 1
404
+
405
+ return {
406
+ "input_ids": torch.tensor(input_ids).long(),
407
+ "attention_mask": torch.tensor(attention_mask).float(),
408
+ "position_ids": torch.tensor(position_ids).long(),
409
+ "token_type_ids": torch.tensor(token_type_ids).long(),
410
+ "label_token_idx": torch.tensor(label_token_idx).long(),
411
+ "span_labels": torch.tensor(span_labels).float(),
412
+ "label_mask": torch.tensor(label_mask).float(),
413
+ "text_len": torch.tensor(text_len).long(),
414
+ "ent_ranges": ent_ranges,
415
+ "rel_ranges": rel_ranges,
416
+ }
417
+
418
+ def encode_item(self, item: dict, with_label: bool = True) -> Dict[str, torch.Tensor]: # pylint: disable=unused-argument
419
+ """ encode
420
+
421
+ Args:
422
+ item (dict): item
423
+ with_label (bool): encoded with label. Defaults to True.
424
+
425
+ Returns:
426
+ Dict[str, torch.Tensor]: encoded
427
+ """
428
+ return self.encode(text=item["text"],
429
+ task_name=item["task"],
430
+ choice=item["choice"],
431
+ entity_list=item.get("entity_list", []),
432
+ spo_list=item.get("spo_list", []),
433
+ full_attent=item.get('full_attent', False),
434
+ with_label=with_label)
435
+
436
+ @staticmethod
437
+ def collate(batch: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
438
+ """
439
+ Aggregate a batch data.
440
+ batch = [ins1_dict, ins2_dict, ..., insN_dict]
441
+ batch_data = {"sentence":[ins1_sentence, ins2_sentence...],
442
+ "input_ids":[ins1_input_ids, ins2_input_ids...], ...}
443
+ """
444
+ input_ids = nn.utils.rnn.pad_sequence(
445
+ sequences=[encoded["input_ids"] for encoded in batch],
446
+ batch_first=True,
447
+ padding_value=0)
448
+
449
+ label_token_idx = nn.utils.rnn.pad_sequence(
450
+ sequences=[encoded["label_token_idx"] for encoded in batch],
451
+ batch_first=True,
452
+ padding_value=0)
453
+
454
+ token_type_ids = nn.utils.rnn.pad_sequence(
455
+ sequences=[encoded["token_type_ids"] for encoded in batch],
456
+ batch_first=True,
457
+ padding_value=0)
458
+
459
+ position_ids = nn.utils.rnn.pad_sequence(
460
+ sequences=[encoded["position_ids"] for encoded in batch],
461
+ batch_first=True,
462
+ padding_value=0)
463
+
464
+ text_len = torch.tensor([encoded["text_len"] for encoded in batch]).long()
465
+ max_text_len = text_len.max()
466
+
467
+ batch_size, batch_max_length = input_ids.shape
468
+ _, batch_max_labels = label_token_idx.shape
469
+
470
+ attention_mask = torch.zeros((batch_size, batch_max_length, batch_max_length))
471
+ label_mask = torch.zeros((batch_size,
472
+ batch_max_length,
473
+ batch_max_length,
474
+ batch_max_labels))
475
+ for i, encoded in enumerate(batch):
476
+ input_len = encoded["attention_mask"].shape[0]
477
+ attention_mask[i, :input_len, :input_len] = encoded["attention_mask"]
478
+ _, cur_text_len, label_len = encoded['label_mask'].shape
479
+ label_mask[i, :cur_text_len, :cur_text_len, :label_len] = encoded['label_mask']
480
+ label_mask = label_mask[:, :max_text_len, :max_text_len, :]
481
+
482
+ batch_data = {
483
+ "input_ids": input_ids,
484
+ "attention_mask": attention_mask,
485
+ "position_ids": position_ids,
486
+ "token_type_ids": token_type_ids,
487
+ "label_token_idx": label_token_idx,
488
+ "label_mask": label_mask,
489
+ 'text_len': text_len
490
+ }
491
+
492
+ if "span_labels" in batch[0].keys():
493
+ span_labels = torch.zeros((batch_size,
494
+ batch_max_length,
495
+ batch_max_length,
496
+ batch_max_labels))
497
+ for i, encoded in enumerate(batch):
498
+ input_len, _, sample_num_labels = encoded["span_labels"].shape
499
+ span_labels[i, :input_len, :input_len, :sample_num_labels] = encoded["span_labels"]
500
+ batch_data["span_labels"] = span_labels[:, :max_text_len, :max_text_len, :]
501
+
502
+ return batch_data
503
+
504
+ @staticmethod
505
+ def collate_expand(batch: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
506
+ """
507
+ Aggregate a batch data and expand to full attention
508
+ batch = [ins1_dict, ins2_dict, ..., insN_dict]
509
+ batch_data = {"sentence":[ins1_sentence, ins2_sentence...],
510
+ "input_ids":[ins1_input_ids, ins2_input_ids...], ...}
511
+ """
512
+ mask_atten_batch = ItemEncoder.collate(batch)
513
+ full_atten_batch = ItemEncoder.collate(batch)
514
+ # 对full_atten_batch进行改造
515
+ atten_mask = full_atten_batch['attention_mask']
516
+ b, _, _ = atten_mask.size()
517
+ for i in range(b):
518
+ ent_ranges, rel_ranges = batch[i]['ent_ranges'], batch[i]['rel_ranges']
519
+ text_len = ent_ranges[0][0] # text长度
520
+
521
+ if not rel_ranges:
522
+ assert len(ent_ranges) == 1, 'ent_ranges:%s' % ent_ranges
523
+ s, e = ent_ranges[0]
524
+ atten_mask[i, : text_len, s: e] = 1
525
+ else:
526
+ assert len(rel_ranges) == 1 and len(ent_ranges) <= 2, \
527
+ 'ent_ranges:%s, rel_ranges:%s' % (ent_ranges, rel_ranges)
528
+ s, e = rel_ranges[0]
529
+ atten_mask[i, : text_len, s: e] = 1
530
+ full_atten_batch['attention_mask'] = atten_mask
531
+ collate_batch = {}
532
+ for key, value in mask_atten_batch.items():
533
+ collate_batch[key] = torch.cat((value, full_atten_batch[key]), 0)
534
+ return collate_batch
models/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .model import BagualuIEModel
+ from .extract_model import BagualuIEExtractModel
models/extract_model.py ADDED
@@ -0,0 +1,71 @@
+ # coding=utf-8
+ # Copyright 2021 The IDEA Authors. All rights reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import List
+ import copy
+
+ from transformers import PreTrainedTokenizer
+ import argparse
+ from dataloaders.item_encoder import ItemEncoder
+ from dataloaders.item_decoder import ItemDecoder
+ from .model import BagualuIEModel
+
+
+ class BagualuIEExtractModel(object):
+     """ BagualuIEExtractModel
+
+     Args:
+         tokenizer (PreTrainedTokenizer): tokenizer
+         args (argparse.Namespace): arguments
+     """
+     def __init__(self,
+                  tokenizer: PreTrainedTokenizer,
+                  args: argparse.Namespace) -> None:
+         self.encoder = ItemEncoder(tokenizer, args.max_length)
+         self.decoder = ItemDecoder(tokenizer, args)
+
+     def extract(self, batch_data: List[dict], model: BagualuIEModel, use_cuda: bool) -> List[dict]:
+         """ extract
+
+         Args:
+             batch_data (List[dict]): batch of data
+             model (BagualuIEModel): model
+
+         Returns:
+             List[dict]: batch of data
+         """
+         if use_cuda:
+             model = model.cuda()
+         model.eval()
+
+         batch_data = copy.deepcopy(batch_data)
+         batch = [self.encoder.encode_item(item, with_label=False) for item in batch_data]
+         batch = self.encoder.collate(batch)
+         if use_cuda:
+             batch = {k: v.cuda() for k, v in batch.items()}
+
+         span_logits = model(**batch).cpu().detach().numpy()
+         label_mask = batch["label_mask"].cpu().detach().numpy()
+
+         for i, item in enumerate(batch_data):
+
+             entity_list, spo_list = self.decoder.decode(item,
+                                                         span_logits[i],
+                                                         label_mask[i])
+
+             item["spo_list"] = spo_list
+             item["entity_list"] = entity_list
+
+         return batch_data
models/model.py ADDED
@@ -0,0 +1,156 @@
+ # coding=utf-8
+ # Copyright 2021 The IDEA Authors. All rights reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # pylint: disable=no-member
+
+ import torch
+ from torch import nn, Tensor
+ from transformers import BertPreTrainedModel, BertModel, BertConfig
+
+
+ class Triaffine(nn.Module):
+     """ Triaffine module
+
+     Args:
+         triaffine_hidden_size (int): Triaffine module hidden size
+     """
+     def __init__(self, triaffine_hidden_size: int) -> None:
+         super().__init__()
+
+         self.triaffine_hidden_size = triaffine_hidden_size
+
+         self.weight_start_end = nn.Parameter(
+             torch.zeros(triaffine_hidden_size,
+                         triaffine_hidden_size,
+                         triaffine_hidden_size))
+
+         nn.init.normal_(self.weight_start_end, mean=0, std=0.1)
+
+     def forward(self,
+                 start_logits: Tensor,
+                 end_logits: Tensor,
+                 cls_logits: Tensor) -> Tensor:
+         """forward
+
+         Args:
+             start_logits (Tensor): start logits
+             end_logits (Tensor): end logits
+             cls_logits (Tensor): cls logits
+
+         Returns:
+             Tensor: span_logits
+         """
+         start_end_logits = torch.einsum("bxi,ioj,byj->bxyo",
+                                         start_logits,
+                                         self.weight_start_end,
+                                         end_logits)
+
+         span_logits = torch.einsum("bxyo,bzo->bxyz",
+                                    start_end_logits,
+                                    cls_logits)
+
+         return span_logits
+
+
+ class MLPLayer(nn.Module):
+     """MLP layer
+
+     Args:
+         input_size (int): input size
+         output_size (int): output size
+     """
+     def __init__(self, input_size: int, output_size: int) -> None:
+         super().__init__()
+         self.linear = nn.Linear(in_features=input_size, out_features=output_size)
+         self.act = nn.GELU()
+
+     def forward(self, x: Tensor) -> Tensor:  # pylint: disable=invalid-name
+         """ forward
+
+         Args:
+             x (Tensor): input
+
+         Returns:
+             Tensor: output
+         """
+         x = self.linear(x)
+         x = self.act(x)
+         return x
+
+
+ class BagualuIEModel(BertPreTrainedModel):
+     """ BagualuIEModel
+
+     Args:
+         config (BertConfig): config
+     """
+     def __init__(self, config: BertConfig) -> None:
+         super().__init__(config)
+         self.bert = BertModel(config)
+         self.config = config
+
+         self.triaffine_hidden_size = 128
+
+         self.mlp_start = MLPLayer(self.config.hidden_size,
+                                   self.triaffine_hidden_size)
+         self.mlp_end = MLPLayer(self.config.hidden_size,
+                                 self.triaffine_hidden_size)
+         self.mlp_cls = MLPLayer(self.config.hidden_size,
+                                 self.triaffine_hidden_size)
+
+         self.triaffine = Triaffine(self.triaffine_hidden_size)
+
+     def forward(self,  # pylint: disable=unused-argument
+                 input_ids: Tensor,
+                 attention_mask: Tensor,
+                 position_ids: Tensor,
+                 token_type_ids: Tensor,
+                 text_len: Tensor,
+                 label_token_idx: Tensor,
+                 **kwargs) -> Tensor:
+         """ forward
+
+         Args:
+             input_ids (Tensor): input_ids
+             attention_mask (Tensor): attention_mask
+             position_ids (Tensor): position_ids
+             token_type_ids (Tensor): token_type_ids
+             text_len (Tensor): text length
+             label_token_idx (Tensor): label_token_idx
+
+         Returns:
+             Tensor: span logits
+         """
+
+         # bert forward
+         hidden_states = self.bert(input_ids=input_ids,
+                                   attention_mask=attention_mask,
+                                   position_ids=position_ids,
+                                   token_type_ids=token_type_ids,
+                                   output_hidden_states=True)[0]  # (bsz, seq, dim)
+
+         max_text_len = text_len.max()
+
+         # gather the start/end (text part) and cls (label token) hidden states
+         hidden_start_end = hidden_states[:, :max_text_len, :]  # text-part representation
+         hidden_cls = hidden_states.gather(1, label_token_idx.unsqueeze(-1)
+                                           .repeat(1, 1, self.config.hidden_size))  # (bsz, task, dim)
+
+         # Triaffine
+         span_logits = self.triaffine(self.mlp_start(hidden_start_end),
+                                      self.mlp_end(hidden_start_end),
+                                      self.mlp_cls(hidden_cls)).sigmoid()
+
+         return span_logits
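To make the tensor bookkeeping concrete, here is a small standalone shape check of the Triaffine scorer above (not part of the commit; the toy batch size, text length, label count and the single shared MLP are illustrative, whereas BagualuIEModel uses three separate start/end/cls MLPs):

# Standalone shape check for the Triaffine span scorer.
import torch
from models.model import MLPLayer, Triaffine

bsz, text_len, num_labels, dim = 2, 16, 5, 768
mlp = MLPLayer(dim, 128)   # one MLP is enough for a shape check
tri = Triaffine(128)

text_hidden = torch.randn(bsz, text_len, dim)     # start/end representations (text tokens)
label_hidden = torch.randn(bsz, num_labels, dim)  # one vector per [unused*] label token

span_logits = tri(mlp(text_hidden), mlp(text_hidden), mlp(label_hidden)).sigmoid()
print(span_logits.shape)  # torch.Size([2, 16, 16, 5]) -> (bsz, start, end, label)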