Spaces:
Sleeping
Sleeping
DeepLearning101
committed on
Commit
•
08f4077
1
Parent(s):
62c36b1
第一次測試佈署更新
Browse files- app.py +258 -0
- applications/information_extraction/HugIE/api_test.py +234 -0
- models/__init__.py +292 -0
- requirements.txt +11 -0
- wjn1996-hugnlp-hugie-large-zh/config.json +38 -0
- wjn1996-hugnlp-hugie-large-zh/gitattributes.txt +34 -0
- wjn1996-hugnlp-hugie-large-zh/special_tokens_map.json +7 -0
- wjn1996-hugnlp-hugie-large-zh/tokenizer.json +0 -0
- wjn1996-hugnlp-hugie-large-zh/tokenizer_config.json +14 -0
- wjn1996-hugnlp-hugie-large-zh/vocab.txt +0 -0
app.py
ADDED
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# @Time : 2023/05/30
|
3 |
+
# @Author : TonTon H.-D. Huang Ph.D.
|
4 |
+
# @Web :http://TWMAN.ORG
|
5 |
+
# @EMail :TonTon@TWMAN.ORG
|
6 |
+
# @File : HugIE.py
|
7 |
+
# @Description :毋需重新訓練的醫療診斷書醫囑文字分析
|
8 |
+
|
9 |
+
import gradio as gr
|
10 |
+
import json, re
|
11 |
+
from applications.information_extraction.HugIE.api_test import HugIEAPI
|
12 |
+
from dateutil import parser
|
13 |
+
from datetime import datetime
|
14 |
+
|
15 |
+
model_type = "bert"
|
16 |
+
hugie_model_name_or_path = "./wjn1996-hugnlp-hugie-large-zh/" #如果不能連網,請自行下載並設定路徑
|
17 |
+
hugie = HugIEAPI(model_type, hugie_model_name_or_path)
|
18 |
+
|
19 |
+
def convert_to_ROC_date(date):  # converts the year/month/day portion only
    """Normalise the first date found in *date* to a compact ROC string.

    Accepts both ROC (3-digit year, e.g. ``112年02月13日``) and Western
    (4-digit year, e.g. ``2023-05-02``) dates with ``-``, ``:``, ``:``,
    ``/``, ``.`` or CJK separators.  Western years are converted to the
    ROC calendar by subtracting 1911.

    Returns the date as ``yyymmdd`` (zero-padded, 7 chars) when a date is
    found, otherwise returns *date* unchanged.
    """
    # Capture year/month/day directly; same separator classes as before.
    date_regex = r'(\d{3,4})[-::/.年](\d{1,2})[-::/.月](\d{1,2})[日]?'

    date_match = re.search(date_regex, date)
    if not date_match:
        return date

    year = int(date_match.group(1))
    if year > 1911:
        # 4-digit Western year (2023, 1999, ...) -> ROC calendar.
        year -= 1911
    month = int(date_match.group(2))
    day = int(date_match.group(3))

    return f"{year:03d}{month:02d}{day:02d}"
40 |
+
|
41 |
+
def convert_to_ROC_time(time):
    """Normalise the first time-of-day found in *time* to ``HHMMSS``.

    Only the relation2 fields (急診 start/end times) carry a time of day,
    so this handles ``H時M分``, ``H時M分S秒`` and the equivalent
    ``-``/``:``/``:``/``/``/``.``-separated forms.  Seconds default to 0
    when absent.

    Returns *time* unchanged when no time pattern is found (previously
    this fell through and returned ``None``, which then leaked into the
    JSON output).
    """
    # Capture hour/minute/optional-second directly instead of re-parsing
    # the matched text with strptime, which crashed (uncaught ValueError)
    # on matches ending in a bare separator such as "12:15:".
    time_regex = r'(\d{1,2})[-::/.時](\d{1,2})[-::/.分](?:(\d{1,2})[秒])?'

    time_match = re.search(time_regex, time)
    if not time_match:
        return time

    hour = int(time_match.group(1))
    minute = int(time_match.group(2))
    second = int(time_match.group(3)) if time_match.group(3) else 0

    return f"{hour:02d}{minute:02d}{second:02d}"
55 |
+
|
56 |
+
def extract_information(text):
    """Run HugIE extraction over a medical-certificate narrative.

    For every event type (admission, ICU, surgery, ...) whose trigger
    keyword occurs in *text*, query the global ``hugie`` model for the
    configured relation(s), normalise extracted dates/times to compact
    ROC strings, merge results that share an entity label, and return
    the final list serialised as a JSON string.
    """
    # Trigger keywords per event type; extend freely — no retraining needed.
    keywords = {
        'Hospital1': ['入院', '住入本院', '普通病房', '住院', '轉入一般病房', '入住本院'],  # admission, regular ward
        'Hospital2': ['出院', '離院'],  # discharge, regular ward
        'Burn1': ['燒燙傷'],  # burn-unit ward
        'Burn2': ['燒燙傷'],  # burn-unit ward
        'ICU1': ['加護病房', '住院加護病房'],
        'ICU2': ['轉普通病房', '轉入普通病房', '轉至普通病房', '轉回一般病房', '轉至兒科一般病房'],
        'exclude_Clinic': ['門診追蹤', '門診複查', '門診持續追蹤', '急診求治', '繼續追蹤', '急診就診'],
        'Clinic': ['牙科', '來院門診', '門診就診', '看診', '回診', '門診回診', '婦科就診', '門診治療', '來院就診', '本院診療', "本院門診", "經門診", "門診就醫", "由門診", "接受門診", "至診就診", "至門診複診"],
        'Operation1': ['手術', '切除術', '置放術', '切片術', '幹細胞'],
        'Operation2': ['左側乳房部分切除併前哨淋巴清除手術', '手術', '切除術', '置放術', '切片術', '幹細胞'],
        'Emergency1': ['急診'],
        'Emergency2': ['住入加護病房'],
        'Chemotherapy': ['化學治療', '化療', '靜脈注射免疫藥物及標靶藥物治療'],
        'Cancer': ['罹癌'],
        'Radiation': ['放射線', '放射']
    }

    # Output entity label plus the relation question(s) asked per event type.
    relations = {
        'Hospital1': {'entity': '住院A', 'relation1': '開始日期'},
        'Hospital2': {'entity': '住院A', 'relation1': '結束日期'},
        'Burn1': {'entity': '燒燙傷病房B', 'relation1': '開始日期'},
        'Burn2': {'entity': '燒燙傷病房B', 'relation1': '結束日期'},
        'ICU1': {'entity': '加護病房C', 'relation1': '開始日期'},
        'ICU2': {'entity': '加護病房C', 'relation1': '結束日期'},
        'exclude_Clinic': {'entity': None},
        'Clinic': {'entity': '門診D', 'relation1': '日期'},
        'Operation1': {'entity': '手術F', 'relation1': '日期'},
        'Operation2': {'entity': '手術F', 'relation1': '手術項目'},
        'Emergency1': {'entity': '急診G', 'relation1': '開始日期', 'relation2': '開始時間'},
        'Emergency2': {'entity': '急診G', 'relation1': '結束日期', 'relation2': '終止時間'},
        'Chemotherapy': {'entity': '癌症化療H', 'relation1': '起訖日'},
        'Cancer': {'entity': '罹癌I', 'relation1': '起訖日'},
        'Radiation': {'entity': '癌症放射線J', 'relation1': '起訖日'}
    }

    # A:住院、B:燒燙傷、C:加護病房、D:門診、F:手術、G:急診、H:癌症化療、I:罹癌、J:癌症放射線

    # ROC date yyymmdd (3-digit year starting with 1) with an optional
    # hhmm/hhmmss tail.  The original pattern used the invalid quantifier
    # {4-6} and a 5-digit year, so it could never match the 7-char slice
    # it was applied to; this version actually matches converted dates.
    roc_date_pattern = r"1\d{2}(?:0[1-9]|1[0-2])(?:0[1-9]|[1-2]\d|3[01])(?:\d{4,6})?$"

    def _bucket(relations_out, label):
        # Fetch (or lazily create) the prediction bucket for a relation label.
        return relations_out.setdefault(label, {'relation': label, 'predictions': []})

    def _append_start(relations_out, label, value):
        # De-duplicate on the 7-char yyymmdd prefix first, then on the full value.
        bucket = _bucket(relations_out, label)
        if value[:7] not in bucket['predictions']:
            bucket['predictions'].append(value[:7])
        elif value not in bucket['predictions']:
            bucket['predictions'].append(value)

    def _append_date(relations_out, label, value):
        # Keep the 7-char ROC date when the prefix looks like one, else the raw value.
        bucket = _bucket(relations_out, label)
        candidate = value[:7] if re.match(roc_date_pattern, value[:7]) else value
        if candidate not in bucket['predictions']:
            bucket['predictions'].append(candidate)

    def _append_plain(relations_out, label, value):
        # Append the untruncated value (used for surgery names).
        bucket = _bucket(relations_out, label)
        if value not in bucket['predictions']:
            bucket['predictions'].append(value)

    results = []

    for entity, keyword_list in keywords.items():
        output = {
            'entity': relations[entity]['entity'],
            'relations': {}
        }

        for keyword in keyword_list:
            if keyword in keywords['exclude_Clinic']:
                continue  # follow-up phrases that must never trigger extraction

            if keyword not in text or entity not in relations:
                continue

            relation1 = relations[entity].get('relation1')
            relation2 = relations[entity].get('relation2')

            if relation1:
                predictions, topk_predictions = hugie.request(text, keyword, relation=relation1)
                if predictions[0]:  # at least one predicted span
                    for prediction in predictions[0]:
                        date_prediction = convert_to_ROC_date(prediction)

                        if relation1 == '開始日期':
                            _append_start(output['relations'], '受理_起始日', date_prediction)
                        elif relation1 == '結束日期':
                            _append_date(output['relations'], '受理_終止日', date_prediction)
                        elif relation1 in ['起訖日', '日期']:
                            # A single-date relation fills both start and end slots.
                            _append_date(output['relations'], '受理_起始日', date_prediction)
                            _append_date(output['relations'], '受理_終止日', date_prediction)
                        elif relation1 == '手術項目':
                            _append_plain(output['relations'], '手術項目', date_prediction)

            if relation2:
                predictions, topk_predictions = hugie.request(text, keyword, relation=relation2)
                if predictions[0]:  # at least one predicted span
                    for prediction in predictions[0]:
                        time_prediction = convert_to_ROC_time(prediction)

                        # NOTE(review): keyed by the raw relation name (not the
                        # display label), and each prediction overwrites the
                        # previous one — preserved from the original behaviour.
                        if relation2 == '開始時間':
                            output['relations'][relation2] = {
                                'relation': '受理_起始日時分秒',
                                'predictions': [time_prediction]
                            }
                        if relation2 == '終止時間':
                            output['relations'][relation2] = {
                                'relation': '受理_終止日時分秒',
                                'predictions': [time_prediction]
                            }

        # Merge outputs that resolve to the same entity label (e.g. Hospital1/2
        # both map to 住院A); later relations overwrite same-named earlier ones.
        existing_result = next((result for result in results if result['entity'] == output['entity']), None)
        if existing_result is not None:
            existing_result['relations'].update(output['relations'])
        else:
            results.append(output)

    # Drop event types for which nothing was extracted.
    results = [result for result in results if result['relations']]

    return json.dumps(results, indent=4, ensure_ascii=False)
|
227 |
+
|
228 |
+
title = "<p style='text-align: center'><a href='https://www.twman.org/AI/NLP' target='_blank'>醫囑分析:HugIE @ HugNLP</a>"
|
229 |
+
|
230 |
+
description = """
|
231 |
+
<p style='text-align: center'><a href="https://blog.twman.org/2023/07/HugIE.html" target='_blank'>基於機器閱讀理解(MRC)的指令微調(Instruction-tuning)的統一信息抽取框架之診斷書醫囑擷取分析</a></p><br>
|
232 |
+
<p style='text-align: center'><a href="https://github.com/Deep-Learning-101" target='_blank'>https://github.com/Deep-Learning-101</a></p><br>
|
233 |
+
<p style='text-align: center'><a href="https://github.com/Deep-Learning-101/Natural-Language-Processing-Paper" target='_blank'>https://github.com/Deep-Learning-101/Natural-Language-Processing-Paper</a></p><br>
|
234 |
+
"""
|
235 |
+
|
236 |
+
demo = gr.Interface(
|
237 |
+
fn=extract_information,
|
238 |
+
inputs=gr.components.Textbox(label="醫療診斷書之醫囑原始內容"),
|
239 |
+
outputs=gr.components.Textbox(label="醫療診斷書之醫囑擷取結果"),
|
240 |
+
examples = [
|
241 |
+
"患者因上述疾病,曾於112年02月13日12:15~112年02月13日13:43至本院急診治療,於112年02月13日轉灼傷中心普通病房,於112年02月17日接受傷口清創手術治療,於112年02月24日接受左上肢植皮重建手術治療,於112年03月03日轉出灼傷中心病房,於 112年03月09日病情穩定出院,曾於112年03月17日、112年03月21日、112年03月28日、112年04月07��、112年04月18日至本院門診治療,須穿著壓力衣避免疤痕增生,續門診追蹤",
|
242 |
+
"患者因甲狀腺乳突癌術後,依病歷記錄,患者接受王舒儀醫師於2023-03-29,郭仁富醫師於2023-05-02之本院門診追蹤治療,共計2次,並於2023-05-02至2023-05-03住院接受高劑量放射性碘隔離治療,現病況穩定予以出院,共計住院兩日,宜門診繼續追蹤治療。",
|
243 |
+
"1.患者因上述原因於202304-06在本院住院於2023-04-07施行開放性復位及鋼釘鋼板固定手術治療.術後應休養二個月患肢不宜提重物並使用手吊#六星明於2023-04-10計院續日診治蹤治療",
|
244 |
+
"病患曾於108-12-17至本院門診手術室接受右側經皮穿腎引留管換管手術治療,病患曾於108-12-17至本院門診治療",
|
245 |
+
"患者因上述原因曾於108年06月03日,12月06日,在本院門診接受子宮頸抹片追蹤檢查,建議返回長庚醫院後續癌症追蹤。",
|
246 |
+
"病人於民國108年09月14日從門診入院,住普通病房,於民國108年12月06日出院,特此證明。",
|
247 |
+
"該病患因上述疾病於民國108年5月18日至本院急診室就診,經傷口護理及診療後於當天出院,應於門診持續追蹤治療。",
|
248 |
+
"病人因上述症狀,於民國108年12月16日住院,接受自費欣普尼注射治療,並於民國108年12月17日出院,須門診追蹤治療。",
|
249 |
+
"該員於108年10月16日,因上述病情,入院施行治療,期間須使用呼吸器及氣墊床。於108年11月26日出院。",
|
250 |
+
"患肢不宜負重.宜休養3個月.宜使用三角巾固定.患者於民國108年01月23日至108年04月18日共至門診4次",
|
251 |
+
"病人因上述病症,於108年04月07日住入本院,接受支持性照護。108年04月10日出院於狀況穩定下予以出院。已安排後續放射線及化學治療。",
|
252 |
+
"病人因上述病情於108年05月25日入院至加護病房,於108年05月30日轉至普通病房,於108年06月03日出院。",
|
253 |
+
"病患曾於108年09月19日20:32~108年09月20日08:41至本院急診治療,於108年09月20日住院抗生素治療,108年09月26日出院.一週門診追蹤",
|
254 |
+
],
|
255 |
+
title=title,
|
256 |
+
description=description,
|
257 |
+
)
|
258 |
+
demo.launch(debug=True)
|
applications/information_extraction/HugIE/api_test.py
ADDED
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import os
|
3 |
+
sys.path.append("./")
|
4 |
+
sys.path.append("../")
|
5 |
+
sys.path.append("../../")
|
6 |
+
sys.path.append("../../../")
|
7 |
+
from models import SPAN_EXTRACTION_MODEL_CLASSES
|
8 |
+
from models import TOKENIZER_CLASSES
|
9 |
+
import numpy as np
|
10 |
+
import torch
|
11 |
+
|
12 |
+
|
13 |
+
class HugIEAPI:
    """Thin inference wrapper around a HugIE global-pointer span-extraction model.

    Builds an instruction-style query ("find entities of type X" or
    "find relation R of entity E"), tokenizes it, runs the model, and
    decodes the top-k span predictions back into substrings of the query.
    """

    def __init__(self, model_type, hugie_model_name_or_path) -> None:
        # model_type must be one of the registered global-pointer backbones
        # (e.g. "bert"); hugie_model_name_or_path is a HF hub id or local dir.
        if model_type not in SPAN_EXTRACTION_MODEL_CLASSES[
                "global_pointer"].keys():
            raise KeyError(
                "You must choose one of the following model: {}".format(
                    ", ".join(
                        list(SPAN_EXTRACTION_MODEL_CLASSES["global_pointer"].
                             keys()))))
        self.model_type = model_type
        self.model = SPAN_EXTRACTION_MODEL_CLASSES["global_pointer"][
            self.model_type].from_pretrained(hugie_model_name_or_path)
        self.tokenizer = TOKENIZER_CLASSES[self.model_type].from_pretrained(
            hugie_model_name_or_path)
        # Hard cap on tokenized instruction length (also the span-grid side).
        self.max_seq_length = 512

    def fush_multi_answer(self, has_answer, new_answer):
        """Merge a new example's top-k answers into the accumulated ones.

        When one test id produced several examples (e.g. multiple templates),
        their predictions are fused: probabilities of identical answer strings
        are summed and their positions concatenated.

        Both dicts have the shape
        {"ans": {"prob": float, "pos": [(start, end), ...]}, ...};
        *has_answer* is mutated in place and also returned.
        """
        for ans, value in new_answer.items():
            if ans not in has_answer.keys():
                has_answer[ans] = value
            else:
                has_answer[ans]["prob"] += value["prob"]
                has_answer[ans]["pos"].extend(value["pos"])
        return has_answer

    def get_predict_result(self, probs, indices, examples):
        """Decode top-k span scores into answer strings.

        probs/indices: model top-k probabilities and flattened span indices,
        one row per example.  examples carries the raw "content" strings and
        the tokenizer "offset_mapping" used to map token spans back to text.

        Returns (predictions, topk_predictions):
        predictions[idx]  -> list of unique answer strings;
        topk_predictions[idx] -> [{"answer", "prob", "pos"}, ...].
        """
        probs = probs.squeeze(1)    # top-k probabilities, shape [n, m]
        indices = indices.squeeze(1)  # top-k flattened span indices, shape [n, m]
        predictions = {}
        topk_predictions = {}
        idx = 0
        for prob, index in zip(probs, indices):
            index_ids = torch.Tensor([i for i in range(len(index))]).long()
            topk_answer = list()
            answer = []
            topk_answer_dict = dict()
            # TODO: 1. tune the probability threshold  2. handle overlapping spans
            entity_index = index[prob > 0.1]
            index_ids = index_ids[prob > 0.1]
            for ei, entity in enumerate(entity_index):
                # Convert the flattened 1-D span index into a 2-D (start, end)
                # position on the max_seq_length x max_seq_length span grid.
                start_end = np.unravel_index(
                    entity, (self.max_seq_length, self.max_seq_length))
                # Map token positions back to character offsets in the content.
                s = examples["offset_mapping"][idx][start_end[0]][0]
                e = examples["offset_mapping"][idx][start_end[1]][1]
                ans = examples["content"][idx][s:e]
                if ans not in answer:
                    answer.append(ans)
                    # NOTE(review): recorded only for the first occurrence of
                    # each answer string — confirm duplicates should be dropped.
                    topk_answer_dict[ans] = {
                        "prob":
                        float(prob[index_ids[ei]]),
                        "pos": [(s.detach().cpu().numpy().tolist(),
                                 e.detach().cpu().numpy().tolist())]
                    }

            predictions[idx] = answer
            if idx not in topk_predictions.keys():
                topk_predictions[idx] = topk_answer_dict
            else:
                # Same id seen again: fuse with previously accumulated answers.
                topk_predictions[idx] = self.fush_multi_answer(
                    topk_predictions[idx], topk_answer_dict)
            idx += 1

        for idx, values in topk_predictions.items():
            # Flatten {"ans": {...}} into a list of answer records.
            answer_list = list()
            for ans, value in values.items():
                answer_list.append({
                    "answer": ans,
                    "prob": value["prob"],
                    "pos": value["pos"]
                })
            topk_predictions[idx] = answer_list

        return predictions, topk_predictions

    def request(self, text: str, entity_type: str, relation: str = None):
        """Query the model once.

        Without *relation*: named-entity style ("find all entities of type X").
        With *relation*: relation/event style ("find relation R of entity E").
        Returns (predictions, topk_predictions) as produced by
        get_predict_result.
        """
        assert text is not None and entity_type is not None
        if relation is None:
            instruction = "找到文章中所有【{}】类型的实体?文章:【{}】".format(entity_type, text)
            # pre_len: length of the instruction prefix before the article text.
            # NOTE(review): computed but never used below — confirm intent.
            pre_len = 21 - 2 + len(entity_type)
        else:
            instruction = "找到文章中【{}】的【{}】?文章:【{}】".format(
                entity_type, relation, text)
            pre_len = 19 - 4 + len(entity_type) + len(relation)

        inputs = self.tokenizer(instruction,
                                max_length=self.max_seq_length,
                                padding="max_length",
                                return_tensors="pt",
                                return_offsets_mapping=True)

        examples = {
            "content": [instruction],
            "offset_mapping": inputs["offset_mapping"]
        }

        batch_input = {
            "input_ids": inputs["input_ids"],
            "token_type_ids": inputs["token_type_ids"],
            "attention_mask": inputs["attention_mask"],
        }

        outputs = self.model(**batch_input)

        probs, indices = outputs["topk_probs"], outputs["topk_indices"]
        predictions, topk_predictions = self.get_predict_result(
            probs, indices, examples=examples)

        return predictions, topk_predictions
|
133 |
+
|
134 |
+
|
135 |
+
if __name__ == "__main__":
    # Smoke-test script: load the published HugIE checkpoint and run one
    # NER query plus four relation/event-extraction queries on a sample
    # news sentence about the Tajikistan earthquake.
    from applications.information_extraction.HugIE.api_test import HugIEAPI
    model_type = "bert"
    hugie_model_name_or_path = "wjn1996/wjn1996-hugnlp-hugie-large-zh"
    hugie = HugIEAPI("bert", hugie_model_name_or_path)
    text = "央广网北京2月23日消息 据中国地震台网正式测定,2月23日8时37分在塔吉克斯坦发生7.2级地震,震源深度10公里,震中位于北纬37.98度,东经73.29度,距我国边境线最近约82公里,地震造成新疆喀什等地震感强烈。"

    ## named entity recognition
    entity_type = "国家"
    predictions, topk_predictions = hugie.request(text, entity_type)
    print("entity_type:{}".format(entity_type))
    print("predictions:\n{}".format(predictions))
    print("topk_predictions:\n{}".format(topk_predictions))
    print("\n\n")

    ## event extraction: epicentre depth
    entity = "塔吉克斯坦地震"
    relation = "震源深度"
    predictions, topk_predictions = hugie.request(text,
                                                  entity,
                                                  relation=relation)
    print("entity:{}, relation:{}".format(entity, relation))
    print("predictions:\n{}".format(predictions))
    print("topk_predictions:\n{}".format(topk_predictions))
    print("\n\n")

    ## event extraction: epicentre location
    entity = "塔吉克斯坦地震"
    relation = "震源位置"
    predictions, topk_predictions = hugie.request(text,
                                                  entity,
                                                  relation=relation)
    print("entity:{}, relation:{}".format(entity, relation))
    print("predictions:\n{}".format(predictions))
    print("topk_predictions:\n{}".format(topk_predictions))
    print("\n\n")

    ## event extraction: time of the event
    entity = "塔吉克斯坦地震"
    relation = "时间"
    predictions, topk_predictions = hugie.request(text,
                                                  entity,
                                                  relation=relation)
    print("entity:{}, relation:{}".format(entity, relation))
    print("predictions:\n{}".format(predictions))
    print("topk_predictions:\n{}".format(topk_predictions))
    print("\n\n")

    ## event extraction: impact of the event
    entity = "塔吉克斯坦地震"
    relation = "影响"
    predictions, topk_predictions = hugie.request(text,
                                                  entity,
                                                  relation=relation)
    print("entity:{}, relation:{}".format(entity, relation))
    print("predictions:\n{}".format(predictions))
    print("topk_predictions:\n{}".format(topk_predictions))
    print("\n\n")
# Reference transcript of the expected output of the script above.
"""
Output results:

entity_type:国家
predictions:
{0: ["塔吉克斯坦"]}
predictions:
{0: [{"answer": "塔吉克斯坦", "prob": 0.9999997615814209, "pos": [(tensor(57), tensor(62))]}]}



entity:塔吉克斯坦地震, relation:震源深度
predictions:
{0: ["10公里"]}
predictions:
{0: [{"answer": "10公里", "prob": 0.999994158744812, "pos": [(tensor(80), tensor(84))]}]}



entity:塔吉克斯坦地震, relation:震源位置
predictions:
{0: ["10公里", "距我国边境线最近约82公里", "北纬37.98度,东经73.29度", "北纬37.98度,东经73.29度,距我国边境线最近约82公里"]}
predictions:
{0: [{"answer": "10公里", "prob": 0.9895901083946228, "pos": [(tensor(80), tensor(84))]}, {"answer": "距我国边境线最近约82公里", "prob": 0.8584909439086914, "pos": [(tensor(107), tensor(120))]}, {"answer": "北纬37.98度,东经73.29度", "prob": 0.7202121615409851, "pos": [(tensor(89), tensor(106))]}, {"answer": "北纬37.98度,东经73.29度,距我国边境线最近约82公里", "prob": 0.11628123372793198, "pos": [(tensor(89), tensor(120))]}]}



entity:塔吉克斯坦地震, relation:时间
predictions:
{0: ["2月23日8时37分"]}
predictions:
{0: [{"answer": "2月23日8时37分", "prob": 0.9999995231628418, "pos": [(tensor(49), tensor(59))]}]}



entity:塔吉克斯坦地震, relation:影响
predictions:
{0: ["新疆喀什等地震感强烈"]}
predictions:
{0: [{"answer": "新疆喀什等地震感强烈", "prob": 0.9525265693664551, "pos": [(tensor(123), tensor(133))]}]}

"""
|
models/__init__.py
ADDED
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# @Time : 2021/12/6 3:35 下午
|
3 |
+
# @Author : JianingWang
|
4 |
+
# @File : __init__.py
|
5 |
+
|
6 |
+
|
7 |
+
# from models.chid_mlm import BertForChidMLM
|
8 |
+
from models.multiple_choice.duma import BertDUMAForMultipleChoice, AlbertDUMAForMultipleChoice, MegatronDumaForMultipleChoice
|
9 |
+
from models.span_extraction.global_pointer import BertForEffiGlobalPointer, RobertaForEffiGlobalPointer, RoformerForEffiGlobalPointer, MegatronForEffiGlobalPointer
|
10 |
+
from transformers import AutoModelForTokenClassification, AutoModelForSequenceClassification, AutoModelForMaskedLM, AutoModelForMultipleChoice, BertTokenizer, \
|
11 |
+
AutoModelForQuestionAnswering, AutoModelForCausalLM
|
12 |
+
|
13 |
+
from transformers import AutoTokenizer
|
14 |
+
from transformers.models.roformer import RoFormerTokenizer
|
15 |
+
from transformers.models.bert import BertTokenizerFast, BertForTokenClassification, BertTokenizer
|
16 |
+
from transformers.models.roberta.tokenization_roberta import RobertaTokenizer
|
17 |
+
from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
|
18 |
+
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
|
19 |
+
from transformers.models.bart.tokenization_bart import BartTokenizer
|
20 |
+
from transformers.models.t5.tokenization_t5 import T5Tokenizer
|
21 |
+
from transformers.models.plbart.tokenization_plbart import PLBartTokenizer
|
22 |
+
|
23 |
+
|
24 |
+
# from models.deberta import DebertaV2ForMultipleChoice, DebertaForMultipleChoice
|
25 |
+
# from models.fengshen.models.longformer import LongformerForMultipleChoice
|
26 |
+
from models.kg import BertForPretrainWithKG, BertForPretrainWithKGV2
|
27 |
+
from models.language_modeling.mlm import BertForMaskedLM, RobertaForMaskedLM, AlbertForMaskedLM, RoFormerForMaskedLM
|
28 |
+
# from models.sequence_classification.classification import build_cls_model
|
29 |
+
from models.multiple_choice.multiple_choice_tag import BertForTagMultipleChoice, RoFormerForTagMultipleChoice, MegatronBertForTagMultipleChoice
|
30 |
+
from models.multiple_choice.multiple_choice import MegatronBertForMultipleChoice, MegatronBertRDropForMultipleChoice
|
31 |
+
from models.semeval7 import DebertaV2ForSemEval7MultiTask
|
32 |
+
from models.sequence_matching.fusion_siamese import BertForFusionSiamese, BertForWSC
|
33 |
+
# from roformer import RoFormerForTokenClassification, RoFormerForSequenceClassification
|
34 |
+
from models.fewshot_learning.span_proto import SpanProto
|
35 |
+
from models.fewshot_learning.token_proto import TokenProto
|
36 |
+
|
37 |
+
from models.sequence_labeling.head_token_cls import (
|
38 |
+
BertSoftmaxForSequenceLabeling, BertCrfForSequenceLabeling,
|
39 |
+
RobertaSoftmaxForSequenceLabeling, RobertaCrfForSequenceLabeling,
|
40 |
+
AlbertSoftmaxForSequenceLabeling, AlbertCrfForSequenceLabeling,
|
41 |
+
MegatronBertSoftmaxForSequenceLabeling, MegatronBertCrfForSequenceLabeling,
|
42 |
+
)
|
43 |
+
from models.span_extraction.span_for_ner import BertSpanForNer, RobertaSpanForNer, AlbertSpanForNer, MegatronBertSpanForNer
|
44 |
+
|
45 |
+
from models.language_modeling.mlm import BertForMaskedLM
|
46 |
+
from models.language_modeling.kpplm import BertForWikiKGPLM, RoBertaKPPLMForProcessedWikiKGPLM, DeBertaKPPLMForProcessedWikiKGPLM
|
47 |
+
from models.language_modeling.causal_lm import GPT2ForCausalLM
|
48 |
+
|
49 |
+
from models.sequence_classification.head_cls import (
|
50 |
+
BertForSequenceClassification, BertPrefixForSequenceClassification,
|
51 |
+
BertPtuningForSequenceClassification, BertAdapterForSequenceClassification,
|
52 |
+
RobertaForSequenceClassification, RobertaPrefixForSequenceClassification,
|
53 |
+
RobertaPtuningForSequenceClassification,RobertaAdapterForSequenceClassification,
|
54 |
+
BartForSequenceClassification, GPT2ForSequenceClassification
|
55 |
+
)
|
56 |
+
|
57 |
+
from models.sequence_classification.masked_prompt_cls import (
|
58 |
+
PromptBertForSequenceClassification, PromptBertPtuningForSequenceClassification,
|
59 |
+
PromptBertPrefixForSequenceClassification, PromptBertAdapterForSequenceClassification,
|
60 |
+
PromptRobertaForSequenceClassification, PromptRobertaPtuningForSequenceClassification,
|
61 |
+
PromptRobertaPrefixForSequenceClassification, PromptRobertaAdapterForSequenceClassification
|
62 |
+
)
|
63 |
+
|
64 |
+
from models.sequence_classification.causal_prompt_cls import PromptGPT2ForSequenceClassification
|
65 |
+
|
66 |
+
from models.code.code_classification import (
|
67 |
+
RobertaForCodeClassification, CodeBERTForCodeClassification,
|
68 |
+
GraphCodeBERTForCodeClassification, PLBARTForCodeClassification, CodeT5ForCodeClassification
|
69 |
+
)
|
70 |
+
from models.code.code_generation import (
|
71 |
+
PLBARTForCodeGeneration
|
72 |
+
)
|
73 |
+
|
74 |
+
from models.reinforcement_learning.actor import CausalActor
|
75 |
+
from models.reinforcement_learning.critic import AutoModelCritic
|
76 |
+
from models.reinforcement_learning.reward_model import (
|
77 |
+
RobertaForReward, GPT2ForReward
|
78 |
+
)
|
79 |
+
|
80 |
+
# Models for pre-training
|
81 |
+
PRETRAIN_MODEL_CLASSES = {
|
82 |
+
"mlm": {
|
83 |
+
"bert": BertForMaskedLM,
|
84 |
+
"roberta": RobertaForMaskedLM,
|
85 |
+
"albert": AlbertForMaskedLM,
|
86 |
+
"roformer": RoFormerForMaskedLM,
|
87 |
+
},
|
88 |
+
"auto_mlm": AutoModelForMaskedLM,
|
89 |
+
"causal_lm": {
|
90 |
+
"gpt2": GPT2ForCausalLM,
|
91 |
+
"bart": None,
|
92 |
+
"t5": None,
|
93 |
+
"llama": None
|
94 |
+
},
|
95 |
+
"auto_causal_lm": AutoModelForCausalLM
|
96 |
+
}
|
97 |
+
|
98 |
+
|
99 |
+
CLASSIFICATION_MODEL_CLASSES = {
|
100 |
+
"auto_cls": AutoModelForSequenceClassification, # huggingface cls
|
101 |
+
"classification": AutoModelForSequenceClassification, # huggingface cls
|
102 |
+
"head_cls": {
|
103 |
+
"bert": BertForSequenceClassification,
|
104 |
+
"roberta": RobertaForSequenceClassification,
|
105 |
+
"bart": BartForSequenceClassification,
|
106 |
+
"gpt2": GPT2ForSequenceClassification
|
107 |
+
}, # use standard fine-tuning head for cls, e.g., bert+mlp
|
108 |
+
"head_prefix_cls": {
|
109 |
+
"bert": BertPrefixForSequenceClassification,
|
110 |
+
"roberta": RobertaPrefixForSequenceClassification,
|
111 |
+
}, # use standard fine-tuning head with prefix-tuning technique for cls, e.g., bert+mlp
|
112 |
+
"head_ptuning_cls": {
|
113 |
+
"bert": BertPtuningForSequenceClassification,
|
114 |
+
"roberta": RobertaPtuningForSequenceClassification,
|
115 |
+
}, # use standard fine-tuning head with p-tuning technique for cls, e.g., bert+mlp
|
116 |
+
"head_adapter_cls": {
|
117 |
+
"bert": BertAdapterForSequenceClassification,
|
118 |
+
"roberta": RobertaAdapterForSequenceClassification,
|
119 |
+
}, # use standard fine-tuning head with adapter-tuning technique for cls, e.g., bert+mlp
|
120 |
+
"masked_prompt_cls": {
|
121 |
+
"bert": PromptBertForSequenceClassification,
|
122 |
+
"roberta": PromptRobertaForSequenceClassification,
|
123 |
+
# "deberta": PromptDebertaForSequenceClassification,
|
124 |
+
# "deberta-v2": PromptDebertav2ForSequenceClassification,
|
125 |
+
}, # use masked lm head technique for prompt-based cls, e.g., bert+mlm
|
126 |
+
"masked_prompt_prefix_cls": {
|
127 |
+
"bert": PromptBertPrefixForSequenceClassification,
|
128 |
+
"roberta": PromptRobertaPrefixForSequenceClassification,
|
129 |
+
# "deberta": PromptDebertaPrefixForSequenceClassification,
|
130 |
+
# "deberta-v2": PromptDebertav2PrefixForSequenceClassification,
|
131 |
+
}, # use masked lm head with prefix-tuning technique for prompt-based cls, e.g., bert+mlm
|
132 |
+
"masked_prompt_ptuning_cls": {
|
133 |
+
"bert": PromptBertPtuningForSequenceClassification,
|
134 |
+
"roberta": PromptRobertaPtuningForSequenceClassification,
|
135 |
+
# "deberta": PromptDebertaPtuningForSequenceClassification,
|
136 |
+
# "deberta-v2": PromptDebertav2PtuningForSequenceClassification,
|
137 |
+
}, # use masked lm head with p-tuning technique for prompt-based cls, e.g., bert+mlm
|
138 |
+
"masked_prompt_adapter_cls": {
|
139 |
+
"bert": PromptBertAdapterForSequenceClassification,
|
140 |
+
"roberta": PromptRobertaAdapterForSequenceClassification,
|
141 |
+
}, # use masked lm head with adapter-tuning technique for prompt-based cls, e.g., bert+mlm
|
142 |
+
"causal_prompt_cls": {
|
143 |
+
"gpt2": PromptGPT2ForSequenceClassification,
|
144 |
+
"bart": None,
|
145 |
+
"t5": None,
|
146 |
+
}, # use causal lm head for prompt-tuning, e.g., gpt2+lm
|
147 |
+
}
|
148 |
+
|
149 |
+
|
150 |
+
TOKEN_CLASSIFICATION_MODEL_CLASSES = {
|
151 |
+
"auto_token_cls": AutoModelForTokenClassification,
|
152 |
+
"head_softmax_token_cls": {
|
153 |
+
"bert": BertSoftmaxForSequenceLabeling,
|
154 |
+
"roberta": RobertaSoftmaxForSequenceLabeling,
|
155 |
+
"albert": AlbertSoftmaxForSequenceLabeling,
|
156 |
+
"megatron": MegatronBertSoftmaxForSequenceLabeling,
|
157 |
+
},
|
158 |
+
"head_crf_token_cls": {
|
159 |
+
"bert": BertCrfForSequenceLabeling,
|
160 |
+
"roberta": RobertaCrfForSequenceLabeling,
|
161 |
+
"albert": AlbertCrfForSequenceLabeling,
|
162 |
+
"megatron": MegatronBertCrfForSequenceLabeling,
|
163 |
+
}
|
164 |
+
}
|
165 |
+
|
166 |
+
|
167 |
+
SPAN_EXTRACTION_MODEL_CLASSES = {
|
168 |
+
"global_pointer": {
|
169 |
+
"bert": BertForEffiGlobalPointer,
|
170 |
+
"roberta": RobertaForEffiGlobalPointer,
|
171 |
+
"roformer": RoformerForEffiGlobalPointer,
|
172 |
+
"megatronbert": MegatronForEffiGlobalPointer
|
173 |
+
},
|
174 |
+
}
|
175 |
+
|
176 |
+
|
177 |
+
FEWSHOT_MODEL_CLASSES = {
|
178 |
+
"sequence_proto": None,
|
179 |
+
"span_proto": SpanProto,
|
180 |
+
"token_proto": TokenProto,
|
181 |
+
}
|
182 |
+
|
183 |
+
|
184 |
+
CODE_MODEL_CLASSES = {
|
185 |
+
"code_cls": {
|
186 |
+
"roberta": RobertaForCodeClassification,
|
187 |
+
"codebert": CodeBERTForCodeClassification,
|
188 |
+
"graphcodebert": GraphCodeBERTForCodeClassification,
|
189 |
+
"codet5": CodeT5ForCodeClassification,
|
190 |
+
"plbart": PLBARTForCodeClassification,
|
191 |
+
},
|
192 |
+
"code_generation": {
|
193 |
+
# "roberta": RobertaForCodeGeneration,
|
194 |
+
# "codebert": BertForCodeGeneration,
|
195 |
+
# "graphcodebert": BertForCodeGeneration,
|
196 |
+
# "codet5": T5ForCodeGeneration,
|
197 |
+
"plbart": PLBARTForCodeGeneration,
|
198 |
+
},
|
199 |
+
}
|
200 |
+
|
201 |
+
REINFORCEMENT_MODEL_CLASSES = {
|
202 |
+
"causal_actor": CausalActor,
|
203 |
+
"auto_critic": AutoModelCritic,
|
204 |
+
"rl_reward": {
|
205 |
+
"roberta": RobertaForReward,
|
206 |
+
"gpt2": GPT2ForReward,
|
207 |
+
"gpt-neo": None,
|
208 |
+
"opt": None,
|
209 |
+
"llama": None,
|
210 |
+
}
|
211 |
+
}
|
212 |
+
|
213 |
+
# task_type 负责对应model类型
|
214 |
+
OTHER_MODEL_CLASSES = {
|
215 |
+
# sequence labeling
|
216 |
+
"bert_span_ner": BertSpanForNer,
|
217 |
+
"roberta_span_ner": RobertaSpanForNer,
|
218 |
+
"albert_span_ner": AlbertSpanForNer,
|
219 |
+
"megatronbert_span_ner": MegatronBertSpanForNer,
|
220 |
+
# sequence matching
|
221 |
+
"fusion_siamese": BertForFusionSiamese,
|
222 |
+
# multiple choice
|
223 |
+
"multi_choice": AutoModelForMultipleChoice,
|
224 |
+
"multi_choice_megatron": MegatronBertForMultipleChoice,
|
225 |
+
"multi_choice_megatron_rdrop": MegatronBertRDropForMultipleChoice,
|
226 |
+
"megatron_multi_choice_tag": MegatronBertForTagMultipleChoice,
|
227 |
+
"roformer_multi_choice_tag": RoFormerForTagMultipleChoice,
|
228 |
+
"multi_choice_tag": BertForTagMultipleChoice,
|
229 |
+
"duma": BertDUMAForMultipleChoice,
|
230 |
+
"duma_albert": AlbertDUMAForMultipleChoice,
|
231 |
+
"duma_megatron": MegatronDumaForMultipleChoice,
|
232 |
+
# language modeling
|
233 |
+
|
234 |
+
# "bert_mlm_acc": BertForMaskedLMWithACC,
|
235 |
+
# "roformer_mlm_acc": RoFormerForMaskedLMWithACC,
|
236 |
+
"bert_pretrain_kg": BertForPretrainWithKG,
|
237 |
+
"bert_pretrain_kg_v2": BertForPretrainWithKGV2,
|
238 |
+
"kpplm_roberta": RoBertaKPPLMForProcessedWikiKGPLM,
|
239 |
+
"kpplm_deberta": DeBertaKPPLMForProcessedWikiKGPLM,
|
240 |
+
|
241 |
+
# other
|
242 |
+
"clue_wsc": BertForWSC,
|
243 |
+
"semeval7multitask": DebertaV2ForSemEval7MultiTask,
|
244 |
+
# "debertav2_multi_choice": DebertaV2ForMultipleChoice,
|
245 |
+
# "deberta_multi_choice": DebertaForMultipleChoice,
|
246 |
+
# "qa": AutoModelForQuestionAnswering,
|
247 |
+
# "roformer_cls": RoFormerForSequenceClassification,
|
248 |
+
# "roformer_ner": RoFormerForTokenClassification,
|
249 |
+
# "fensheng_multi_choice": LongformerForMultipleChoice,
|
250 |
+
# "chid_mlm": BertForChidMLM,
|
251 |
+
}
|
252 |
+
|
253 |
+
|
254 |
+
# MODEL_CLASSES = dict(list(PRETRAIN_MODEL_CLASSES.items()) + list(OTHER_MODEL_CLASSES.items()))
|
255 |
+
MODEL_CLASSES_LIST = [
|
256 |
+
PRETRAIN_MODEL_CLASSES,
|
257 |
+
CLASSIFICATION_MODEL_CLASSES,
|
258 |
+
TOKEN_CLASSIFICATION_MODEL_CLASSES,
|
259 |
+
SPAN_EXTRACTION_MODEL_CLASSES,
|
260 |
+
FEWSHOT_MODEL_CLASSES,
|
261 |
+
CODE_MODEL_CLASSES,
|
262 |
+
REINFORCEMENT_MODEL_CLASSES,
|
263 |
+
OTHER_MODEL_CLASSES,
|
264 |
+
]
|
265 |
+
|
266 |
+
|
267 |
+
MODEL_CLASSES = dict()
|
268 |
+
for model_class in MODEL_CLASSES_LIST:
|
269 |
+
MODEL_CLASSES = dict(list(MODEL_CLASSES.items()) + list(model_class.items()))
|
270 |
+
|
271 |
+
# model_type 负责对应tokenizer
|
272 |
+
TOKENIZER_CLASSES = {
|
273 |
+
# for natural language processing
|
274 |
+
"auto": AutoTokenizer,
|
275 |
+
"bert": BertTokenizerFast,
|
276 |
+
"roberta": RobertaTokenizer,
|
277 |
+
"wobert": RoFormerTokenizer,
|
278 |
+
"roformer": RoFormerTokenizer,
|
279 |
+
"bigbird": BertTokenizerFast,
|
280 |
+
"erlangshen": BertTokenizerFast,
|
281 |
+
"deberta": BertTokenizer,
|
282 |
+
"roformer_v2": BertTokenizerFast,
|
283 |
+
"gpt2": GPT2Tokenizer,
|
284 |
+
"megatronbert": BertTokenizerFast,
|
285 |
+
"bart": BartTokenizer,
|
286 |
+
"t5": T5Tokenizer,
|
287 |
+
# for programming language processing
|
288 |
+
"codebert": RobertaTokenizer,
|
289 |
+
"graphcodebert": RobertaTokenizer,
|
290 |
+
"codet5": RobertaTokenizer,
|
291 |
+
"plbart": PLBartTokenizer
|
292 |
+
}
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
jieba
|
2 |
+
roformer
|
3 |
+
scikit-learn
|
4 |
+
sentence-transformers
|
5 |
+
sentencepiece
|
6 |
+
torch==1.12.1
|
7 |
+
transformers==4.21.2
|
8 |
+
tqdm
|
9 |
+
ujson
|
10 |
+
gradio==2.3.0
|
11 |
+
gradio_client==0.2.7
|
wjn1996-hugnlp-hugie-large-zh/config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"RoPE": true,
|
3 |
+
"_name_or_path": "/wjn/projects/information_extraction/HugIE/outputs/zh_instruction/chinese-macbert-large/chinese-macbert-large",
|
4 |
+
"architectures": [
|
5 |
+
"BertForEffiGlobalPointer"
|
6 |
+
],
|
7 |
+
"attention_probs_dropout_prob": 0.1,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": null,
|
10 |
+
"directionality": "bidi",
|
11 |
+
"ent_type_size": 1,
|
12 |
+
"eos_token_id": 2,
|
13 |
+
"finetuning_task": "laic",
|
14 |
+
"hidden_act": "gelu",
|
15 |
+
"hidden_dropout_prob": 0.1,
|
16 |
+
"hidden_size": 1024,
|
17 |
+
"initializer_range": 0.02,
|
18 |
+
"inner_dim": 64,
|
19 |
+
"intermediate_size": 4096,
|
20 |
+
"layer_norm_eps": 1e-12,
|
21 |
+
"max_position_embeddings": 512,
|
22 |
+
"model_type": "bert",
|
23 |
+
"num_attention_heads": 16,
|
24 |
+
"num_hidden_layers": 24,
|
25 |
+
"output_past": true,
|
26 |
+
"pad_token_id": 0,
|
27 |
+
"pooler_fc_size": 768,
|
28 |
+
"pooler_num_attention_heads": 12,
|
29 |
+
"pooler_num_fc_layers": 3,
|
30 |
+
"pooler_size_per_head": 128,
|
31 |
+
"pooler_type": "first_token_transform",
|
32 |
+
"position_embedding_type": "absolute",
|
33 |
+
"torch_dtype": "float32",
|
34 |
+
"transformers_version": "4.21.2",
|
35 |
+
"type_vocab_size": 2,
|
36 |
+
"use_cache": true,
|
37 |
+
"vocab_size": 21128
|
38 |
+
}
|
wjn1996-hugnlp-hugie-large-zh/gitattributes.txt
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
wjn1996-hugnlp-hugie-large-zh/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"mask_token": "[MASK]",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"sep_token": "[SEP]",
|
6 |
+
"unk_token": "[UNK]"
|
7 |
+
}
|
wjn1996-hugnlp-hugie-large-zh/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
wjn1996-hugnlp-hugie-large-zh/tokenizer_config.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"do_lower_case": true,
|
4 |
+
"mask_token": "[MASK]",
|
5 |
+
"name_or_path": "/wjn/projects/information_extraction/HugIE/outputs/zh_instruction/chinese-macbert-large/chinese-macbert-large",
|
6 |
+
"pad_token": "[PAD]",
|
7 |
+
"sep_token": "[SEP]",
|
8 |
+
"special_tokens_map_file": "/wjn/pre-trained-lm/chinese-macbert-large/special_tokens_map.json",
|
9 |
+
"strip_accents": null,
|
10 |
+
"tokenize_chinese_chars": true,
|
11 |
+
"tokenizer_class": "BertTokenizer",
|
12 |
+
"unk_token": "[UNK]",
|
13 |
+
"use_fast": true
|
14 |
+
}
|
wjn1996-hugnlp-hugie-large-zh/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|