# -*- coding: utf-8 -*- # @Time : 2023/05/30 # @Author : TonTon H.-D. Huang Ph.D. # @Web :http://TWMAN.ORG # @EMail :TonTon@TWMAN.ORG # @File : HugIE.py # @Description :毋需重新訓練的醫療診斷書醫囑文字分析 import gradio as gr import json, re from applications.information_extraction.HugIE.api_test import HugIEAPI from dateutil import parser from datetime import datetime model_type = "bert" hugie_model_name_or_path = "https://huggingface.co/wjn1996/wjn1996-hugnlp-hugie-large-zh/" #如果不能連網,請自行下載並設定路徑 hugie = HugIEAPI(model_type, hugie_model_name_or_path) def convert_to_ROC_date(date): #只轉換年月日等日期 date_regex = r'(\d{3,4}[-::/.年]\d{1,2}[-::/.月]\d{1,2}[日]?)' time_regex = r'(\d{1,2}[-::/.時]\d{1,2}[-::/.分]\d{1,2}[秒]?)' date_match = re.search(date_regex, date) if date_match: date_part = date_match.group(1) parsed_date = parser.parse(date_part, fuzzy=True) if str(date_part).startswith('20'): ROC_year = int(date_part[:4])- 1911 else: ROC_year = int(date_part[:3]) ROC_month = parsed_date.month ROC_day = parsed_date.day ROC_date = f"{ROC_year:03d}{ROC_month:02d}{ROC_day:02d}" return ROC_date else: return date def convert_to_ROC_time(time): #只處理時間,看 ketword 和 relation 可以發現只有 relation2 才會需要處理時間 time_regex = r'(\d{1,2}[-::/.時]\d{1,2}[-::/.分](?:\d{1,2}[秒])?)' time_match = re.search(time_regex, time) if time_match: time_part = time_match.group(1) try: parsed_time = datetime.strptime(time_part, "%H時%M分%S秒") except ValueError: parsed_time = datetime.strptime(time_part, "%H時%M分") parsed_time = parsed_time.replace(second=0) ROC_time = parsed_time.strftime("%H%M%S") return ROC_time def extract_information(text): keywords = { #視情況自己新增調整,不用重新訓練 'Hospital1': ['入院', '住入本院', '普通病房', '住院', '轉入一般病房', '入住本院'], # 住院相關,普通病房 'Hospital2': ['出院', '離院'], # 出院相關,普通病房 'Burn1': ['燒燙傷'], # 燒燙傷類病房 'Burn2': ['燒燙傷'], # 燒燙傷類病房 'ICU1': ['加護病房', '住院加護病房'], 'ICU2': ['轉普通病房', '轉入普通病房', '轉至普通病房', '轉回一般病房', '轉至兒科一般病房'], 'exclude_Clinic': ['門診追蹤', '門診複查', '門診持續追蹤', '急診求治', '繼續追蹤', '急診就診'], 'Clinic': ['牙科', '來院門診', '門診就診', '看診', '回診', '門診回診', '婦科就診', '門診治療', '來院就診', '本院診療', "本院門診", "經門診", "門診就醫", "由門診", "接受門診", "至診就診", "至門診複診"], 'Operation1': ['手術', '切除術', '置放術', '切片術', '幹細胞'], 'Operation2': ['左側乳房部分切除併前哨淋巴清除手術', '手術', '切除術', '置放術', '切片術', '幹細胞'], 'Emergency1': ['急診'], 'Emergency2': ['住入加護病房'], 'Chemotherapy': ['化學治療', '化療', '靜脈注射免疫藥物及標靶藥物治療'], 'Cancer': ['罹癌'], 'Radiation': ['放射線', '放射'] } relations = { 'Hospital1': {'entity': '住院A', 'relation1': '開始日期'}, 'Hospital2': {'entity': '住院A', 'relation1': '結束日期'}, 'Burn1': {'entity': '燒燙傷病房B', 'relation1': '開始日期'}, 'Burn2': {'entity': '燒燙傷病房B', 'relation1': '結束日期'}, 'ICU1': {'entity': '加護病房C', 'relation1': '開始日期'}, 'ICU2': {'entity': '加護病房C', 'relation1': '結束日期'}, 'exclude_Clinic': {'entity': None}, 'Clinic': {'entity': '門診D', 'relation1': '日期'}, 'Operation1': {'entity': '手術F', 'relation1': '日期'}, 'Operation2': {'entity': '手術F', 'relation1': '手術項目'}, 'Emergency1': {'entity': '急診G', 'relation1': '開始日期', 'relation2': '開始時間'}, 'Emergency2': {'entity': '急診G', 'relation1': '結束日期', 'relation2': '終止時間'}, 'Chemotherapy': {'entity': '癌症化療H', 'relation1': '起訖日'}, 'Cancer': {'entity': '罹癌I', 'relation1': '起訖日'}, 'Radiation': {'entity': '癌症放射線J', 'relation1': '起訖日'} } #A:住院、B:燒燙傷、C:加護病房、D:門診、F:手術、G:急診、H:癌症化療、I:罹癌、J:癌症放射線 results = [] for entity, keyword_list in keywords.items(): output = { 'entity': relations[entity]['entity'], 'relations': {} } for keyword in keyword_list: if keyword in keywords['exclude_Clinic']: continue if keyword in text and entity in relations: entity_relations = relations[entity] relation1 = entity_relations.get('relation1') # 取得關係1 relation2 = entity_relations.get('relation2') # 取得關係2 if relation1: predictions, topk_predictions = hugie.request(text, keyword, relation=relation1) if predictions[0]: # 如果有預測結果 for prediction in predictions[0]: date_prediction = convert_to_ROC_date(prediction) if relation1 == '開始日期': relation_label = '受理_起始日' output['relations'].setdefault(relation_label, { 'relation': relation_label, 'predictions': [] }) if date_prediction[:7] not in output['relations'][relation_label]['predictions']: output['relations'][relation_label]['predictions'].append(date_prediction[:7]) elif date_prediction not in output['relations'][relation_label]['predictions']: output['relations'][relation_label]['predictions'].append(date_prediction) elif relation1 == '結束日期': relation_label = '受理_終止日' output['relations'].setdefault(relation_label, { 'relation': relation_label, 'predictions': [] }) date_pattern = r"1[0-9]\d{3}(?:0[1-9]|1[0-2])(?:0[1-9]|[1-2]\d|3[01])(?:\d{4-6})?$" #抓年月日時分秒,懶得再修了 match = re.match(date_pattern, date_prediction[:7]) if match: if date_prediction[:7] not in output['relations'][relation_label]['predictions']: output['relations'][relation_label]['predictions'].append(date_prediction[:7]) else: if date_prediction not in output['relations'][relation_label]['predictions']: output['relations'][relation_label]['predictions'].append(date_prediction) elif relation1 in ['起訖日', '日期']: relation_label = '受理_起始日' output['relations'].setdefault(relation_label, { 'relation': relation_label, 'predictions': [] }) date_pattern = r"1[0-9]\d{3}(?:0[1-9]|1[0-2])(?:0[1-9]|[1-2]\d|3[01])(?:\d{4-6})?$" #抓年月日時分秒,懶得再修了 match = re.match(date_pattern, date_prediction[:7]) if match: if date_prediction[:7] not in output['relations'][relation_label]['predictions']: output['relations'][relation_label]['predictions'].append(date_prediction[:7]) else: if date_prediction not in output['relations'][relation_label]['predictions']: output['relations'][relation_label]['predictions'].append(date_prediction) relation_label = '受理_終止日' output['relations'].setdefault(relation_label, { 'relation': relation_label, 'predictions': [] }) date_pattern = r"1[0-9]\d{3}(?:0[1-9]|1[0-2])(?:0[1-9]|[1-2]\d|3[01])(?:\d{4-6})?$" #抓年月日時分秒,懶得再修了 match = re.match(date_pattern, date_prediction[:7]) if match: if date_prediction[:7] not in output['relations'][relation_label]['predictions']: output['relations'][relation_label]['predictions'].append(date_prediction[:7]) else: if date_prediction not in output['relations'][relation_label]['predictions']: output['relations'][relation_label]['predictions'].append(date_prediction) elif relation1 == '手術項目': relation_label = '手術項目' output['relations'].setdefault(relation_label, { 'relation': relation_label, 'predictions': [] }) if date_prediction not in output['relations'][relation_label]['predictions']: output['relations'][relation_label]['predictions'].append(date_prediction) ['predictions'].append(date_prediction) if relation2: predictions, topk_predictions = hugie.request(text, keyword, relation=relation2) if predictions[0]: # 如果有預測結果 for prediction in predictions[0]: date_prediction = convert_to_ROC_time(prediction) if relation2 == '開始時間': relation_label = '受理_起始日時分秒' output['relations'][relation2] = { 'relation': relation_label, 'predictions': [date_prediction] } if relation2 == '終止時間': relation_label = '受理_終止日時分秒' output['relations'][relation2] = { 'relation': relation_label, 'predictions': [date_prediction] } existing_entities = [result['entity'] for result in results] if output['entity'] in existing_entities: # 合併相同實體的關係 existing_result = next((result for result in results if result['entity'] == output['entity']), None) existing_relations = existing_result['relations'] for relation, predictions in output['relations'].items(): existing_relations[relation] = predictions else: results.append(output) results = [result for result in results if result['relations']] return json.dumps(results, indent=4, ensure_ascii=False) title = "
醫囑分析:HugIE @ HugNLP" description = """
基於機器閱讀理解(MRC)的指令微調(Instruction-tuning)的統一信息抽取框架之診斷書醫囑擷取分析
https://github.com/Deep-Learning-101
https://github.com/Deep-Learning-101/Natural-Language-Processing-Paper