ki-ki13 committed on
Commit
d26f6fb
1 Parent(s): 8dadc22

first init

app.py ADDED
@@ -0,0 +1,18 @@
+ import gradio as gr
+ from model import predict_ner
+
+
+ iface = gr.Interface(
+     fn=predict_ner,
+     inputs="text",
+     outputs="html",
+     title="Named Entity Recognition for Electronic Medical Records using Bidirectional Long Short-Term Memory with ClinicalBERT",
+     description="This tool identifies and highlights entities in text. <br><span style='background-color:red;color:white;'>Red</span> is for problems, <br><span style='background-color:blue;color:white;'>Blue</span> is for tests, and <br><span style='background-color:green;color:white;'>Green</span> is for treatments.",
+     css="span { font-weight: bold; } .problem { background-color: red } .test { background-color: blue } .treatment { background-color: green }",
+     examples=[
+         ["The patient presented with symptoms of fever and cough. A chest X-ray was performed to assess the condition."],
+         ["After diagnosis, the physician prescribed antibiotics for the treatment of the infection."],
+         ["The patient, a 55-year-old male, presented to the emergency department with complaints of chest pain and shortness of breath. He has a history of hypertension and diabetes. On physical examination, the patient appeared diaphoretic, and his blood pressure was elevated at 160/90 mmHg. An electrocardiogram (ECG) was performed, which showed ST-segment elevation in the anterior leads, consistent with an acute myocardial infarction. The patient was immediately started on aspirin, nitroglycerin, and clopidogrel and was taken for emergent cardiac catheterization. Coronary angiography revealed a critical stenosis in the left anterior descending artery, which was successfully stented. The patient's chest pain improved, and he was admitted to the cardiac care unit for further monitoring. Laboratory tests showed elevated troponin levels, confirming the myocardial infarction. The patient was counseled on lifestyle modifications, including diet and exercise, and was prescribed medications for long-term management. He was discharged home in stable condition with instructions to follow up with his cardiologist in one week."]
+     ])
+
+ iface.launch()
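
For a quick smoke test outside the Gradio UI, the underlying `predict_ner` function can be called directly. A minimal sketch, assuming the `model/` weights added in this commit are available locally:

```python
from model import predict_ner

# predict_ner returns an HTML string in which detected entities are wrapped in
# <span class="problem">, <span class="test">, or <span class="treatment">.
html = predict_ner("The patient was given aspirin after an abnormal ECG.")
print(html)
```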
format_entity.py ADDED
@@ -0,0 +1,19 @@
+ def format_entities(input_text, predicted_labels):
+     formatted_text = ""
+     entity_open = False
+     for token, label in zip(input_text, predicted_labels):
+         if label.startswith("B-"):
+             if entity_open:
+                 formatted_text += "</span>"
+             formatted_text += f'<span class="{label[2:]}">{token} ({label}) '
+             entity_open = True
+         elif label.startswith("I-"):
+             formatted_text += token + " "
+         else:
+             if entity_open:
+                 formatted_text += "</span>"
+             formatted_text += token + " "
+             entity_open = False
+     if entity_open:
+         formatted_text += "</span>"
+     return formatted_text
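
A small worked example of what `format_entities` produces, using hand-written tokens and BIO labels rather than real model output:

```python
from format_entity import format_entities

tokens = ["fever", "and", "a", "chest", "x", "-", "ray"]
labels = ["B-problem", "O", "O", "B-test", "I-test", "I-test", "I-test"]

print(format_entities(tokens, labels))
# <span class="problem">fever (B-problem) </span>and a <span class="test">chest (B-test) x - ray </span>
```

Note that the opening BIO tag (e.g. `(B-problem)`) is echoed inside the span, and that the emitted CSS classes match the `.problem` / `.test` / `.treatment` rules declared in app.py.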
model.py ADDED
@@ -0,0 +1,43 @@
+ import torch
+ from format_entity import format_entities
+ from transformers import DistilBertForTokenClassification, DistilBertTokenizer
+
+ DRIVE_BASE_PATH = "model/"
+ model_path = f"{DRIVE_BASE_PATH}"
+
+ model = DistilBertForTokenClassification.from_pretrained(model_path)
+ tokenizer = DistilBertTokenizer.from_pretrained(model_path)
+
+ def predict_ner(input_text):
+     # Tokenize the input text
+     inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128)
+
+     # Make predictions
+     with torch.no_grad():
+         outputs = model(**inputs)
+
+     # Process the NER results
+     labels = outputs.logits.argmax(dim=2)
+     predicted_labels = [model.config.id2label[label_id] for label_id in labels[0].tolist()]
+     # probabilities = torch.nn.functional.softmax(outputs.logits, dim=2)
+
+     # Exclude [SEP] and [CLS] tokens from tokenized_text and predicted_labels
+     tokenized_text = tokenizer.tokenize(tokenizer.decode(inputs["input_ids"][0]))
+     token_label_pairs = [
+         (token, label) for token, label in zip(tokenized_text, predicted_labels)
+         if token not in ["[SEP]", "[CLS]"]
+     ]
+
+     # Format the results as highlighted HTML, excluding [SEP] and [CLS]
+     formatted_results = format_entities(
+         [pair[0] for pair in token_label_pairs],
+         [pair[1] for pair in token_label_pairs]
+     )
+
+     # Get top-n probabilities and labels (currently disabled)
+     # top_n_probs, top_n_labels = get_top_n_probs(probabilities[0], 6, list(model.config.id2label.values()))
+
+     return formatted_results
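
One caveat with this pipeline: `DistilBertTokenizer` produces WordPiece subtokens (continuation pieces prefixed with `##`), so the highlighted HTML shows word pieces rather than whole words. A hypothetical helper, not part of this commit, that merges pieces back into words before formatting (keeping each word's first-piece label) could look like:

```python
def merge_wordpieces(tokens, labels):
    # Glue "##" continuation pieces back onto the preceding token and keep
    # the label of each word's first piece.
    words, word_labels = [], []
    for tok, lab in zip(tokens, labels):
        if tok.startswith("##") and words:
            words[-1] += tok[2:]
        else:
            words.append(tok)
            word_labels.append(lab)
    return words, word_labels
```

Applied to `token_label_pairs` before `format_entities`, this would yield whole-word highlighting.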
model/config.json ADDED
@@ -0,0 +1,49 @@
+ {
+   "_name_or_path": "nlpie/clinical-distilbert",
+   "activation": "gelu",
+   "adapters": {
+     "adapters": {},
+     "config_map": {},
+     "fusion_config_map": {},
+     "fusions": {}
+   },
+   "architectures": [
+     "DistilBertForTokenClassification"
+   ],
+   "attention_dropout": 0.1,
+   "dim": 768,
+   "dropout": 0.1,
+   "hidden_dim": 3072,
+   "id2label": {
+     "0": "I-treatment",
+     "1": "O",
+     "2": "B-test",
+     "3": "I-problem",
+     "4": "B-treatment",
+     "5": "I-test",
+     "6": "B-problem"
+   },
+   "initializer_range": 0.02,
+   "label2id": {
+     "I-treatment": 0,
+     "O": 1,
+     "B-test": 2,
+     "I-problem": 3,
+     "B-treatment": 4,
+     "I-test": 5,
+     "B-problem": 6
+   },
+   "max_position_embeddings": 512,
+   "model_type": "distilbert",
+   "n_heads": 12,
+   "n_layers": 6,
+   "output_past": true,
+   "pad_token_id": 0,
+   "qa_dropout": 0.1,
+   "seq_classif_dropout": 0.2,
+   "sinusoidal_pos_embds": false,
+   "tie_weights_": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.27.0.dev0",
+   "vocab_size": 28996
+ }
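
The `id2label` map above follows the standard BIO scheme: `B-` opens an entity span, `I-` continues it, and `O` marks tokens outside any entity. A tiny illustration with made-up tags:

```python
tokens = ["complains", "of", "chest", "pain", ";", "ordered", "an", "ECG"]
tags = ["O", "O", "B-problem", "I-problem", "O", "O", "O", "B-test"]
# "chest pain" forms one problem span; "ECG" is a single-token test span.
```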
model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:081f66e415a24009bcdebad613beebe627a1a7b07855549ea02494fe0ce23f74
+ size 260818805
model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "name_or_path": "distilbert-base-cased",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "special_tokens_map_file": null,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "DistilBertTokenizer",
+   "unk_token": "[UNK]"
+ }
model/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:145be841037ec5f2d5313b58d1c40166d68fc7cf901a253a8827bcbb768cfae8
+ size 3503
model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
sample_app.py ADDED
@@ -0,0 +1,41 @@
+ from transformers import DistilBertForTokenClassification, DistilBertTokenizer
+ import torch
+
+ DRIVE_BASE_PATH = "model/"
+ model_path = f"{DRIVE_BASE_PATH}"
+
+ model = DistilBertForTokenClassification.from_pretrained(model_path)
+ tokenizer = DistilBertTokenizer.from_pretrained(model_path)
+
+ def predict_ner(input_text):
+     # Tokenize the input text
+     inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128)
+
+     # Make predictions
+     with torch.no_grad():
+         outputs = model(**inputs)
+
+     # Process the NER results
+     labels = outputs.logits.argmax(dim=2)
+     predicted_labels = [model.config.id2label[label_id] for label_id in labels[0].tolist()]
+     # predicted_labels = [label_mapping.get(model.config.id2label[label_id], "O") for label_id in labels[0].tolist()]
+     tokenized_text = tokenizer.tokenize(tokenizer.decode(inputs["input_ids"][0]))
+
+     # Pair tokens with their labels, excluding [SEP] and [CLS]
+     token_label_pairs = [(token, label) for token, label in zip(tokenized_text, predicted_labels) if token not in ["[SEP]", "[CLS]"]]
+
+     # Format the results vertically, one "Token: ..., Label: ..." line per token
+     formatted_results = []
+     for token, label in token_label_pairs:
+         formatted_results.append(f"Token: {token}, Label: {label}")
+
+     return {"text": input_text, "formatted_results": formatted_results}
+
+
+ input_text = """Also , due to worsening renal function , she was started on octreotide / midodrine / albumin for hepatorenal
+ syndrome ( Cr 3.3 at its worst ) which resolved prior to her discharge ."""
+ result = predict_ner(input_text)
+ print(result['text'])
+ # print(result['token_probabilities'])
+ for item in result['formatted_results']:
+     print(item)