arosyihuddin committed on
Commit
ecfd12f
·
1 Parent(s): 87e0b7e
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ venv
app.py CHANGED
@@ -1,10 +1,7 @@
1
- import sys
2
- sys.path.append("/home/pstar7/Documents/gradio/src")
3
-
4
  from transformers import BertTokenizerFast
5
  from gradio_pdf import PDF
6
- from BertModel import *
7
- from pdf_predict import *
8
  import gradio as gr
9
 
10
  ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
@@ -24,9 +21,9 @@ def predict(doc : str, model : str) -> str:
24
  use_model = model_indonlu
25
  use_tokenizer = tokenizer_indonlu
26
 
27
- result = pdf_predict(use_model, use_tokenizer, doc, ids_to_labels, model)
28
 
29
- return result
30
 
31
  iface = gr.Interface(
32
  fn=predict,
 
 
 
 
1
  from transformers import BertTokenizerFast
2
  from gradio_pdf import PDF
3
+ from src.bert import *
4
+ from src.legalNER import *
5
  import gradio as gr
6
 
7
  ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
 
21
  use_model = model_indonlu
22
  use_tokenizer = tokenizer_indonlu
23
 
24
+ ner = LegalNER(use_model, use_tokenizer, doc, ids_to_labels, model)
25
 
26
+ return ner.display()
27
 
28
  iface = gr.Interface(
29
  fn=predict,
src/__pycache__/bert.cpython-310.pyc ADDED
Binary file (881 Bytes). View file
 
src/__pycache__/helper.cpython-310.pyc ADDED
Binary file (2.26 kB). View file
 
src/__pycache__/legalNER.cpython-310.pyc ADDED
Binary file (4.17 kB). View file
 
src/align_word_ids.py DELETED
@@ -1,27 +0,0 @@
1
def align_word_ids(texts, tokenizer, label_all_tokens):
    """Build a per-token mask telling which token positions carry a prediction.

    The text is tokenized with padding/truncation to 512 positions and the
    tokenizer's ``word_ids()`` mapping is walked once.

    Args:
        texts: Text passed straight to ``tokenizer``.
        tokenizer: Callable whose result exposes ``word_ids()`` (one entry per
            token position, ``None`` for positions that belong to no word).
        label_all_tokens: When true, continuation pieces of a word are kept
            (``1``); otherwise only the first piece of each word is kept.

    Returns:
        A list with one entry per token position: ``-100`` for positions to
        ignore and ``1`` for positions to keep.
    """
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    # Value used for the 2nd, 3rd, ... piece of a word.
    continuation_label = 1 if label_all_tokens else -100

    label_ids = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None:
            # Position does not map to any word.
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            # First piece of a new word.
            label_ids.append(1)
        else:
            label_ids.append(continuation_label)
        previous_word_idx = word_idx

    return label_ids
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/{BertModel.py → bert.py} RENAMED
File without changes
src/convertTotext.py DELETED
@@ -1,22 +0,0 @@
1
def convertTotext(data_token, prediction_label):
    """Merge labelled tokens back into entity strings.

    Consecutive tokens whose label is not ``'O'`` and that share the same
    entity type (the label with an ``I_`` prefix normalised to ``B_``) are
    joined into one string. Tokens containing ``'##'`` are glued to the
    previous piece without a space; every other token is preceded by a space.

    Args:
        data_token: Sequence of token strings.
        prediction_label: Sequence of label strings, indexed in parallel with
            ``data_token`` (must be at least as long).

    Returns:
        Dict mapping the ``B_``-form label to the extracted text. If the same
        label occurs in several separate runs, the last run wins.
    """
    result = {}
    entity_key = None  # B_-form label of the run being accumulated, None outside a run
    text = ''

    for i, word in enumerate(data_token):
        tag = prediction_label[i]
        key = None if tag == 'O' else tag.replace("I_", "B_")

        if key != entity_key:
            # The run ended (either an 'O' token or a different entity type
            # started): store what was collected so two adjacent entities of
            # different types are not merged under one label.
            if text:
                result[entity_key] = text.strip()
            text = ''
            entity_key = key

        if key is None:
            continue

        if '##' in word:
            text += word.replace('##', '')
        else:
            text += ' ' + word

    # An entity that runs up to the very last token has no following 'O'
    # token to trigger the store above, so flush it here.
    if text:
        result[entity_key] = text.strip()

    return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/{clean_text.py → helper.py} RENAMED
@@ -1,3 +1,5 @@
 
 
1
  import re
2
 
3
  def clean_text(text):
@@ -12,4 +14,22 @@ def clean_text(text):
12
  text = re.sub(r'Hal. \d+ dari \d+ .*', '', text)
13
  text = re.sub(r' +|[\uf0fc\uf0a7\uf0a8\uf0b7]', ' ', text)
14
  text = re.sub(r'[\u2026]+|\.{3,}', '', text)
15
- return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import requests
3
  import re
4
 
5
  def clean_text(text):
 
14
  text = re.sub(r'Hal. \d+ dari \d+ .*', '', text)
15
  text = re.sub(r' +|[\uf0fc\uf0a7\uf0a8\uf0b7]', ' ', text)
16
  text = re.sub(r'[\u2026]+|\.{3,}', '', text)
17
+ return text.strip()
18
+
19
def read_pdf(pdf):
    """Return the cleaned text of every page of a PDF, concatenated.

    Each page is extracted with PyPDF2, passed through ``clean_text`` and the
    pages are joined without a separator; the final string is stripped.

    Args:
        pdf: Path of the PDF file to open in binary mode.

    Returns:
        The concatenated, cleaned page text.

    Raises:
        Whatever ``open`` or PyPDF2 raise (e.g. ``OSError`` for an unreadable
        path). The previous ``requests.exceptions.RequestException`` handler
        was dropped: nothing called here performs an HTTP request, so it could
        not fire and only suggested error handling that did not exist.
    """
    # The context manager closes the file even when PyPDF2 fails part-way.
    with open(pdf, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [clean_text(page.extract_text()) for page in pdf_reader.pages]

    return ''.join(page_texts).strip()
src/legalNER.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.helper import *
2
+ import gradio as gr
3
+ import torch
4
+
5
class LegalNER():
    """Run a token-classification model over a PDF and summarise the entities.

    The PDF text is split on ``';'``, every piece is tokenized and classified,
    the labelled tokens are merged back into entity strings and the result is
    rendered as a numbered, human-readable list (labels in Indonesian).
    """

    def __init__(self, model, tokenizer, pdf_file, ids_to_labels, check_point='IndoBERT (IndoLEM)', label_all_tokens=True):
        """Store the collaborators and the label-to-caption table.

        Args:
            model: Model object; ``display`` calls ``cuda()``,
                ``load_state_dict()`` and ``model(input_ids, mask, None)`` on it.
            tokenizer: Tokenizer whose encodings expose ``word_ids()``.
            pdf_file: Path of the PDF handed to ``read_pdf``.
            ids_to_labels: Mapping from predicted class index to label string.
            check_point: ``'IndoBERT (IndoLEM)'`` selects the IndoLEM weights;
                any other value selects the IndoNLU weights.
            label_all_tokens: Whether continuation word pieces are kept when
                selecting logits. NOTE(review): ``labelToText`` indexes labels
                in parallel with all non-special tokens, so ``False`` looks
                unsupported by ``display`` - confirm before using it.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.pdf = pdf_file
        self.check_point = check_point
        self.label_all_tokens = label_all_tokens
        self.ids_to_labels = ids_to_labels
        # Labels of the piece currently being processed; set by display().
        self.prediction_label = ''
        # Caption shown to the user for each (B_-form) entity label.
        self.label_convert = {'B_VERN' : 'Nomor Putusan',
                              'B_DEFN' : 'Nama Terdakwa',
                              'B_CRIA' : 'Tindak Pidana',
                              'B_ARTV' : 'Melanggar KUHP',
                              'B_PENA' : 'Tuntutan Hukum',
                              'B_PUNI' : 'Putusan Hukum',
                              'B_TIMV' : 'Tanggal Putusan',
                              'B_JUDP' : 'Hakim Ketua',
                              'B_JUDG' : 'Hakim Anggota',
                              'B_REGI' : 'Panitera',
                              'B_PROS' : 'Penuntut Umum',
                              'B_ADVO' : 'Pengacara',
                              }

    def align_word_ids(self, texts):
        """Return a keep/ignore mask (``1`` / ``-100``) per token position.

        Positions whose ``word_ids()`` entry is ``None`` are ignored; the first
        piece of every word is kept; continuation pieces are kept only when
        ``self.label_all_tokens`` is true.
        """
        tokenized_inputs = self.tokenizer(texts, padding='max_length', max_length=512, truncation=True)

        continuation_label = 1 if self.label_all_tokens else -100

        label_ids = []
        previous_word_idx = None
        for word_idx in tokenized_inputs.word_ids():
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(1)
            else:
                label_ids.append(continuation_label)
            previous_word_idx = word_idx

        return label_ids

    def labelToText(self, data_token):
        """Join the tokens of each labelled run into one string per label.

        Reads ``self.prediction_label`` in parallel with ``data_token``.
        Tokens containing ``'##'`` are appended without a space, all others
        with a leading space. Runs are keyed by their label with ``I_``
        normalised to ``B_``; a later run with the same key replaces an
        earlier one.

        Returns:
            Dict mapping B_-form label to the extracted text.
        """
        result = {}
        entity_key = None  # B_-form label of the run in progress, None outside a run
        text = ''

        # Merge all tokens into one phrase per label.
        for i, word in enumerate(data_token):
            tag = self.prediction_label[i]
            key = None if tag == 'O' else tag.replace("I_", "B_")

            if key != entity_key:
                # Run ended ('O' token or a different entity type begins):
                # store it so adjacent entities of different types are not
                # merged under a single label.
                if text:
                    result[entity_key] = text.strip()
                text = ''
                entity_key = key

            if key is None:
                continue

            if '##' in word:
                text += word.replace('##', '')
            else:
                text += ' ' + word

        # A run that reaches the last token has no trailing 'O' to flush it.
        if text:
            result[entity_key] = text.strip()

        return result

    def labelConverter(self, entity):
        """Combine per-piece entity dicts and render them as a numbered list.

        Args:
            entity: Iterable of dicts as returned by ``labelToText``.

        Returns:
            One line per label in first-seen order, formatted as
            ``"<n>. <caption><tabs> = <Text>\\n"``.
        """
        # Pick one value per label.
        entity_result = {}
        for found in entity:
            if len(found) > 1:
                # Several entities in one piece: keep the longest text per label.
                for label, text in found.items():
                    if label not in entity_result or len(entity_result[label]) < len(text):
                        entity_result[label] = text
            elif found:
                # Single entity: keep the first value seen for the label.
                # (The membership test used to compare a (label, text) tuple
                # with the dict keys, which is always true, so later pieces
                # silently overwrote earlier ones.)
                label, text = next(iter(found.items()))
                if label not in entity_result:
                    entity_result[label] = text

        # Render the result as a string; the tab count only pads the captions
        # of different length to a similar width.
        tabs_by_label = {
            'B_PENA': '\t', 'B_ARTV': '\t', 'B_PROS': '\t',
            'B_JUDP': '\t\t\t', 'B_CRIA': '\t\t\t',
            'B_ADVO': '\t\t\t\t\t', 'B_REGI': '\t\t\t\t\t',
        }
        lines = []
        for i, (label, data) in enumerate(entity_result.items()):
            tabs = tabs_by_label.get(label, '\t\t')
            lines.append(f'{i+1}. {self.label_convert[label]}{tabs} = {data.capitalize()}\n')

        return ''.join(lines)

    def display(self, progress=gr.Progress()):
        """Extract entities from ``self.pdf`` and return the formatted summary.

        Loads the checkpoint selected by ``self.check_point`` into
        ``self.model`` (moving it to CUDA when available), classifies every
        ``';'``-separated piece of the PDF text and passes the collected
        entity dicts to ``labelConverter``.

        Args:
            progress: Gradio progress tracker used to wrap the loop.

        Returns:
            The string produced by ``labelConverter``.
        """
        file_pdf = read_pdf(self.pdf)
        sentence_file = file_pdf.split(';')

        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")
        if use_cuda:
            self.model = self.model.cuda()

        file_check_point = 'model/IndoLEM/model_fold_4.pth' if self.check_point == 'IndoBERT (IndoLEM)' else 'model/IndoNLU/model_fold_4.pth'

        model_weights = torch.load(file_check_point, map_location=device)
        self.model.load_state_dict(model_weights)
        # Inference only: switch off dropout etc. so predictions are
        # deterministic, and skip autograd bookkeeping below.
        self.model.eval()

        special_tokens = ('[CLS]', '[SEP]', '[PAD]')
        label_extraction = []
        with torch.no_grad():
            for text in progress.tqdm(sentence_file, desc="Ekstraksi Entitas"):
                toknize = self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
                input_ids = toknize['input_ids'].to(device)
                mask = toknize['attention_mask'].to(device)

                # NOTE(review): assumes the model returns a sequence whose first
                # item holds per-token class scores - confirm against src.bert.
                logits = self.model(input_ids, mask, None)
                label_ids = torch.Tensor(self.align_word_ids(text)).unsqueeze(0).to(device)
                # Keep only the positions marked as real word pieces.
                logits_clean = logits[0][label_ids != -100]
                predictions = logits_clean.argmax(dim=1).tolist()
                prediction_label = [self.ids_to_labels[i] for i in predictions]

                input_ids_conv = self.tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
                data_token = [word for word in input_ids_conv if word not in special_tokens]
                self.prediction_label = prediction_label
                labelConv = self.labelToText(data_token)

                if labelConv:
                    label_extraction.append(labelConv)

        return self.labelConverter(label_extraction)
src/pdf_predict.py DELETED
@@ -1,48 +0,0 @@
1
- from tqdm import tqdm
2
- import torch
3
- from read_file import *
4
- from align_word_ids import *
5
- from convertTotext import *
6
-
7
def pdf_predict(model, tokenizer, file_path, ids_to_labels, check_point='IndoBERT (IndoLEM)'):
    """Classify every ``';'``-separated piece of a PDF and collect its entities.

    Args:
        model: Model object; ``cuda()``, ``load_state_dict()`` and
            ``model(input_ids, mask, None)`` are called on it.
        tokenizer: Tokenizer used for encoding and for
            ``convert_ids_to_tokens``.
        file_path: Path of the PDF handed to ``read_pdf``.
        ids_to_labels: Mapping from predicted class index to label string.
        check_point: ``'IndoBERT (IndoLEM)'`` selects the IndoLEM weights;
            any other value selects the IndoNLU weights.

    Returns:
        List of the non-empty dicts returned by ``convertTotext``, one per
        piece of text that contained at least one entity.
    """
    file_pdf = read_pdf(file_path)
    sentence_file = file_pdf.split(';')

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    file_check_point = 'model/IndoLEM/model_fold_4.pth' if check_point == 'IndoBERT (IndoLEM)' else 'model/IndoNLU/model_fold_4.pth'

    model_weights = torch.load(file_check_point, map_location=device)
    model.load_state_dict(model_weights)
    # Inference only: disable dropout and autograd bookkeeping.
    model.eval()

    special_tokens = ('[CLS]', '[SEP]', '[PAD]')
    label_extraction = []
    with torch.no_grad():
        for text in tqdm(sentence_file, desc="Prediction Sentence"):
            toknize = tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
            input_ids = toknize['input_ids'].to(device)
            mask = toknize['attention_mask'].to(device)

            # NOTE(review): assumes the first item of the model output holds
            # per-token class scores - confirm against the model definition.
            logits = model(input_ids, mask, None)
            label_ids = torch.Tensor(align_word_ids(text, tokenizer, True)).unsqueeze(0).to(device)
            # Keep only the positions marked as real word pieces.
            logits_clean = logits[0][label_ids != -100]
            predictions = logits_clean.argmax(dim=1).tolist()
            prediction_label = [ids_to_labels[i] for i in predictions]

            input_ids_conv = tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
            data_token = [word for word in input_ids_conv if word not in special_tokens]
            nerExtraction = convertTotext(data_token, prediction_label)

            if nerExtraction:
                label_extraction.append(nerExtraction)

    return label_extraction
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/read_file.py DELETED
@@ -1,21 +0,0 @@
1
- import PyPDF2
2
- from clean_text import *
3
- import requests
4
-
5
def read_pdf(file_pdf):
    """Return the cleaned text of every page of a PDF, concatenated.

    Each page is extracted with PyPDF2, passed through ``clean_text`` and the
    pages are joined without a separator; the final string is stripped.

    Args:
        file_pdf: Path of the PDF file to open in binary mode.

    Returns:
        The concatenated, cleaned page text.

    Raises:
        Whatever ``open`` or PyPDF2 raise (e.g. ``OSError`` for an unreadable
        path). The previous ``requests.exceptions.RequestException`` handler
        was dropped: nothing called here performs an HTTP request, so it could
        not fire.
    """
    # The context manager closes the file even when PyPDF2 fails part-way.
    with open(file_pdf, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [clean_text(page.extract_text()) for page in pdf_reader.pages]

    return ''.join(page_texts).strip()