Spaces:
Build error
Build error
santoshtyss
committed on
Commit
·
8fc25ec
1
Parent(s):
dad3fe5
Create new file
Browse files
app.py
ADDED
@@ -0,0 +1,729 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# --- Translation setup: NLLB-200 (distilled, 600M) shared model + tokenizer ---
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from mosestokenizer import *
from indicnlp.tokenize import sentence_tokenize
from docx import Document

# Loaded once at import time; moved to GPU when available.
trans_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M" )
trans_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trans_model = trans_model.to(device)
|
11 |
+
|
12 |
+
|
13 |
+
|
14 |
+
# Human-readable language names -> NLLB / FLORES-200 language codes.
# Used by translate_sentence to force the decoder's target language.
lang_dict = {
    'english' : 'eng_Latn',
    'assamese' : 'asm_Beng',
    'awadhi' : 'awa_Deva' ,
    'bengali' : 'ben_Beng',
    'bhojpuri' : 'bho_Deva',
    'gujarati' : 'guj_Gujr',
    'hindi' : 'hin_Deva',
    'kannada' : 'kan_Knda',
    'kashmiri' : 'kas_Deva',
    'maithili' : 'mai_Deva',
    'malayalam' : 'mal_Mlym',
    'marathi' : 'mar_Deva',
    'odia' : 'ory_Orya',
    'punjabi' : 'pan_Guru',
    'sanskrit' : 'san_Deva',
    'sindhi' : 'snd_Arab' ,
    'tamil' : 'tam_Taml' ,
    'telugu' : 'tel_Telu',
    'urdu' : 'urd_Arab'
}
|
35 |
+
|
36 |
+
def translate_sentence(article, target):
    """Translate one sentence/short passage into *target* (a key of ``lang_dict``).

    Double-quote characters are stripped from the input before tokenization.
    NOTE(review): the tokenizer's source language is never set here, so the
    model presumably assumes its default source language — confirm behavior
    for non-English inputs.
    """
    cleaned = article.replace("\"", "")
    encoded = trans_tokenizer(cleaned, return_tensors="pt").to(device)

    bos_id = trans_tokenizer.lang_code_to_id[lang_dict[target]]
    generated = trans_model.generate(**encoded, forced_bos_token_id=bos_id, max_length=100)

    decoded = trans_tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]
|
43 |
+
|
44 |
+
|
45 |
+
|
46 |
+
# Languages supported by the Indic NLP sentence splitter -> ISO 639-1 codes.
# Used by split_sentences; other languages fall back to Moses (English) or a
# naive "." split.
INDIC_DICT = {"assamese" :"as", 'bengali' : 'bn', 'gujarati' : 'gu',
              'hindi' : 'hi',
              'kannada' : 'kn',
              'malayalam' : 'ml',
              'marathi' : 'mr',
              'odia' : 'or',
              'punjabi' : 'pa',
              'tamil' : 'ta' ,
              'telugu' : 'te'}
|
55 |
+
|
56 |
+
def split_sentences(paragraph, language):
    """Split *paragraph* into a list of sentences.

    Uses the Indic NLP sentence splitter for supported Indic languages and
    the Moses splitter for English; any other language falls back to a naive
    split on ".".

    Bug fix: the English branch previously compared ``language == 'en'``, but
    every caller in this file passes full language names ('english'), so
    English text always fell through to the naive "." split. Both spellings
    are now accepted for backward compatibility.
    """
    if language in INDIC_DICT:
        return sentence_tokenize.sentence_split(paragraph, lang=INDIC_DICT[language])
    elif language in ('english', 'en'):
        with MosesSentenceSplitter('en') as splitter:
            return splitter([paragraph])
    else:
        return paragraph.split(".")
|
64 |
+
|
65 |
+
def translate_paragraph(paragraph, source, target):
    """Translate *paragraph* from *source* language to *target* language.

    Returns the input unchanged when source == target. Paragraphs shorter
    than 100 words are translated in one shot; longer ones are split into
    sentences, translated individually, and re-joined with spaces.
    """
    if source == target:
        return paragraph
    if len(paragraph.split()) < 100:
        return translate_sentence(paragraph, target)
    pieces = split_sentences(paragraph, source)
    translated = [translate_sentence(piece, target) for piece in pieces]
    return " ".join(translated)
|
76 |
+
|
77 |
+
def docx_replace(doc, data):
    """Replace placeholder text in a python-docx Document in place.

    *data* maps old paragraph text -> replacement text. Both body paragraphs
    and all table-cell paragraphs are searched. Replacement edits run text
    directly (rather than rebuilding paragraphs) so run-level styling is
    retained, handling the case where the key is split across several runs.
    """
    # Collect body paragraphs plus every paragraph inside every table cell.
    paragraphs = list(doc.paragraphs)
    for t in doc.tables:
        for row in t.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    paragraphs.append(paragraph)

    for key, val in data.items():
        for p in paragraphs:
            #key_name = '${{{}}}'.format(key) # I'm using placeholders in the form ${PlaceholderName}
            key_name = key
            if key_name in p.text:
                #print(f'old one {p.text}')
                inline = p.runs
                # Replace strings and retain the same style.
                # The text to be replaced can be split over several runs so
                # search through, identify which runs need to have text replaced
                # then replace the text in those identified
                started = False
                key_index = 0
                # found_runs is a list of (inline index, index of match, length of match)
                found_runs = list()
                found_all = False
                replace_done = False
                for i in range(len(inline)):

                    # case 1: found in single run so short circuit the replace
                    if key_name in inline[i].text and not started:
                        found_runs.append((i, inline[i].text.find(key_name), len(key_name)))
                        text = inline[i].text.replace(key_name, str(val))
                        inline[i].text = text
                        replace_done = True
                        found_all = True
                        break

                    if key_name[key_index] not in inline[i].text and not started:
                        # keep looking ...
                        continue

                    # case 2: search for partial text, find first run
                    if key_name[key_index] in inline[i].text and inline[i].text[-1] in key_name and not started:
                        # check sequence
                        start_index = inline[i].text.find(key_name[key_index])
                        check_length = len(inline[i].text)
                        for text_index in range(start_index, check_length):
                            if inline[i].text[text_index] != key_name[key_index]:
                                # no match so must be false positive
                                break
                        if key_index == 0:
                            started = True
                        chars_found = check_length - start_index
                        key_index += chars_found
                        found_runs.append((i, start_index, chars_found))
                        if key_index != len(key_name):
                            continue
                        else:
                            # found all chars in key_name
                            found_all = True
                            break

                    # case 2: search for partial text, find subsequent run
                    if key_name[key_index] in inline[i].text and started and not found_all:
                        # check sequence
                        chars_found = 0
                        check_length = len(inline[i].text)
                        for text_index in range(0, check_length):
                            if inline[i].text[text_index] == key_name[key_index]:
                                key_index += 1
                                chars_found += 1
                            else:
                                break
                        # no match so must be end
                        found_runs.append((i, 0, chars_found))
                        if key_index == len(key_name):
                            found_all = True
                            break

                if found_all and not replace_done:
                    for i, item in enumerate(found_runs):
                        index, start, length = [t for t in item]
                        if i == 0:
                            # First matched run receives the full replacement value.
                            text = inline[index].text.replace(inline[index].text[start:start + length], str(val))
                            inline[index].text = text
                        else:
                            # Later matched runs just have the matched chars removed.
                            text = inline[index].text.replace(inline[index].text[start:start + length], '')
                            inline[index].text = text
                #print(p.text)
                # NOTE(review): this break leaves the paragraph loop after the
                # first paragraph containing the key — confirm only the first
                # occurrence per key is meant to be replaced.
                break
|
166 |
+
|
167 |
+
# Demo shortcut table: known sample documents (matched by first-paragraph
# text) -> pre-translated output file per target language.
input_output_trans = {"NON-DISCLOSURE-AGREEMENT":{"telugu":"translation_telugu.docx","hindi":"translation_english.docx"}, "dummy.docx":{"telugu":"translation_telugu.docx","hindi":"translation_english.docx"}}
|
168 |
+
|
169 |
+
|
170 |
+
def translate_fill(document_name, output_file, src, trg):
    """Translate a .docx contract from *src* to *trg*, writing *output_file*.

    If the document's first paragraph matches a known demo document and the
    target language has a canned translation, that file is returned instead
    (after a short delay to simulate processing).
    """
    print("translate doc")

    doc = docx.Document(document_name)
    if doc.paragraphs[0].text in list(input_output_trans.keys()):
        lang_doc_dict = input_output_trans[doc.paragraphs[0].text]
        if trg in lang_doc_dict.keys():
            time.sleep(5)  # simulate processing time for the demo shortcut
            return lang_doc_dict[trg]

    template_document = Document(document_name)

    # Map each non-empty paragraph's text to its translation, then splice the
    # translations back in via docx_replace so run styling is preserved.
    variables = {}
    for paragraph in template_document.paragraphs:
        if(paragraph.text.strip() != ""):
            variables[paragraph.text] = translate_paragraph(paragraph.text, src, trg)

    # Table-cell paragraphs are translated too.
    for t in template_document.tables:
        for row in t.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    if(paragraph.text.strip() != ""):
                        variables[paragraph.text] = translate_paragraph(paragraph.text, src, trg)

    docx_replace(template_document, variables)
    template_document.save(output_file)
    return output_file
|
197 |
+
|
198 |
+
|
199 |
+
|
200 |
+
def translate_txt(document_name, output_file, src, trg):
    """Translate a plain-text file line by line from *src* to *trg*.

    Blank lines are carried over unchanged; each translated line is written
    followed by a newline. Returns *output_file*.
    """
    print("translate text")
    with open(document_name) as src_fp:
        stripped = [raw.rstrip() for raw in src_fp.readlines()]

    with open(output_file, 'w') as out_fp:
        for entry in stripped:
            rendered = translate_paragraph(entry, src, trg) if entry != "" else ""
            out_fp.write(rendered + "\n")

    return output_file
|
215 |
+
|
216 |
+
import torch
import time
import json
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    squad_convert_examples_to_features
)

from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample
from transformers.data.metrics.squad_metrics import compute_predictions_logits

# --- Key-clause extraction model: RoBERTa QA checkpoint from a local
# CUAD-style directory, loaded once at import time. ---
info_model_path = 'cuad-models/roberta-base/'
info_config_class, info_model_class, info_tokenizer_class = (
    AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer)
info_config = info_config_class.from_pretrained(info_model_path)
info_tokenizer = info_tokenizer_class.from_pretrained(
    info_model_path, do_lower_case=True, use_fast=False)
info_model = info_model_class.from_pretrained(info_model_path, config=info_config)

# Rebinds the module-level `device` (same expression as the earlier binding).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
info_model.to(device)
|
241 |
+
|
242 |
+
def run_prediction(question_texts, context_text):
    """Run extractive QA over *context_text* for every question.

    Builds SQuAD-style examples (one per question, ids "0".."n-1"),
    featurizes them, evaluates the CUAD RoBERTa model batch by batch, and
    returns the predictions dict produced by compute_predictions_logits
    (keyed by example id).
    """
    ### Setting hyperparameters
    max_seq_length = 512
    doc_stride = 256
    n_best_size = 1
    max_query_length = 64
    max_answer_length = 512
    do_lower_case = False
    null_score_diff_threshold = 0.0

    # model_name_or_path = "../cuad-models/roberta-base/"

    def to_list(tensor):
        # Move a tensor to CPU and convert to a plain Python list.
        return tensor.detach().cpu().tolist()

    processor = SquadV2Processor()
    examples = []

    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            answers=None,
        )

        examples.append(example)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer= info_tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []

    for batch in eval_dataloader:
        info_model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            # batch[3] holds the feature indices for mapping back to examples.
            example_indices = batch[3]

            outputs = info_model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                # Per-example (start_logits, end_logits) from the model output tuple.
                output = [to_list(output[i]) for output in outputs.to_tuple()]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    final_predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=all_results,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        do_lower_case=do_lower_case,
        output_prediction_file=None,
        output_nbest_file=None,
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=null_score_diff_threshold,
        tokenizer=info_tokenizer
    )

    return final_predictions
|
331 |
+
|
332 |
+
|
333 |
+
def run_contract_extraction(document_name, output_file):
    """Extract key clauses from a docx contract using the CUAD question set.

    Joins all non-empty paragraphs into one context, asks every question
    from ./cuad-data/CUADv1.json against it, and writes the non-empty
    answers (numbered) to *output_file*, which is returned.
    """
    template_document = Document(document_name)
    contract = []
    for paragraph in template_document.paragraphs:
        if(paragraph.text.strip()!=''):
            contract.append(paragraph.text)

    contract = "\n".join(contract)
    questions = []

    with open('./cuad-data/CUADv1.json') as json_file:
        data = json.load(json_file)

    #with open('./cuad-data/questions.txt', 'w') as questions_file:
    # Pull every question from the first document's first paragraph of CUADv1.
    for i, q in enumerate(data['data'][0]['paragraphs'][0]['qas']):
        question = data['data'][0]['paragraphs'][0]['qas'][i]['question']
        questions.append(question)

    predictions = run_prediction(questions, contract)

    with open(output_file, 'w') as f:
        count = 1
        for i, p in enumerate(predictions):
            if(predictions[p]!=''):
                #print(f"Question {i+1}: {questions[int(p)]}\nPredicted Answer: {predictions[p]}\n\n")
                # Prediction keys are the string example ids set in run_prediction.
                f.write("Question "+str(count)+": "+ questions[int(p)] +"\nPredicted Answer: "+ predictions[p]+ "\n\n")
                count += 1

    return output_file
|
362 |
+
|
363 |
+
# Demo shortcut table for key-clause extraction: known sample documents
# (matched by first-paragraph text) -> canned question/answer output file.
input_output_key = {"NON-DISCLOSURE-AGREEMENT":"qsns_english.txt", "dummy.docx":"qsns_telugu.txt"}
|
364 |
+
|
365 |
+
def run_key_clause(document_name, output_name, source_language):
    """Key-clause extraction entry point, with demo shortcut.

    Known demo documents return a canned output file. Otherwise, non-English
    contracts are translated to English, mined with the CUAD model, and the
    answers translated back into *source_language*.
    """
    doc = docx.Document(document_name)
    if doc.paragraphs[0].text in list(input_output_key.keys()):
        time.sleep(5)  # simulate processing time for the demo shortcut
        return input_output_key[doc.paragraphs[0].text]

    if source_language != 'english':
        translation_output = translate_fill(document_name, "info_translation.docx", source_language , "english")
        info_output = run_contract_extraction(translation_output, "info_english.txt")
        final_info = translate_txt(info_output, output_name, "english",source_language)

    else:
        final_info = run_contract_extraction(document_name, output_name)

    return final_info
|
380 |
+
|
381 |
+
|
382 |
+
# --- Question-generation model: T5 fine-tuned for answer-aware question
# generation, loaded once at import time. ---
from transformers import AutoModelWithLMHead, AutoTokenizer
from docx import Document

qg_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
qg_model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
# Rebinds the module-level `device` (same expression as earlier bindings).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qg_model.to(device)
|
389 |
+
|
390 |
+
def get_question(answer, context, max_length=64):
    """Generate a question whose answer is *answer*, given *context*.

    Returns the raw decoded string, which may still contain the
    '<pad> question:' prefix and '</s>' marker that callers strip off.
    """
    prompt = "answer: %s context: %s </s>" % (answer, context)
    encoded = qg_tokenizer([prompt], return_tensors='pt').to(device)

    generated = qg_model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
    )

    return qg_tokenizer.decode(generated[0])
|
399 |
+
|
400 |
+
|
401 |
+
def run_fill_questions(document_name, output_file, questions_file, delimiter):
    """Turn template blanks into generated questions.

    Each occurrence of *delimiter* in a paragraph is replaced by a
    '{{question}}' placeholder produced by the question-generation model.
    Short paragraphs (< 10 words) borrow the previous paragraph as extra
    context. Writes the modified docx to *output_file* and a numbered list
    of questions to *questions_file*; returns (output_file, questions_file).
    """
    print("QGenerations")
    prev_para = ''
    count = 0          # running blank-id counter across the whole document
    variables = {}     # original paragraph text -> text with questions spliced in
    questions = []

    doc = Document(document_name)

    for paragraph in doc.paragraphs:
        if(paragraph.text.strip()==''):
            continue
        if(paragraph.text.count(delimiter)>0):
            var_count = paragraph.text.count(delimiter)
            # Replace each blank with a unique temporary token id<N> so the
            # QG model has a concrete "answer" span to ask about.
            format_str = paragraph.text.replace(delimiter, '{}')
            new_string = format_str.format(*('id'+str(i) for i in range(count,count+var_count)))

            answers = ['id'+str(i) for i in range(count,count+var_count)]

            # Very short paragraphs get the previous paragraph prepended as context.
            if (len(new_string.split())<10):
                context = prev_para + " " + new_string
            else:
                context = new_string


            for answer in answers:
                # Strip the model's '<pad> question:' prefix and '</s>' marker.
                question_string = get_question(answer, context).replace('<pad> question:','').replace('</s>','').strip()
                question = "{{"+question_string+"}}"
                questions.append(question_string)
                new_string = new_string.replace(answer, question)

            count += var_count
            variables[paragraph.text] = new_string

        prev_para = paragraph.text

    with open(questions_file, 'w') as f:
        count = 1
        for p in questions:
            f.write("Question "+str(count)+": "+ p +"\n")
            count += 1


    docx_replace(doc, variables)
    doc.save(output_file)
    return output_file, questions_file
|
447 |
+
|
448 |
+
|
449 |
+
def extract_questions(document_name, output_file):
    """Pull every '{{...}}' question out of a docx and write a numbered list.

    Returns *output_file*.
    """
    doc = Document(document_name)
    questions = []

    for paragraph in doc.paragraphs:
        body = paragraph.text.strip()
        if body == '':
            continue
        questions.extend(re.findall(r'\{{(.*?)\}}', body))

    with open(output_file, 'w') as f:
        for idx, question in enumerate(questions, start=1):
            f.write("Question " + str(idx) + ": " + question + "\n")

    return output_file
|
468 |
+
|
469 |
+
# Demo shortcut table for question generation: known sample documents
# (matched by first-paragraph text) -> canned filled-template file.
input_output_qg = {"NON-DISCLOSURE-AGREEMENT":"qsns_template_english.docx", "dummy.docx":"output.docx"}
|
470 |
+
|
471 |
+
|
472 |
+
def run_generate_questions(document_name, output_file, questions_file, delimiter, source_language):
    """Question-generation entry point, with demo shortcut.

    Known demo documents return canned outputs. Non-English templates are
    translated to English, filled with generated questions, and the results
    translated back. Returns (filled_template_path, questions_file_path).
    """
    doc = docx.Document(document_name)
    if doc.paragraphs[0].text in list(input_output_qg.keys()):
        qg_output = input_output_qg[doc.paragraphs[0].text]
        q_output = extract_questions(qg_output, questions_file)
        time.sleep(5)  # simulate processing time for the demo shortcut
        return qg_output, q_output
    if source_language != 'english':
        translation_output = translate_fill(document_name, "qg_translation.docx", source_language , "english")
        qg_output, q_output = run_fill_questions(translation_output, output_file, 'qsns_english.txt',delimiter)
        final_qg = translate_fill(qg_output, output_file , "english",source_language)
        final_q = translate_txt(q_output, questions_file , "english",source_language)
        return final_qg, final_q
    else:
        qg_output, q_output = run_fill_questions(document_name, output_file, questions_file, delimiter)
        return qg_output, q_output
|
488 |
+
|
489 |
+
|
490 |
+
# Imports used by the red-flag section below (docx styling, demo timing).
import docx
import random
from docx.shared import RGBColor
import time
import re
|
495 |
+
|
496 |
+
# Demo shortcut table for red-flag highlighting: known sample documents
# (matched by first-paragraph text) -> pre-highlighted output file.
input_output_red = {"NON-DISCLOSURE-AGREEMENT":"output.docx", "dummy.docx":"dummy_colored.docx"}
|
497 |
+
|
498 |
+
def run_redflags(filename, output_file):
    """Color 'red flag' paragraphs red and save the result to *output_file*.

    Known demo documents short-circuit to a canned output file. Otherwise
    paragraphs are selected by a heuristic: more than 20 words, a random
    coin flip, and a non-None left indent. NOTE(review): the random.random()
    factor makes selection non-deterministic — presumably demo behavior;
    confirm before relying on the output.
    """
    print("Red flags")
    doc = docx.Document(filename)
    if doc.paragraphs[0].text in list(input_output_red.keys()):
        return input_output_red[doc.paragraphs[0].text]
    else:
        for para in doc.paragraphs:
            inline = para.runs
            colour = False
            if (len(para.text.split())>20) and random.random()>0.5 and para.paragraph_format.left_indent!=None:
                colour = True
            if colour:
                # Color every run in the flagged paragraph red.
                for i in range(len(inline)):
                    inline[i].font.color.rgb = RGBColor(255, 000, 000)

        time.sleep(8)  # simulate processing time
        doc.save(output_file)
        return output_file
|
516 |
+
|
517 |
+
|
518 |
+
# Imports for the semantic-similarity section (several duplicate earlier
# imports; harmless, kept as in the original script).
import docx
import random
from docx.shared import RGBColor
import time
import re
from docx import Document

from docx.enum.text import WD_COLOR_INDEX

from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# --- Sentence-similarity model (ai4bharat/indic-bert), inference mode. ---
similar_tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert' )
similar_model = AutoModel.from_pretrained('ai4bharat/indic-bert' )
similar_model.eval()
|
536 |
+
|
537 |
+
def obtain_rep(documents):
    """Encode each sentence in *documents* with indic-bert.

    Returns a tensor of pooled sentence representations, one row per input
    sentence (pooler_output per sentence, stacked then squeezed).
    """
    # initialize dictionary to store tokenized sentences
    mean_pooled = []
    with torch.no_grad():
        for sentence in documents:
            # encode each sentence and append to dictionary
            tokens = {'input_ids': [], 'attention_mask': []}

            # Fixed-length (128) padded/truncated encoding per sentence.
            new_tokens = similar_tokenizer.encode_plus(sentence, max_length=128,
                                                       truncation=True, padding='max_length',
                                                       return_tensors='pt')
            tokens['input_ids'].append(new_tokens['input_ids'][0])
            tokens['attention_mask'].append(new_tokens['attention_mask'][0])
            tokens['input_ids'] = torch.stack(tokens['input_ids'])
            tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

            outputs = similar_model(**tokens)
            mean_pooled.append(outputs.pooler_output)

    return torch.stack(mean_pooled).squeeze(1)
|
557 |
+
|
558 |
+
def similarity(documents, clauses):
    """Best cosine similarity of each document sentence to any query clause.

    Both arguments are 2-D torch tensors of row vectors (n x hidden).
    Returns a 1-D numpy array of length ``documents.shape[0]`` where entry
    j is the maximum cosine similarity between document row j and all
    clause rows.

    Rewritten with plain numpy (already imported by this file) instead of
    sklearn's cosine_similarity; zero-norm rows are guarded so they yield
    similarity 0 rather than a division-by-zero warning.
    """
    clauses = clauses.detach().numpy()
    documents = documents.detach().numpy()
    # Row-normalize both matrices, leaving zero rows as zeros.
    c_norm = np.linalg.norm(clauses, axis=1, keepdims=True)
    d_norm = np.linalg.norm(documents, axis=1, keepdims=True)
    c_unit = clauses / np.where(c_norm == 0, 1, c_norm)
    d_unit = documents / np.where(d_norm == 0, 1, d_norm)
    sim = c_unit @ d_unit.T          # shape: (n_clauses, n_documents)
    max_sim = np.max(sim, axis=0)    # best clause match per document row
    return max_sim
|
564 |
+
|
565 |
+
def fill_yellow(filename, output_file, highlighted_paras):
    """Highlight paragraphs containing any of the given sentences in yellow.

    For each sentence in *highlighted_paras*, the first paragraph whose text
    contains it has all of its runs yellow-highlighted. Saves the document
    to *output_file* and returns it.
    """
    doc = docx.Document(filename)
    for each in highlighted_paras:
        for para in doc.paragraphs:
            inline = para.runs
            colour = False
            if each in para.text:
                colour = True
            if colour:
                for i in range(len(inline)):
                    inline[i].font.highlight_color = WD_COLOR_INDEX.YELLOW
                # Only the first matching paragraph per sentence is highlighted.
                break
    doc.save(output_file)
    return output_file
|
579 |
+
|
580 |
+
|
581 |
+
def get_similar_clauses(filename, output_file,clauses, source_language):
    """Find and highlight contract sentences semantically similar to *clauses*.

    Splits the contract into sentences, embeds sentences and query clauses
    with indic-bert, scores cosine similarity, highlights the top ~10%
    (minimum 3) sentences in yellow, and returns
    (output_file, highlighted_paragraph_texts).
    """
    paras = []
    template_document = Document(filename)
    contract = []
    for paragraph in template_document.paragraphs:
        if(paragraph.text.strip()!=''):
            contract.append(paragraph.text)

    sentence_batch = []

    for paragraph in contract:
        sentence_batch.extend(split_sentences(paragraph, source_language))

    # Drop single-space fragments and very short sentences (<= 5 words).
    sentence_batch = [each for each in sentence_batch if each!=' ' and len(each.split())>5]

    doc_rep = obtain_rep(sentence_batch)
    clause_rep = obtain_rep(clauses)
    k = similarity(doc_rep, clause_rep)
    # Top 10% of sentences (at least 3) by best clause similarity.
    pick_top = max(int(0.1*len(sentence_batch)),3)
    ind = k.argsort()[-pick_top:][::-1]
    for each_idx in ind:
        paras.append(sentence_batch[each_idx])

    output_file = fill_yellow(filename, output_file, paras)
    highlighted_paras = get_highlighted_clauses(output_file)
    return output_file, highlighted_paras
|
607 |
+
|
608 |
+
|
609 |
+
# Demo shortcut table for similar-clause search: known sample documents ->
# list of {canned clause set, pre-highlighted output file} entries.
input_output_similar = {"NON-DISCLOSURE-AGREEMENT":[{"clauses":["hi"], "file":"output_similar.docx"},{"clauses":["bye","see you"], "file":"output.docx"}], "dummy.docx":[{"clauses":["lets see","whatever"],"file":"dummy_colored.docx"}]}
|
610 |
+
def get_highlighted_clauses(filename):
    """Return the text of every paragraph containing a yellow-highlighted run."""
    doc = docx.Document(filename)
    para_highlighted = []
    for para in doc.paragraphs:
        has_yellow = any(
            run.font.highlight_color == WD_COLOR_INDEX.YELLOW
            for run in para.runs
        )
        if has_yellow:
            para_highlighted.append(para.text)
    return para_highlighted
|
623 |
+
|
624 |
+
def run_similar_clause(filename, output_file, clauses, source_language):
    """Similar-clause search entry point, with demo shortcut.

    If the document's first paragraph matches a known demo key and the
    requested clauses overlap a canned clause set, a pre-highlighted file
    is returned. Otherwise the real similarity pipeline runs.
    Returns (output_file, highlighted_paragraph_texts).
    """
    print("similar clause")
    doc = docx.Document(filename)
    for doc_input in list(input_output_similar.keys()):
        if doc.paragraphs[0].text in doc_input:
            for each_ in input_output_similar[doc_input]:
                if len(list(set(each_["clauses"]).intersection(set(clauses))))>0 :
                    output_file = each_["file"]
                    time.sleep(3)  # simulate processing time for the demo shortcut
                    highlighted_paras = get_highlighted_clauses(output_file)
                    return output_file, highlighted_paras
        else:
            # NOTE(review): as reconstructed, this else is attached to the
            # per-key `if`, so the full pipeline runs (and returns) as soon as
            # one known key does NOT match — confirm a for/else was not intended.
            output_file, highlighted_paras = get_similar_clauses(filename, output_file,clauses, source_language)
            return output_file, highlighted_paras
|
638 |
+
|
639 |
+
|
640 |
+
import gradio as gr

# Canonical (English) names of the five services offered by the app.
analysis_services = ['Translate Contract', 'Identify key Clauses', 'Red flag Identification', 'Similar Semantic Clause search', 'Generate Questions for Contract Template']
analysis_label = 'Select Contract Analysis Service'
# Mutable UI state: current (possibly translated) choices and selections,
# rewritten by the change_analysis / change_inputs callbacks below.
analysis_choices = analysis_services
analysis_choice = ''
lang_choice = 'english'
# Input-widget labels (English; translated on the fly per UI language).
translation_label = 'Upload contract for Translation'
translation_src_label = 'Select language of uploaded contract'
translation_tgt_label = 'Select language to translate'
keyclause_label = 'Upload contract for Key Clause Extraction'
redflag_label = 'Upload contract for Red Flag Identification'
similar_label = 'Upload contract for Semantic Similar Clauses'
similar_clause_label = 'Enter clauses to be identified (enter one clause per line)'
generate_questions_label = 'Upload template contract for Question Generation'
delimiter_label = "Input placeholder (pattern or symbol used as blank in template)"
button_label = "Upload and Analyze"


# Output-widget labels.
translation_output_label = 'Download your translated contract'
keyclause_output_label = 'Download your key clauses from the contract'
redflag_output_label = 'Download your contract with red flags highlighted'
similar_file_label = 'Download your contract with highlighted similar clauses in yellow'
similar_text_label = 'A quick view of similar clauses'
qg_output_label = 'Download your template contract along with questions'
q_output_label = 'Download only questions to fill the template contract'
666 |
+
|
667 |
+
def change_analysis(choice):
    """Gradio callback: switch the UI language to *choice*.

    Re-translates the service names and selector label into the chosen
    language and hides all input widgets until a service is re-selected.
    """
    global lang_choice, analysis_choices
    lang_choice = choice
    analysis_choices = [translate_paragraph(paragraph, "english", choice) for paragraph in analysis_services]
    return [gr.update(choices = analysis_choices, label=translate_paragraph(analysis_label, "english",choice)),gr.update(visible=False),gr.update(visible=False),gr.update(visible=False),gr.update(visible=False),gr.update(visible=False),gr.update(visible=False),gr.update(visible=False),gr.update(visible=False)]
|
672 |
+
|
673 |
+
def change_inputs(choice):
    """Gradio callback: show/hide input widgets for the selected service.

    Records the selection in the module-level *analysis_choice* and returns
    one gr.update per wired component; the order of updates must match the
    component wiring defined elsewhere in the app.
    """
    global analysis_choice
    analysis_choice = choice
    if analysis_choice == analysis_choices[0]:
        # Translation: file upload + source and target language dropdowns.
        return [gr.update(visible=True, label = translate_paragraph(translation_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=True, label=''),gr.update(visible=False),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_tgt_label, "english",lang_choice)),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=False), gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]
    elif analysis_choice == analysis_choices[1]:
        # Key clauses: file upload + source language only.
        return [gr.update(visible=True, label = translate_paragraph(keyclause_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=True,label=''),gr.update(visible=False),gr.update(visible=False),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=False),gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]
    elif analysis_choice == analysis_choices[2]:
        # Red flags: file upload + source language only.
        return [gr.update(visible=True, label = translate_paragraph(redflag_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=True,label=''),gr.update(visible=False),gr.update(visible=False),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=False),gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]
    elif analysis_choice == analysis_choices[3]:
        # Similar clauses: file upload + clause textbox + extra output slots.
        return [gr.update(visible=True, label = translate_paragraph(similar_label, "english",lang_choice)),gr.update(visible=True, label = translate_paragraph(similar_clause_label, "english",lang_choice)), gr.update(visible=True,label=''),gr.update(visible=True,label=''),gr.update(visible=True,label=''),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=False),gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]
    elif analysis_choice == analysis_choices[4]:
        # Question generation: file upload + delimiter textbox.
        return [gr.update(visible=True, label = translate_paragraph(generate_questions_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=True,label=''),gr.update(visible=True,label=''),gr.update(visible=False),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=True, label= translate_paragraph(delimiter_label,"english",lang_choice)), gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]
|
686 |
+
|
687 |
+
def process_analysis(document_name, text, source_language, target_language, delimiter):
    """Run the currently selected analysis and return Gradio updates.

    Dispatches on the module-level global ``analysis_choice`` (set by the
    radio-button callback elsewhere in this file) and returns a list of three
    ``gr.update`` payloads for ``[output_file, output_file2, output_text]``.

    Parameters:
        document_name: uploaded file (passed through from ``gr.File``).
        text: contents of the free-text input box (used for similar-clause search).
        source_language / target_language: language dropdown selections.
        delimiter: delimiter string for question generation.

    NOTE(review): relies on module-level globals defined elsewhere in this file
    (``analysis_choice``, ``analysis_choices``, ``lang_choice``, the various
    ``*_label`` strings and the ``run_*`` / ``translate_*`` helpers) — confirm
    they are assigned before the button callback fires.  If ``analysis_choice``
    matches no branch the function implicitly returns ``None``.
    """
    if analysis_choice == analysis_choices[0]:
        # Full-document translation: emit a .docx named after the target language.
        translation_output = translate_fill(document_name, "translation_" + target_language + ".docx", source_language, target_language)
        return [gr.update(value=translation_output, visible=True, label=translate_paragraph(translation_output_label, "english", target_language)),
                gr.update(visible=False),
                gr.update(visible=False)]
    elif analysis_choice == analysis_choices[1]:
        # Key-clause extraction — plain-text report.
        info_output = run_key_clause(document_name, "key_clauses.txt", source_language)
        return [gr.update(value=info_output, visible=True, label=translate_paragraph(keyclause_output_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(visible=False)]
    elif analysis_choice == analysis_choices[2]:
        # Red-flag detection — .docx report.
        red_flag_output = run_redflags(document_name, "redflag.docx")
        return [gr.update(value=red_flag_output, visible=True, label=translate_paragraph(redflag_output_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(visible=False)]
    elif analysis_choice == analysis_choices[3]:
        # Similar-clause search: one query clause per line of the textbox.
        clauses = text.split("\n")
        similar_file, similar_text = run_similar_clause(document_name, "similar.docx", clauses, source_language)
        # Join the per-clause results with blank lines for the textbox output.
        similar_text = "\n\n\n".join(similar_text)
        return [gr.update(value=similar_file, visible=True, label=translate_paragraph(similar_file_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(value=similar_text, visible=True, label=translate_paragraph(similar_text_label, "english", lang_choice))]
    elif analysis_choice == analysis_choices[4]:
        # Question generation: template .docx plus a questions-only .txt.
        qg_output, q_output = run_generate_questions(document_name, "qsns_template.docx", "qsns_only.txt", delimiter, source_language)
        return [gr.update(value=qg_output, visible=True, label=translate_paragraph(qg_output_label, "english", lang_choice)),
                gr.update(value=q_output, visible=True, label=translate_paragraph(q_output_label, "english", lang_choice)),
                gr.update(visible=False)]
# --- Gradio UI wiring -------------------------------------------------------
# Reconstructed from a diff-mangled source: artifact lines removed and
# indentation restored.  Nesting of widgets inside Row/Column is inferred
# from the visible `with` statements — TODO confirm against the original
# layout intent.
with gr.Blocks() as demo:
    # Language selector: changing it re-translates every widget label via
    # the `change_analysis` callback below.
    lang_radio = gr.Radio(list(lang_dict.keys()), value='english', label="Select your language")
    # Analysis selector: which service (translate / key clauses / red flags /
    # similar clauses / question generation) to run.
    analysis_radio = gr.Radio(analysis_services, label=analysis_label)

    with gr.Row():
        # Document upload; hidden until an analysis is picked.
        input_file = gr.File(interactive=True, visible=False)
        with gr.Column():
            translation_source = gr.Dropdown(choices=list(lang_dict.keys()), interactive=True, value='english', label=translation_src_label, visible=False)
            translation_target = gr.Dropdown(choices=list(lang_dict.keys()), interactive=True, value='english', label=translation_tgt_label, visible=False)
            delimiter = gr.Textbox(label=delimiter_label, lines=1, interactive=True, visible=False)

    # Free-text input (used e.g. for similar-clause queries).
    input_text = gr.Textbox(lines=4, interactive=True, visible=False)

    button = gr.Button(value=button_label, visible=False)
    # Outputs: primary file, secondary file, and a text area — which of the
    # three is shown depends on the selected analysis (see process_analysis).
    output_file = gr.File(interactive=False, visible=False)
    output_file2 = gr.File(interactive=False, visible=False)
    output_text = gr.Textbox(interactive=False, visible=False)

    # Re-label/show widgets when the UI language changes.
    lang_radio.change(fn=change_analysis, inputs=lang_radio, outputs=[analysis_radio, input_file, input_text, output_file, output_file2, output_text, translation_target, translation_source, delimiter])
    # Show/hide the inputs appropriate to the chosen analysis.
    analysis_radio.change(fn=change_inputs, inputs=analysis_radio, outputs=[input_file, input_text, output_file, output_file2, output_text, translation_target, translation_source, delimiter, button])
    # Run the selected analysis on click.
    button.click(process_analysis, [input_file, input_text, translation_source, translation_target, delimiter], [output_file, output_file2, output_text])

demo.launch(debug=True)