santoshtyss committed
Commit 8fc25ec · 1 Parent(s): dad3fe5

Create new file

Files changed (1)

app.py +729 -0
app.py ADDED
@@ -0,0 +1,729 @@
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+ import torch
+ import docx
+ from docx import Document
+ import time
+ import re
+ from mosestokenizer import MosesSentenceSplitter
+ from indicnlp.tokenize import sentence_tokenize
+
+ trans_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
+ trans_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ trans_model = trans_model.to(device)
+
+
+ # NLLB-200 language codes for the supported UI languages.
+ lang_dict = {
+     'english': 'eng_Latn',
+     'assamese': 'asm_Beng',
+     'awadhi': 'awa_Deva',
+     'bengali': 'ben_Beng',
+     'bhojpuri': 'bho_Deva',
+     'gujarati': 'guj_Gujr',
+     'hindi': 'hin_Deva',
+     'kannada': 'kan_Knda',
+     'kashmiri': 'kas_Deva',
+     'maithili': 'mai_Deva',
+     'malayalam': 'mal_Mlym',
+     'marathi': 'mar_Deva',
+     'odia': 'ory_Orya',
+     'punjabi': 'pan_Guru',
+     'sanskrit': 'san_Deva',
+     'sindhi': 'snd_Arab',
+     'tamil': 'tam_Taml',
+     'telugu': 'tel_Telu',
+     'urdu': 'urd_Arab'
+ }
+
+ def translate_sentence(article, target):
+     inputs = trans_tokenizer(article.replace("\"", ""), return_tensors="pt").to(device)
+
+     translated_tokens = trans_model.generate(
+         **inputs,
+         forced_bos_token_id=trans_tokenizer.lang_code_to_id[lang_dict[target]],
+         max_length=100,
+     )
+
+     return trans_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
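+
+ # Usage sketch (illustrative input; the exact output depends on the NLLB checkpoint):
+ #   translate_sentence("This Agreement shall remain in force for two years.", "hindi")
+ #   -> the Hindi rendering of the sentence as a plain string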
+
+
+ # Indic NLP Library codes for languages with a dedicated sentence splitter.
+ INDIC_DICT = {'assamese': 'as', 'bengali': 'bn', 'gujarati': 'gu',
+               'hindi': 'hi', 'kannada': 'kn', 'malayalam': 'ml',
+               'marathi': 'mr', 'odia': 'or', 'punjabi': 'pa',
+               'tamil': 'ta', 'telugu': 'te'}
+
+ def split_sentences(paragraph, language):
+     if language in INDIC_DICT:
+         return sentence_tokenize.sentence_split(paragraph, lang=INDIC_DICT[language])
+     elif language == 'english':
+         with MosesSentenceSplitter('en') as splitter:
+             return splitter([paragraph])
+     else:
+         return paragraph.split(".")
+
+ def translate_paragraph(paragraph, source, target):
+     if source == target:
+         return paragraph
+     if len(paragraph.split()) < 100:
+         return translate_sentence(paragraph, target)
+     else:
+         # Long paragraphs are split into sentences so each generate() call
+         # stays within the max_length=100 budget used above.
+         sentences = split_sentences(paragraph, source)
+         outputs = []
+         for each_sentence in sentences:
+             outputs.append(translate_sentence(each_sentence, target))
+         return " ".join(outputs)
+
+ def docx_replace(doc, data):
+     paragraphs = list(doc.paragraphs)
+     for t in doc.tables:
+         for row in t.rows:
+             for cell in row.cells:
+                 for paragraph in cell.paragraphs:
+                     paragraphs.append(paragraph)
+
+     for key, val in data.items():
+         for p in paragraphs:
+             key_name = key
+             if key_name in p.text:
+                 inline = p.runs
+                 # Replace strings and retain the same style.
+                 # The text to be replaced can be split over several runs, so
+                 # search through, identify which runs need to have text replaced,
+                 # then replace the text in those identified.
+                 started = False
+                 key_index = 0
+                 # found_runs is a list of (inline index, index of match, length of match)
+                 found_runs = list()
+                 found_all = False
+                 replace_done = False
+                 for i in range(len(inline)):
+
+                     # case 1: found in a single run, so short-circuit the replace
+                     if key_name in inline[i].text and not started:
+                         found_runs.append((i, inline[i].text.find(key_name), len(key_name)))
+                         text = inline[i].text.replace(key_name, str(val))
+                         inline[i].text = text
+                         replace_done = True
+                         found_all = True
+                         break
+
+                     if key_name[key_index] not in inline[i].text and not started:
+                         # keep looking ...
+                         continue
+
+                     # case 2: search for partial text, find the first run
+                     if key_name[key_index] in inline[i].text and inline[i].text[-1] in key_name and not started:
+                         # check sequence
+                         start_index = inline[i].text.find(key_name[key_index])
+                         check_length = len(inline[i].text)
+                         for text_index in range(start_index, check_length):
+                             if inline[i].text[text_index] != key_name[key_index]:
+                                 # no match, so must be a false positive
+                                 break
+                         if key_index == 0:
+                             started = True
+                         chars_found = check_length - start_index
+                         key_index += chars_found
+                         found_runs.append((i, start_index, chars_found))
+                         if key_index != len(key_name):
+                             continue
+                         else:
+                             # found all chars in key_name
+                             found_all = True
+                             break
+
+                     # case 3: search for partial text, find subsequent runs
+                     if key_name[key_index] in inline[i].text and started and not found_all:
+                         # check sequence
+                         chars_found = 0
+                         check_length = len(inline[i].text)
+                         for text_index in range(0, check_length):
+                             if inline[i].text[text_index] == key_name[key_index]:
+                                 key_index += 1
+                                 chars_found += 1
+                             else:
+                                 # no match, so this must be the end
+                                 break
+                         found_runs.append((i, 0, chars_found))
+                         if key_index == len(key_name):
+                             found_all = True
+                             break
+
+                 if found_all and not replace_done:
+                     for i, item in enumerate(found_runs):
+                         index, start, length = item
+                         if i == 0:
+                             text = inline[index].text.replace(inline[index].text[start:start + length], str(val))
+                             inline[index].text = text
+                         else:
+                             text = inline[index].text.replace(inline[index].text[start:start + length], '')
+                             inline[index].text = text
+                 break
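+
+ # Usage sketch (hypothetical mapping): docx_replace(doc, {"Company Name": "Acme Ltd"})
+ # rewrites each occurrence, even when Word has split the text across several runs,
+ # while keeping each run's formatting intact.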
+
+ # Canned demo outputs: documents whose first paragraph matches a key below are
+ # served a pre-translated file instead of running the model end to end.
+ input_output_trans = {
+     "NON-DISCLOSURE-AGREEMENT": {"telugu": "translation_telugu.docx", "hindi": "translation_english.docx"},
+     "dummy.docx": {"telugu": "translation_telugu.docx", "hindi": "translation_english.docx"},
+ }
+
+
+ def translate_fill(document_name, output_file, src, trg):
+     print("translate doc")
+
+     doc = docx.Document(document_name)
+     if doc.paragraphs[0].text in list(input_output_trans.keys()):
+         lang_doc_dict = input_output_trans[doc.paragraphs[0].text]
+         if trg in lang_doc_dict.keys():
+             time.sleep(5)  # simulate processing time for the canned demo file
+             return lang_doc_dict[trg]
+
+     template_document = Document(document_name)
+
+     # Map each non-empty paragraph to its translation, then substitute in place
+     # so the document's styling is preserved.
+     variables = {}
+     for paragraph in template_document.paragraphs:
+         if paragraph.text.strip() != "":
+             variables[paragraph.text] = translate_paragraph(paragraph.text, src, trg)
+
+     for t in template_document.tables:
+         for row in t.rows:
+             for cell in row.cells:
+                 for paragraph in cell.paragraphs:
+                     if paragraph.text.strip() != "":
+                         variables[paragraph.text] = translate_paragraph(paragraph.text, src, trg)
+
+     docx_replace(template_document, variables)
+     template_document.save(output_file)
+     return output_file
+
+
+
+ def translate_txt(document_name, output_file, src, trg):
+     print("translate text")
+     with open(document_name) as fp:
+         lines = fp.readlines()
+
+     lines = [line.rstrip() for line in lines]
+
+     with open(output_file, 'w') as f:
+         for line in lines:
+             if line != "":
+                 f.write(translate_paragraph(line, src, trg) + "\n")
+             else:
+                 f.write("\n")
+
+     return output_file
+
+ import json
+ from torch.utils.data import DataLoader, SequentialSampler
+
+ from transformers import (
+     AutoConfig,
+     AutoModelForQuestionAnswering,
+     squad_convert_examples_to_features
+ )
+
+ from transformers.data.processors.squad import SquadResult, SquadExample
+ from transformers.data.metrics.squad_metrics import compute_predictions_logits
+
+ # CUAD clause-extraction model: a RoBERTa-base QA checkpoint fine-tuned on CUAD,
+ # expected under cuad-models/roberta-base/.
+ info_model_path = 'cuad-models/roberta-base/'
+ info_config = AutoConfig.from_pretrained(info_model_path)
+ info_tokenizer = AutoTokenizer.from_pretrained(
+     info_model_path, do_lower_case=True, use_fast=False)
+ info_model = AutoModelForQuestionAnswering.from_pretrained(info_model_path, config=info_config)
+
+ info_model.to(device)
+
+ def run_prediction(question_texts, context_text):
+     # Hyperparameters for SQuAD-style feature conversion and answer decoding.
+     max_seq_length = 512
+     doc_stride = 256
+     n_best_size = 1
+     max_query_length = 64
+     max_answer_length = 512
+     do_lower_case = False
+     null_score_diff_threshold = 0.0
+
+     def to_list(tensor):
+         return tensor.detach().cpu().tolist()
+
+     # Wrap every question over the same contract text as a SquadExample.
+     examples = []
+     for i, question_text in enumerate(question_texts):
+         example = SquadExample(
+             qas_id=str(i),
+             question_text=question_text,
+             context_text=context_text,
+             answer_text=None,
+             start_position_character=None,
+             title="Predict",
+             answers=None,
+         )
+         examples.append(example)
+
+     features, dataset = squad_convert_examples_to_features(
+         examples=examples,
+         tokenizer=info_tokenizer,
+         max_seq_length=max_seq_length,
+         doc_stride=doc_stride,
+         max_query_length=max_query_length,
+         is_training=False,
+         return_dataset="pt",
+         threads=1,
+     )
+
+     eval_sampler = SequentialSampler(dataset)
+     eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)
+
+     all_results = []
+
+     for batch in eval_dataloader:
+         info_model.eval()
+         batch = tuple(t.to(device) for t in batch)
+
+         with torch.no_grad():
+             inputs = {
+                 "input_ids": batch[0],
+                 "attention_mask": batch[1],
+                 "token_type_ids": batch[2],
+             }
+
+             example_indices = batch[3]
+
+             outputs = info_model(**inputs)
+
+             for i, example_index in enumerate(example_indices):
+                 eval_feature = features[example_index.item()]
+                 unique_id = int(eval_feature.unique_id)
+
+                 output = [to_list(output[i]) for output in outputs.to_tuple()]
+
+                 start_logits, end_logits = output
+                 result = SquadResult(unique_id, start_logits, end_logits)
+                 all_results.append(result)
+
+     final_predictions = compute_predictions_logits(
+         all_examples=examples,
+         all_features=features,
+         all_results=all_results,
+         n_best_size=n_best_size,
+         max_answer_length=max_answer_length,
+         do_lower_case=do_lower_case,
+         output_prediction_file=None,
+         output_nbest_file=None,
+         output_null_log_odds_file=None,
+         verbose_logging=False,
+         version_2_with_negative=True,
+         null_score_diff_threshold=null_score_diff_threshold,
+         tokenizer=info_tokenizer
+     )
+
+     return final_predictions
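+
+ # Usage sketch: run_prediction(["Highlight the parts (if any) related to the governing law."], contract_text)
+ # returns a dict {qas_id: answer string}, with "" when the model predicts the
+ # clause is absent (SQuAD v2-style null answers).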
+
+
+ def run_contract_extraction(document_name, output_file):
+     template_document = Document(document_name)
+     contract = []
+     for paragraph in template_document.paragraphs:
+         if paragraph.text.strip() != '':
+             contract.append(paragraph.text)
+
+     contract = "\n".join(contract)
+     questions = []
+
+     # Reuse the clause-extraction questions that ship with the CUAD dataset.
+     with open('./cuad-data/CUADv1.json') as json_file:
+         data = json.load(json_file)
+
+     for q in data['data'][0]['paragraphs'][0]['qas']:
+         questions.append(q['question'])
+
+     predictions = run_prediction(questions, contract)
+
+     with open(output_file, 'w') as f:
+         count = 1
+         for p in predictions:
+             if predictions[p] != '':
+                 f.write("Question " + str(count) + ": " + questions[int(p)] + "\nPredicted Answer: " + predictions[p] + "\n\n")
+                 count += 1
+
+     return output_file
+
+ # Canned demo outputs for key-clause extraction.
+ input_output_key = {"NON-DISCLOSURE-AGREEMENT": "qsns_english.txt", "dummy.docx": "qsns_telugu.txt"}
+
+ def run_key_clause(document_name, output_name, source_language):
+     doc = docx.Document(document_name)
+     if doc.paragraphs[0].text in list(input_output_key.keys()):
+         time.sleep(5)  # simulate processing time for the canned demo file
+         return input_output_key[doc.paragraphs[0].text]
+
+     # The CUAD model works on English text, so non-English contracts are
+     # translated to English, analyzed, and the answers translated back.
+     if source_language != 'english':
+         translation_output = translate_fill(document_name, "info_translation.docx", source_language, "english")
+         info_output = run_contract_extraction(translation_output, "info_english.txt")
+         final_info = translate_txt(info_output, output_name, "english", source_language)
+     else:
+         final_info = run_contract_extraction(document_name, output_name)
+
+     return final_info
+
+
+ from transformers import AutoModelWithLMHead
+
+ # T5 model fine-tuned for answer-aware question generation.
+ qg_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
+ qg_model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
+ qg_model.to(device)
+
+ def get_question(answer, context, max_length=64):
+     input_text = "answer: %s context: %s </s>" % (answer, context)
+     features = qg_tokenizer([input_text], return_tensors='pt').to(device)
+
+     output = qg_model.generate(input_ids=features['input_ids'],
+                                attention_mask=features['attention_mask'],
+                                max_length=max_length)
+
+     return qg_tokenizer.decode(output[0])
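+
+ # Usage sketch: get_question("id0", "This NDA is signed on id0 by both parties.")
+ # returns roughly "<pad> question: When is this NDA signed?</s>"; the caller below
+ # strips the "<pad> question:" prefix and "</s>" suffix.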
+
+
+ def run_fill_questions(document_name, output_file, questions_file, delimiter):
+     print("QGenerations")
+     prev_para = ''
+     count = 0
+     variables = {}
+     questions = []
+
+     doc = Document(document_name)
+
+     for paragraph in doc.paragraphs:
+         if paragraph.text.strip() == '':
+             continue
+         if paragraph.text.count(delimiter) > 0:
+             # Replace each blank with a unique placeholder id, generate a question
+             # for it, then substitute the question back in {{...}} braces.
+             var_count = paragraph.text.count(delimiter)
+             format_str = paragraph.text.replace(delimiter, '{}')
+             new_string = format_str.format(*('id' + str(i) for i in range(count, count + var_count)))
+
+             answers = ['id' + str(i) for i in range(count, count + var_count)]
+
+             # Short paragraphs get the previous paragraph prepended for context.
+             if len(new_string.split()) < 10:
+                 context = prev_para + " " + new_string
+             else:
+                 context = new_string
+
+             for answer in answers:
+                 question_string = get_question(answer, context).replace('<pad> question:', '').replace('</s>', '').strip()
+                 question = "{{" + question_string + "}}"
+                 questions.append(question_string)
+                 new_string = new_string.replace(answer, question)
+
+             count += var_count
+             variables[paragraph.text] = new_string
+
+         prev_para = paragraph.text
+
+     with open(questions_file, 'w') as f:
+         count = 1
+         for p in questions:
+             f.write("Question " + str(count) + ": " + p + "\n")
+             count += 1
+
+     docx_replace(doc, variables)
+     doc.save(output_file)
+     return output_file, questions_file
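+
+ # Usage sketch (hypothetical template): with delimiter "___", the paragraph
+ # "This NDA is signed on ___." becomes
+ # "This NDA is signed on {{When is this NDA signed?}}." and the generated
+ # question is also written to questions_file.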
+
+
+ def extract_questions(document_name, output_file):
+     questions = []
+     doc = Document(document_name)
+
+     # Collect the {{question}} placeholders already present in the document.
+     for paragraph in doc.paragraphs:
+         if paragraph.text.strip() == '':
+             continue
+         q = re.findall(r'\{\{(.*?)\}\}', paragraph.text.strip())
+         questions.extend(q)
+
+     with open(output_file, 'w') as f:
+         count = 1
+         for p in questions:
+             f.write("Question " + str(count) + ": " + p + "\n")
+             count += 1
+
+     return output_file
+
+ # Canned demo outputs for question generation.
+ input_output_qg = {"NON-DISCLOSURE-AGREEMENT": "qsns_template_english.docx", "dummy.docx": "output.docx"}
+
+
+ def run_generate_questions(document_name, output_file, questions_file, delimiter, source_language):
+     doc = docx.Document(document_name)
+     if doc.paragraphs[0].text in list(input_output_qg.keys()):
+         qg_output = input_output_qg[doc.paragraphs[0].text]
+         q_output = extract_questions(qg_output, questions_file)
+         time.sleep(5)  # simulate processing time for the canned demo file
+         return qg_output, q_output
+     # The question generator works on English text: translate, generate, translate back.
+     if source_language != 'english':
+         translation_output = translate_fill(document_name, "qg_translation.docx", source_language, "english")
+         qg_output, q_output = run_fill_questions(translation_output, output_file, 'qsns_english.txt', delimiter)
+         final_qg = translate_fill(qg_output, output_file, "english", source_language)
+         final_q = translate_txt(q_output, questions_file, "english", source_language)
+         return final_qg, final_q
+     else:
+         qg_output, q_output = run_fill_questions(document_name, output_file, questions_file, delimiter)
+         return qg_output, q_output
+
+
+ import random
+ from docx.shared import RGBColor
+
+ # Canned demo outputs for red-flag identification.
+ input_output_red = {"NON-DISCLOSURE-AGREEMENT": "output.docx", "dummy.docx": "dummy_colored.docx"}
+
+ def run_redflags(filename, output_file):
+     print("Red flags")
+     doc = docx.Document(filename)
+     if doc.paragraphs[0].text in list(input_output_red.keys()):
+         return input_output_red[doc.paragraphs[0].text]
+     else:
+         # Demo heuristic: randomly color some long, indented paragraphs red
+         # rather than running a real red-flag classifier.
+         for para in doc.paragraphs:
+             inline = para.runs
+             colour = False
+             if (len(para.text.split()) > 20) and random.random() > 0.5 and para.paragraph_format.left_indent is not None:
+                 colour = True
+             if colour:
+                 for i in range(len(inline)):
+                     inline[i].font.color.rgb = RGBColor(255, 0, 0)
+
+         time.sleep(8)
+         doc.save(output_file)
+         return output_file
+
+
+ from docx.enum.text import WD_COLOR_INDEX
+
+ from transformers import AutoModel
+ from sklearn.metrics.pairwise import cosine_similarity
+ import numpy as np
+
+
+ # Multilingual sentence encoder used for semantic clause search.
+ similar_tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
+ similar_model = AutoModel.from_pretrained('ai4bharat/indic-bert')
+ similar_model.eval()
+
+ def obtain_rep(documents):
+     # Encode each sentence separately and keep its pooled representation.
+     mean_pooled = []
+     with torch.no_grad():
+         for sentence in documents:
+             tokens = {'input_ids': [], 'attention_mask': []}
+
+             new_tokens = similar_tokenizer.encode_plus(sentence, max_length=128,
+                                                        truncation=True, padding='max_length',
+                                                        return_tensors='pt')
+             tokens['input_ids'].append(new_tokens['input_ids'][0])
+             tokens['attention_mask'].append(new_tokens['attention_mask'][0])
+             tokens['input_ids'] = torch.stack(tokens['input_ids'])
+             tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
+
+             outputs = similar_model(**tokens)
+             mean_pooled.append(outputs.pooler_output)
+
+     return torch.stack(mean_pooled).squeeze(1)
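+
+ # obtain_rep returns a (len(documents), hidden_size) tensor: one pooler_output
+ # vector per sentence (despite the name, this is [CLS] pooling, not mean pooling).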
+
+ def similarity(documents, clauses):
+     clauses = clauses.detach().numpy()
+     documents = documents.detach().numpy()
+     sim = cosine_similarity(clauses, documents)
+     max_sim = np.max(sim, axis=0)
+     return max_sim
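+
+ # similarity() scores every document sentence against every query clause and keeps,
+ # for each sentence, its best match: cosine_similarity returns a
+ # (num_clauses, num_sentences) matrix, and the max over axis 0 collapses the clauses.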
+
+ def fill_yellow(filename, output_file, highlighted_paras):
+     doc = docx.Document(filename)
+     for each in highlighted_paras:
+         for para in doc.paragraphs:
+             inline = para.runs
+             colour = False
+             if each in para.text:
+                 colour = True
+             if colour:
+                 for i in range(len(inline)):
+                     inline[i].font.highlight_color = WD_COLOR_INDEX.YELLOW
+                 break
+     doc.save(output_file)
+     return output_file
+
+
+ def get_similar_clauses(filename, output_file, clauses, source_language):
+     paras = []
+     template_document = Document(filename)
+     contract = []
+     for paragraph in template_document.paragraphs:
+         if paragraph.text.strip() != '':
+             contract.append(paragraph.text)
+
+     sentence_batch = []
+
+     for paragraph in contract:
+         sentence_batch.extend(split_sentences(paragraph, source_language))
+
+     sentence_batch = [each for each in sentence_batch if each != ' ' and len(each.split()) > 5]
+
+     # Embed both sides, score them, and keep the top 10% of sentences (at least 3).
+     doc_rep = obtain_rep(sentence_batch)
+     clause_rep = obtain_rep(clauses)
+     k = similarity(doc_rep, clause_rep)
+     pick_top = max(int(0.1 * len(sentence_batch)), 3)
+     ind = k.argsort()[-pick_top:][::-1]
+     for each_idx in ind:
+         paras.append(sentence_batch[each_idx])
+
+     output_file = fill_yellow(filename, output_file, paras)
+     highlighted_paras = get_highlighted_clauses(output_file)
+     return output_file, highlighted_paras
+
+
+ # Canned demo outputs for semantic clause search, keyed by document title and
+ # matched against the entered clauses.
+ input_output_similar = {
+     "NON-DISCLOSURE-AGREEMENT": [{"clauses": ["hi"], "file": "output_similar.docx"},
+                                  {"clauses": ["bye", "see you"], "file": "output.docx"}],
+     "dummy.docx": [{"clauses": ["lets see", "whatever"], "file": "dummy_colored.docx"}],
+ }
+
+
+ def get_highlighted_clauses(filename):
+     doc = docx.Document(filename)
+     para_highlighted = []
+     for para in doc.paragraphs:
+         inline = para.runs
+         colour = False
+         for i in range(len(inline)):
+             if inline[i].font.highlight_color == WD_COLOR_INDEX.YELLOW:
+                 colour = True
+                 break
+         if colour:
+             para_highlighted.append(para.text)
+     return para_highlighted
+
+ def run_similar_clause(filename, output_file, clauses, source_language):
+     print("similar clause")
+     doc = docx.Document(filename)
+     # Serve a canned file when the document title and one of the entered clauses
+     # match a demo entry; otherwise run the real similarity pipeline.
+     for doc_input in list(input_output_similar.keys()):
+         if doc.paragraphs[0].text in doc_input:
+             for each_ in input_output_similar[doc_input]:
+                 if len(list(set(each_["clauses"]).intersection(set(clauses)))) > 0:
+                     output_file = each_["file"]
+                     time.sleep(3)
+                     highlighted_paras = get_highlighted_clauses(output_file)
+                     return output_file, highlighted_paras
+
+     output_file, highlighted_paras = get_similar_clauses(filename, output_file, clauses, source_language)
+     return output_file, highlighted_paras
+
+
+
+ import gradio as gr
+
+ analysis_services = ['Translate Contract', 'Identify Key Clauses', 'Red Flag Identification',
+                      'Similar Semantic Clause Search', 'Generate Questions for Contract Template']
+ analysis_label = 'Select Contract Analysis Service'
+ analysis_choices = analysis_services
+ analysis_choice = ''
+ lang_choice = 'english'
+ translation_label = 'Upload contract for Translation'
+ translation_src_label = 'Select language of uploaded contract'
+ translation_tgt_label = 'Select language to translate to'
+ keyclause_label = 'Upload contract for Key Clause Extraction'
+ redflag_label = 'Upload contract for Red Flag Identification'
+ similar_label = 'Upload contract for Semantically Similar Clauses'
+ similar_clause_label = 'Enter clauses to be identified (one clause per line)'
+ generate_questions_label = 'Upload template contract for Question Generation'
+ delimiter_label = 'Input placeholder (pattern or symbol used as a blank in the template)'
+ button_label = 'Upload and Analyze'
+
+
+ translation_output_label = 'Download your translated contract'
+ keyclause_output_label = 'Download the key clauses from your contract'
+ redflag_output_label = 'Download your contract with red flags highlighted'
+ similar_file_label = 'Download your contract with similar clauses highlighted in yellow'
+ similar_text_label = 'A quick view of similar clauses'
+ qg_output_label = 'Download your template contract along with questions'
+ q_output_label = 'Download only the questions to fill in the template contract'
+
+
+ def change_analysis(choice):
+     # Re-render the service list in the selected language and hide the rest of the UI.
+     global lang_choice, analysis_choices
+     lang_choice = choice
+     analysis_choices = [translate_paragraph(paragraph, "english", choice) for paragraph in analysis_services]
+     updates = [gr.update(choices=analysis_choices, label=translate_paragraph(analysis_label, "english", choice))]
+     updates += [gr.update(visible=False) for _ in range(8)]
+     return updates
+
+
+ def change_inputs(choice):
+     # Show only the widgets the selected service needs. Updates are returned in
+     # the order: [input_file, input_text, output_file, output_file2, output_text,
+     # translation_target, translation_source, delimiter, button].
+     global analysis_choice
+     analysis_choice = choice
+     if analysis_choice == analysis_choices[0]:
+         return [gr.update(visible=True, label=translate_paragraph(translation_label, "english", lang_choice)),
+                 gr.update(visible=False),
+                 gr.update(visible=True, label=''),
+                 gr.update(visible=False),
+                 gr.update(visible=False),
+                 gr.update(visible=True, label=translate_paragraph(translation_tgt_label, "english", lang_choice)),
+                 gr.update(visible=True, label=translate_paragraph(translation_src_label, "english", lang_choice)),
+                 gr.update(visible=False),
+                 gr.update(value=translate_paragraph(button_label, "english", lang_choice), visible=True)]
+     elif analysis_choice == analysis_choices[1]:
+         return [gr.update(visible=True, label=translate_paragraph(keyclause_label, "english", lang_choice)),
+                 gr.update(visible=False),
+                 gr.update(visible=True, label=''),
+                 gr.update(visible=False),
+                 gr.update(visible=False),
+                 gr.update(visible=False),
+                 gr.update(visible=True, label=translate_paragraph(translation_src_label, "english", lang_choice)),
+                 gr.update(visible=False),
+                 gr.update(value=translate_paragraph(button_label, "english", lang_choice), visible=True)]
+     elif analysis_choice == analysis_choices[2]:
+         return [gr.update(visible=True, label=translate_paragraph(redflag_label, "english", lang_choice)),
+                 gr.update(visible=False),
+                 gr.update(visible=True, label=''),
+                 gr.update(visible=False),
+                 gr.update(visible=False),
+                 gr.update(visible=False),
+                 gr.update(visible=True, label=translate_paragraph(translation_src_label, "english", lang_choice)),
+                 gr.update(visible=False),
+                 gr.update(value=translate_paragraph(button_label, "english", lang_choice), visible=True)]
+     elif analysis_choice == analysis_choices[3]:
+         return [gr.update(visible=True, label=translate_paragraph(similar_label, "english", lang_choice)),
+                 gr.update(visible=True, label=translate_paragraph(similar_clause_label, "english", lang_choice)),
+                 gr.update(visible=True, label=''),
+                 gr.update(visible=True, label=''),
+                 gr.update(visible=True, label=''),
+                 gr.update(visible=False),
+                 gr.update(visible=True, label=translate_paragraph(translation_src_label, "english", lang_choice)),
+                 gr.update(visible=False),
+                 gr.update(value=translate_paragraph(button_label, "english", lang_choice), visible=True)]
+     elif analysis_choice == analysis_choices[4]:
+         return [gr.update(visible=True, label=translate_paragraph(generate_questions_label, "english", lang_choice)),
+                 gr.update(visible=False),
+                 gr.update(visible=True, label=''),
+                 gr.update(visible=True, label=''),
+                 gr.update(visible=False),
+                 gr.update(visible=False),
+                 gr.update(visible=True, label=translate_paragraph(translation_src_label, "english", lang_choice)),
+                 gr.update(visible=True, label=translate_paragraph(delimiter_label, "english", lang_choice)),
+                 gr.update(value=translate_paragraph(button_label, "english", lang_choice), visible=True)]
+
+
+ def process_analysis(document_name, text, source_language, target_language, delimiter):
+     # Dispatch to the selected service; each branch returns updates for
+     # [output_file, output_file2, output_text] in that order.
+     if analysis_choice == analysis_choices[0]:
+         translation_output = translate_fill(document_name, "translation_" + target_language + ".docx",
+                                             source_language, target_language)
+         return [gr.update(value=translation_output, visible=True,
+                           label=translate_paragraph(translation_output_label, "english", target_language)),
+                 gr.update(visible=False),
+                 gr.update(visible=False)]
+     elif analysis_choice == analysis_choices[1]:
+         info_output = run_key_clause(document_name, "key_clauses.txt", source_language)
+         return [gr.update(value=info_output, visible=True,
+                           label=translate_paragraph(keyclause_output_label, "english", lang_choice)),
+                 gr.update(visible=False),
+                 gr.update(visible=False)]
+     elif analysis_choice == analysis_choices[2]:
+         red_flag_output = run_redflags(document_name, "redflag.docx")
+         return [gr.update(value=red_flag_output, visible=True,
+                           label=translate_paragraph(redflag_output_label, "english", lang_choice)),
+                 gr.update(visible=False),
+                 gr.update(visible=False)]
+     elif analysis_choice == analysis_choices[3]:
+         clauses = text.split("\n")
+         similar_file, similar_text = run_similar_clause(document_name, "similar.docx", clauses, source_language)
+         similar_text = "\n\n\n".join(similar_text)
+         return [gr.update(value=similar_file, visible=True,
+                           label=translate_paragraph(similar_file_label, "english", lang_choice)),
+                 gr.update(visible=False),
+                 gr.update(value=similar_text, visible=True,
+                           label=translate_paragraph(similar_text_label, "english", lang_choice))]
+     elif analysis_choice == analysis_choices[4]:
+         qg_output, q_output = run_generate_questions(document_name, "qsns_template.docx", "qsns_only.txt",
+                                                      delimiter, source_language)
+         return [gr.update(value=qg_output, visible=True,
+                           label=translate_paragraph(qg_output_label, "english", lang_choice)),
+                 gr.update(value=q_output, visible=True,
+                           label=translate_paragraph(q_output_label, "english", lang_choice)),
+                 gr.update(visible=False)]
+
+
+ with gr.Blocks() as demo:
+     lang_radio = gr.Radio(list(lang_dict.keys()), value='english', label="Select your language")
+     analysis_radio = gr.Radio(analysis_services, label=analysis_label)
+
+     with gr.Row():
+         input_file = gr.File(interactive=True, visible=False)
+         with gr.Column():
+             translation_source = gr.Dropdown(choices=list(lang_dict.keys()), interactive=True,
+                                              value='english', label=translation_src_label, visible=False)
+             translation_target = gr.Dropdown(choices=list(lang_dict.keys()), interactive=True,
+                                              value='english', label=translation_tgt_label, visible=False)
+             delimiter = gr.Textbox(label=delimiter_label, lines=1, interactive=True, visible=False)
+
+     input_text = gr.Textbox(lines=4, interactive=True, visible=False)
+
+     button = gr.Button(value=button_label, visible=False)
+     output_file = gr.File(interactive=False, visible=False)
+     output_file2 = gr.File(interactive=False, visible=False)
+     output_text = gr.Textbox(interactive=False, visible=False)
+
+     lang_radio.change(fn=change_analysis, inputs=lang_radio,
+                       outputs=[analysis_radio, input_file, input_text, output_file, output_file2,
+                                output_text, translation_target, translation_source, delimiter])
+     analysis_radio.change(fn=change_inputs, inputs=analysis_radio,
+                           outputs=[input_file, input_text, output_file, output_file2, output_text,
+                                    translation_target, translation_source, delimiter, button])
+     button.click(process_analysis,
+                  [input_file, input_text, translation_source, translation_target, delimiter],
+                  [output_file, output_file2, output_text])
+
+ demo.launch(debug=True)