# Source: contract-review / app.py (Heiko Hotz, initial commit 5ad0224, 5.05 kB)
import collections
import math
import re
import string
import streamlit as st
from transformers.pipelines import pipeline
import json
import sys
from predict import run_prediction
import random
from io import StringIO
import requests
import boto3
from transformers import (
AutoConfig,
AutoModelForQuestionAnswering,
AutoTokenizer,
squad_convert_examples_to_features
)
from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample
from transformers.data.metrics.squad_metrics import compute_predictions_logits
import gradio as gr
import json
import torch
import time
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# Configure the Streamlit page to use the "wide" layout.
st.set_page_config(layout="wide")
@st.cache(show_spinner=False, persist=True)
def load_questions():
    """Load the selectable queries from ``data/questions.txt``.

    The result is cached by Streamlit so the file is not re-read on every
    script rerun.

    Returns:
        list[str]: One query per non-blank line of the file, with the line
        terminator and surrounding whitespace removed.
    """
    with open('data/questions.txt') as f:
        # Strip line terminators so they end up neither in the dropdown
        # labels nor in the text passed to the model; skip blank lines so
        # an empty query can never be selected.
        return [line.strip() for line in f if line.strip()]
@st.cache(show_spinner=False, persist=True)
def load_contracts():
    """Load the contract texts from ``data/test.json``.

    The JSON document is expected to have a top-level ``data`` list; for each
    entry the ``context`` of its first paragraph
    (``entry['paragraphs'][0]['context']``) is taken as the contract text.
    The result is cached by Streamlit.

    Returns:
        list[str]: One contract per entry, with every run of whitespace
        collapsed to a single space.
    """
    with open('data/test.json') as json_file:
        data = json.load(json_file)
    # split()/join normalises newlines, tabs and repeated spaces.
    return [
        ' '.join(entry['paragraphs'][0]['context'].split())
        for entry in data['data']
    ]
# Queries offered in the main-page dropdown.
questions = load_questions()
# contracts = load_contracts()

### DEFINE SIDEBAR
st.sidebar.title("Interactive Contract Analysis")

st.sidebar.markdown(
"""
Process text with [Huggingface](https://huggingface.co) models and visualize the results.
This model uses a pretrained snapshot trained on the [Atticus](https://www.atticusprojectai.org/) Dataset - CUAD
"""
)

st.sidebar.header("Contract Selection")

# select contract
# contracts_drop (labels shown to the user) and contracts_files (file names
# under data/) are parallel lists: the label at position k belongs to the
# file at position k.
contracts_drop = ['contract 1', 'contract 2', 'contract 3']
contracts_files = ['contract-1.txt', 'contract-2.txt', 'contract-3.txt']
contract = st.sidebar.selectbox('Please Select a Contract', contracts_drop)

# Use the label's own position to pick the file. Indexing with idx - 1 would
# wrap around for the first entry and shift every other selection by one.
idx = contracts_drop.index(contract)
with open('data/' + contracts_files[idx]) as f:
    contract_data = f.read()
# upload contract
user_upload = st.sidebar.file_uploader(
    'Please upload your own',
    type=['docx', 'pdf', 'txt'],
    accept_multiple_files=False,
)
print(user_upload)

# process upload
# A successfully parsed upload replaces contract_data (the sample contract
# selected above). If parsing fails, a warning is shown and contract_data is
# left untouched, so the page never ends up with a half-extracted document.
if user_upload is not None:
    print(user_upload.name, user_upload.type)
    extension = user_upload.name.split('.')[-1].lower()
    if extension == 'txt':
        print('text file uploaded')
        try:
            contract_data = user_upload.getvalue().decode("utf-8")
        except UnicodeDecodeError:
            st.warning('Unable to read text file, please make sure it is UTF-8 encoded')
    elif extension == 'pdf':
        import PyPDF4
        try:
            # Extracting Text from PDFs
            pdfReader = PyPDF4.PdfFileReader(user_upload)
            print(pdfReader.numPages)
            page_texts = [
                pdfReader.getPage(page_no).extractText()
                for page_no in range(pdfReader.numPages)
            ]
        except Exception:
            # PDF parsing can fail in many library-specific ways; report it to
            # the user instead of crashing, but let KeyboardInterrupt and
            # SystemExit propagate (a bare ``except:`` would swallow them).
            st.warning('Unable to read PDF, please try another file')
        else:
            # Only commit the text once every page has been extracted.
            contract_data = ''.join(page_texts)
    elif extension == 'docx':
        import docx2txt
        try:
            contract_data = docx2txt.process(user_upload)
        except Exception:
            # Same policy as for PDFs: warn and keep the current contract.
            st.warning('Unable to read DOCX, please try another file')
    else:
        st.warning('Unknown uploaded file type, please try again')
# Number of candidate answers to request from the model (sidebar control).
results_drop = ['1', '2', '3']
number_results = st.sidebar.selectbox('Select number of results', results_drop)

### DEFINE MAIN PAGE
st.header("Legal Contract Review Demo")
st.write("This demo uses the CUAD dataset for Contract Understanding.")

# The contract text is editable; whatever is in the text area is analysed.
paragraph = st.text_area(label="Contract", value=contract_data, height=400)
question = st.selectbox('Choose one of the 41 queries from the CUAD dataset:', questions)

if st.button('Analyze'):
    # Truthiness also covers question being None, which would make a
    # len() check raise.
    if paragraph and question:
        print('getting predictions')
        with st.spinner(text='Analysis in progress...'):
            data = {'question': [question], 'context': paragraph}
            print(data)
            predictions = run_prediction(
                data['question'],
                data['context'],
                'akdeniz27/roberta-base-cuad',
                n_best_size=int(number_results),
            )

        # predictions is indexed by the string '0' for the single question
        # submitted; an empty string means no answer was found.
        if predictions['0'] == "":
            answer = 'No answer found in document'
            # Show the message; merely assigning it leaves the user with no
            # feedback about the outcome.
            st.text_area(label="Answer", value=answer)
        elif number_results == '1':
            answer = predictions['0']
            st.text_area(label="Answer", value=f"{answer}")
        else:
            # More than one result requested: show the contents of nbest.json
            # (presumably written by run_prediction - confirm in predict.py).
            # The context manager closes the file handle after reading.
            with open("nbest.json") as f:
                st.success(f.readlines())
        st.success("Successfully processed contract!")
    else:
        st.write("Unable to call model, please select question and contract")