gauravthere akdeniz27 committed on
Commit
519a08a
0 Parent(s):

Duplicate from akdeniz27/contract-understanding-atticus-dataset-demo

Browse files

Co-authored-by: Taner Akdeniz <akdeniz27@users.noreply.huggingface.co>

Files changed (6) hide show
  1. .gitattributes +27 -0
  2. README.md +38 -0
  3. app.py +78 -0
  4. predict.py +113 -0
  5. requirements.txt +4 -0
  6. test.json +0 -0
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Contract Understanding Atticus Dataset (CUAD) Demo
3
+ emoji: 💻
4
+ colorFrom: red
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ app_file: app.py
8
+ pinned: false
9
+ duplicated_from: akdeniz27/contract-understanding-atticus-dataset-demo
10
+ ---
11
+
12
+ # Configuration
13
+
14
+ `title`: _string_
15
+ Display title for the Space
16
+
17
+ `emoji`: _string_
18
+ Space emoji (only emoji characters are allowed)
19
+
20
+ `colorFrom`: _string_
21
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
22
+
23
+ `colorTo`: _string_
24
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
25
+
26
+ `sdk`: _string_
27
+ Can be either `gradio` or `streamlit`
28
+
29
+ `sdk_version` : _string_
30
+ Only applicable for `streamlit` SDK.
31
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
32
+
33
+ `app_file`: _string_
34
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code).
35
+ Path is relative to the root of the repository.
36
+
37
+ `pinned`: _boolean_
38
+ Whether the Space stays on top of your list.
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForQuestionAnswering, AutoTokenizer
2
+ import streamlit as st
3
+ import json
4
+ from predict import run_prediction
5
+
6
# Page-level configuration: render the app with the "wide" layout.
st.set_page_config(layout="wide")

# Checkpoints offered in the sidebar; the chosen one is stored in
# ``model_checkpoint`` and read by the rest of the script.
model_list = [
    'akdeniz27/roberta-base-cuad',
    'akdeniz27/roberta-large-cuad',
    'akdeniz27/deberta-v2-xlarge-cuad',
]
st.sidebar.header("Select CUAD Model")
model_checkpoint = st.sidebar.radio("", model_list)

# sentencepiece is imported only when the DeBERTa-v2 checkpoint is selected
# (presumably required by its slow tokenizer -- confirm).
if model_checkpoint == "akdeniz27/deberta-v2-xlarge-cuad":
    import sentencepiece

# Reference links shown under the model selector, in display order.
sidebar_links = (
    "Project: https://www.atticusprojectai.org/cuad",
    "Git Hub: https://github.com/TheAtticusProject/cuad",
    "CUAD Dataset: https://huggingface.co/datasets/cuad",
    "License: CC BY 4.0 https://creativecommons.org/licenses/by/4.0/",
)
for sidebar_link in sidebar_links:
    st.sidebar.write(sidebar_link)
20
+
21
# NOTE(review): ``st.cache`` is deprecated in recent Streamlit releases in
# favour of ``st.cache_resource`` / ``st.cache_data`` -- confirm the Streamlit
# version this Space runs on before upgrading.
@st.cache(allow_output_mutation=True)
def load_model():
    """Load the QA model and slow tokenizer for the selected checkpoint.

    Reads the module-level ``model_checkpoint`` chosen in the sidebar.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    qa_model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
    qa_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=False)
    return qa_model, qa_tokenizer
26
+
27
@st.cache(allow_output_mutation=True)
def load_questions():
    """Return the question strings attached to the first contract in test.json.

    The file is read as SQuAD-style JSON; the questions are taken from
    ``data[0].paragraphs[0].qas`` in file order.

    Returns:
        A list of question strings.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open('test.json', encoding='utf-8') as json_file:
        data = json.load(json_file)

    first_paragraph_qas = data['data'][0]['paragraphs'][0]['qas']
    return [qa['question'] for qa in first_paragraph_qas]
38
+
39
@st.cache(allow_output_mutation=True)
def load_contracts():
    """Return the text of every contract in test.json, whitespace-normalised.

    For each entry under ``data`` the context of its first paragraph is taken,
    and every run of whitespace is collapsed to a single space.

    Returns:
        A list of contract strings, in file order.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open('test.json', encoding='utf-8') as json_file:
        data = json.load(json_file)

    return [
        ' '.join(entry['paragraphs'][0]['context'].split())
        for entry in data['data']
    ]
49
+
50
# Load (and cache) the selected checkpoint plus the bundled questions/contracts.
# NOTE(review): ``model`` and ``tokenizer`` are not passed on to
# ``run_prediction``, which loads the checkpoint itself from its path argument.
model, tokenizer = load_model()
questions = load_questions()
contracts = load_contracts()

st.header("Contract Understanding Atticus Dataset (CUAD) Demo")
st.write("Based on https://github.com/marshmellow77/cuad-demo")

selected_question = st.selectbox('Choose one of the 41 queries from the CUAD dataset:', questions)
# The first entry is always questions[0]; its prediction is skipped when the
# results are displayed below, so only the selected question is shown.
question_set = [questions[0], selected_question]

contract_type = st.radio("Select Contract", ("Sample Contract", "New Contract"))
if contract_type == "Sample Contract":
    # Bound the slider by the number of loaded contracts; the default slider
    # range is unrelated to how many contracts test.json contains.
    last_contract_index = len(contracts) - 1
    if last_contract_index > 0:
        sample_contract_num = st.slider("Select Sample Contract #", 0, last_contract_index)
    else:
        sample_contract_num = 0
    contract = contracts[sample_contract_num]
    with st.expander(f"Sample Contract #{sample_contract_num}"):
        st.write(contract)
else:
    contract = st.text_area("Input New Contract", "", height=256)

Run_Button = st.button("Run", key=None)
if Run_Button and contract and question_set:
    # Predict with the checkpoint chosen in the sidebar rather than a fixed one.
    predictions = run_prediction(question_set, contract, model_checkpoint)

    # ``predictions`` maps each example id (its index in question_set, as a
    # string) to the answer text; the first entry is skipped (see above).
    for position, (qas_id, answer) in enumerate(predictions.items()):
        if position == 0:
            continue
        st.write(f"Question: {question_set[int(qas_id)]}\n\nAnswer: {answer}\n\n")
78
+
predict.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import time
3
+ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
4
+
5
+ from transformers import (
6
+ AutoConfig,
7
+ AutoModelForQuestionAnswering,
8
+ AutoTokenizer,
9
+ squad_convert_examples_to_features
10
+ )
11
+
12
+ from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample
13
+ from transformers.data.metrics.squad_metrics import compute_predictions_logits
14
+
15
def run_prediction(question_texts, context_text, model_path):
    """Answer each question against one context with an extractive QA model.

    Every question is wrapped in a ``SquadExample`` whose ``qas_id`` is the
    question's index (as a string), converted to windowed features, scored by
    the model in batches, and post-processed with
    ``compute_predictions_logits``.

    Args:
        question_texts: Iterable of question strings.
        context_text: The document text the answers are extracted from.
        model_path: Name or path passed to the ``from_pretrained`` loaders
            for the config, tokenizer and model.

    Returns:
        The result of ``compute_predictions_logits``: a mapping from each
        ``qas_id`` to its predicted answer text.
    """
    ### Setting hyperparameters
    max_seq_length = 512
    doc_stride = 256
    n_best_size = 1
    max_query_length = 64
    max_answer_length = 512
    do_lower_case = False
    null_score_diff_threshold = 0.0

    def to_list(tensor):
        """Convert a tensor to a plain (nested) Python list on the CPU."""
        return tensor.detach().cpu().tolist()

    config = AutoConfig.from_pretrained(model_path)
    # NOTE(review): the tokenizer is created with do_lower_case=True while the
    # post-processing below uses do_lower_case=False -- confirm this mismatch
    # is intended for the checkpoints in use.
    tokenizer = AutoTokenizer.from_pretrained(
        model_path, do_lower_case=True, use_fast=False)
    model = AutoModelForQuestionAnswering.from_pretrained(model_path, config=config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Inference only: switch to eval mode once instead of once per batch.
    model.eval()

    examples = [
        SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            answers=None,
        )
        for i, question_text in enumerate(question_texts)
    ]

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []

    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            outputs = model(**inputs)

        # batch[3] holds, for each row, the index of its feature in ``features``.
        example_indices = batch[3]
        per_output_tensors = outputs.to_tuple()

        for row, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            # Unpacks to exactly two entries: start logits and end logits.
            start_logits, end_logits = [
                to_list(output_tensor[row]) for output_tensor in per_output_tensors
            ]
            all_results.append(SquadResult(unique_id, start_logits, end_logits))

    final_predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=all_results,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        do_lower_case=do_lower_case,
        output_prediction_file=None,
        output_nbest_file=None,
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=null_score_diff_threshold,
        tokenizer=tokenizer
    )

    return final_predictions
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ torch
3
+ transformers
4
+ sentencepiece
test.json ADDED
The diff for this file is too large to render. See raw diff