# NOTE(review): the lines that previously appeared here ("File size: ...",
# blob hashes like ec6ca10/c8b4824, and a 1-108 line-number column) were
# residue from the web file viewer this source was copied from, not part of
# the program. They made the file invalid Python and have been removed.
import os
import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import stanza
import re

# Download the English tokenizer models once at import time so sentence
# segmentation is available before the first request arrives.
stanza.download('en', processors='tokenize')

# T5-based sentence-role classifier (trained on ELI5 answers) plus its
# matching tokenizer, and a stanza pipeline used only for sentence splitting.
model = AutoModelForSeq2SeqLM.from_pretrained("fangyuan/lfqa_role_classification")
tokenizer = AutoTokenizer.from_pretrained("fangyuan/lfqa_role_classification")
en_nlp = stanza.Pipeline('en', processors='tokenize')


# Markdown rendered by Gradio in the "article" panel beneath the demo UI.
article='''
## About 
This is a demo for our paper: [How Do We Answer Complex Questions: Discourse Structure of Long-form Answers](https://aclanthology.org/2022.acl-long.249/).

Fangyuan Xu, Junyi Jessy Li, Eunsol Choi. 2022.
## Model
The model served here is a T5(large)-based role classification model trained on functional roles of ELI5 answers.
## Resources 
Please see more information (paper/code/data/datasheet) at our [website](https://www.cs.utexas.edu/~fxu/lfqa_discourse/index.html).
## Contact
[Fangyuan Xu](https://www.cs.utexas.edu/~fxu/) via firstname@utexas.edu
'''

# Maps the role labels emitted by the T5 model to the shorter display names
# shown next to each sentence. The ' ' entry is the fallback used when a
# sentence received no label in the model output.
role_mappings = {
    'Answer': 'Answer',
    'Answer (Summary)': 'Summary',
    'Auxiliary Information': 'Auxiliary Information',
    'Answer - Example': 'Example',
    'Miscellaneous': 'Miscellaneous',
    'Answer - Organizational sentence': 'Organizational sentence',
    ' ': ' ',
}

def get_ans_sentence_with_stanza(answer_paragraph, pipeline,
                                 is_offset=False):
    """Split a paragraph into sentences with a stanza pipeline.

    When ``is_offset`` is True, return ``(start_char, end_char)`` pairs
    for each sentence; otherwise return the stripped sentence strings
    sliced out of ``answer_paragraph``.
    """
    parsed = pipeline(answer_paragraph)
    results = []
    for parsed_sent in parsed.sentences:
        start = parsed_sent.tokens[0].start_char
        end = parsed_sent.tokens[-1].end_char
        if is_offset:
            results.append((start, end))
        else:
            results.append(answer_paragraph[start:end].strip())
    return results


def create_input_to_t5(question, answer):
    """Build the model input: the question followed by '[i] sentence' pairs.

    Sentence markers are 1-indexed ('[1]', '[2]', ...) to match the
    format the role classifier was trained on.
    """
    pieces = [question]
    sentences = get_ans_sentence_with_stanza(answer, en_nlp)
    for sent_num, sent_text in enumerate(sentences, start=1):
        pieces.extend(('[{}]'.format(sent_num), sent_text))
    return ' '.join(pieces)

def process_t5_output(input_txt, output_txt):
    """Pair each input sentence with the role the model predicted for it.

    Both ``input_txt`` and ``output_txt`` interleave '[i]' markers with
    text (see create_input_to_t5). Sentences whose marker is missing from
    the model output fall back to the blank role ' '.

    Returns one line per sentence, formatted as '<sentence> (<role>)'.
    """
    # Raw strings for the regexes: '\[' in a plain string is an invalid
    # escape sequence (SyntaxWarning on modern Python).
    # Index 0 of each split is the text before the first marker (the
    # question / empty prefix), so it is dropped.
    answer_sentences = re.split(r'\[\d+\] ', input_txt)[1:]
    sentence_idx = re.findall(r'\[\d+\]', input_txt)
    pred_role_txt = re.split(r'\[\d+\] ', output_txt)[1:]
    pred_idx = re.findall(r'\[\d+\]', output_txt)
    idx_to_role = {
        idx: role.strip() for idx, role in zip(pred_idx, pred_role_txt)
    }
    pred_roles = []
    for idx, sentence in zip(sentence_idx, answer_sentences):
        pred_role = idx_to_role.get(idx, ' ')
        # .get with the raw label as fallback: an unexpected label from the
        # model must not crash the demo with a KeyError.
        mapped_pred_role = role_mappings.get(pred_role, pred_role)
        pred_roles.append('{} ({})'.format(sentence, mapped_pred_role))
    return '\n'.join(pred_roles)



def predict(question, answer):
    """Classify the functional role of every sentence in ``answer``.

    Builds the marker-annotated T5 input, generates the role sequence,
    and returns the formatted sentence/role lines.
    """
    t5_input = create_input_to_t5(question, answer)
    encoded = tokenizer(t5_input, return_tensors='pt').input_ids
    generated = model.generate(encoded, max_length=512)
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
    return process_t5_output(t5_input, decoded)



# Wire the classifier into a two-textbox Gradio demo and start serving.
# NOTE(review): gr.inputs/gr.outputs, theme="peach" and launch(enable_queue=...)
# are the legacy (pre-3.x) Gradio API — presumably the deployed environment
# pins a matching gradio version; confirm before upgrading the dependency.
gr.Interface(
    fn=predict,
    inputs=[
        gr.inputs.Textbox(lines=1, label="Question:"),
        gr.inputs.Textbox(lines=1, label="Answer:"),
    ],
    outputs=[
        gr.outputs.Textbox(label="Predicted sentence-level functional roles"),
    ],
    theme="peach",
    title="Discourse structure of long-form answer",
    description="Input a question with its long-form answer to see the predicted discourse structure by our role classifier.",
    article=article,
    examples=[
        ['''If a sheep's wool never stops growing, how are they not extinct?''',
         '''It's already answered that continuous wool growth has been selected by human breeders, but there's a misconception in your question that I'd like to address.Evolution doesn't select for what is best for *the individual*.Traits that help the individual don't necessarily survive.Only traits that ensure *procreation* survive.The quality of life is no concern to nature.Think of pain.There's absolutely no sense of us feeling excruciating pain.When you're dying, its about as much help to you as a sheep with meter long hair.Pain itself however is very useful during lifetime to avoid injury.An individual capable of feeling pain is much more likely to procreate than an individual which is not.That said, it is very unlikely for an expensive trait like growing massive amounts of wool to occur in wild sheep.However, given the right circumstances, it could well occur.Provided it doesn't hamper reproduction too much.''']
    ]
).launch(enable_queue=True)