# lfqa_discourse / app.py
# (Hugging Face Space page residue: author "carrie", commit "quick fix" 55ea91c,
#  raw / history / blame links, 3.85 kB — kept as a comment; not executable code.)
import os
import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import stanza
import re
# One-time download of the English tokenizer resources (requires network access).
stanza.download('en', processors='tokenize')
# T5-based sentence-role classifier fine-tuned on ELI5 long-form answers.
model = AutoModelForSeq2SeqLM.from_pretrained("fangyuan/lfqa_role_classification")
tokenizer = AutoTokenizer.from_pretrained("fangyuan/lfqa_role_classification")
# Sentence-segmentation-only stanza pipeline used to split answers into sentences.
en_nlp = stanza.Pipeline('en', processors='tokenize')
# Markdown rendered beneath the demo via gr.Interface(article=...).
article='''
## About
This is a demo for our paper: [How Do We Answer Complex Questions: Discourse Structure of Long-form Answers](https://aclanthology.org/2022.acl-long.249/).
Fangyuan Xu, Junyi Jessy Li, Eunsol Choi. 2022.
## Model
The model served here is a T5(large)-based role classification model trained on functional roles of ELI5 answers.
## Resources
Please see more information (paper/code/data/datasheet) at our [website](https://www.cs.utexas.edu/~fxu/lfqa_discourse/index.html).
## Contact
[Fangyuan Xu](https://www.cs.utexas.edu/~fxu/) via firstname@utexas.edu
'''
# Maps the raw role label emitted by the T5 model to the display name shown in
# the UI. The ' ' entry covers sentences for which the model predicted no role.
role_mappings = {
'Answer': 'Answer',
'Answer (Summary)': 'Summary',
'Auxiliary Information': 'Auxiliary Information',
'Answer - Example': 'Example',
'Miscellaneous': 'Miscellaneous',
'Answer - Organizational sentence': 'Organizational sentence',
' ': ' ',
}
def get_ans_sentence_with_stanza(answer_paragraph, pipeline,
    is_offset=False):
    '''Segment an answer paragraph into sentences using a stanza pipeline.

    Args:
        answer_paragraph: Raw answer text to segment.
        pipeline: A stanza Pipeline (or compatible callable) whose result has
            `.sentences`, each holding `.tokens` with start_char/end_char.
        is_offset: If True, return (start_char, end_char) spans instead of text.

    Returns:
        A list of stripped sentence strings, or of character-offset tuples.
    '''
    answer_paragraph_processed = pipeline(answer_paragraph)
    sentences = []
    for sent in answer_paragraph_processed.sentences:
        start = sent.tokens[0].start_char
        end = sent.tokens[-1].end_char  # stanza end_char is an exclusive offset
        if is_offset:
            sentences.append((start, end))
        else:
            # Slice to `end` (exclusive). The previous `end + 1` pulled in one
            # extra character from the following sentence whenever two
            # sentences were not separated by whitespace.
            sentences.append(answer_paragraph[start:end].strip())
    return sentences
def create_input_to_t5(question, answer):
    '''Build the T5 input: the question followed by "[i] sentence" pairs.'''
    sentences = get_ans_sentence_with_stanza(answer, en_nlp)
    parts = [question]
    for i, sentence in enumerate(sentences):
        parts.extend(('[{}]'.format(i), sentence))
    return ' '.join(parts)
def process_t5_output(input_txt, output_txt):
    '''Align predicted roles with the input answer sentences for display.

    Args:
        input_txt: T5 input of the form 'question [0] sent0 [1] sent1 ...'.
        output_txt: T5 output of the form '[0] Role [1] Role ...'.

    Returns:
        One line per answer sentence, 'sentence: mapped role', followed by the
        raw model output on a final line.
    '''
    pred_roles = []
    # Recover the answer sentences and their '[i]' markers from the model input
    # (index 0 of the split is the question, so drop it). Raw strings avoid the
    # invalid-escape-sequence warning the plain '\[\d+\] ' literals produced.
    answer_sentence = re.split(r'\[\d+\] ', input_txt)[1:]
    sentence_idx = re.findall(r'\[\d+\]', input_txt)
    idx_to_sentence = zip(sentence_idx, answer_sentence)
    # Map each predicted '[i]' marker to its (stripped) role label.
    pred_labels = re.split(r'\[\d+\] ', output_txt)[1:]
    pred_idx = re.findall(r'\[\d+\]', output_txt)
    idx_to_role = {
        idx: role.strip() for (idx, role) in zip(pred_idx, pred_labels)
    }
    for idx, sentence in idx_to_sentence:
        # Sentences the model skipped get the blank role ' '.
        pred_role = idx_to_role.get(idx, ' ')
        # Fall back to the raw label for role names the model emits that are
        # not in role_mappings, instead of raising KeyError.
        mapped_pred_role = role_mappings.get(pred_role, pred_role)
        pred_roles.append('{}: {}'.format(sentence, mapped_pred_role))
    pred_roles.append(output_txt)  # keep the raw model output for inspection
    return '\n'.join(pred_roles)
def predict(question, answer):
    '''Classify the functional role of each sentence in a long-form answer.'''
    t5_input = create_input_to_t5(question, answer)
    encoded = tokenizer(t5_input, return_tensors='pt').input_ids
    generated = model.generate(encoded, max_length=512)
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
    return process_t5_output(t5_input, decoded)
# Wire the classifier into a Gradio UI and launch it with request queueing on.
# NOTE(review): gr.inputs/gr.outputs and enable_queue are the legacy pre-3.x
# Gradio API — this Space presumably pins an old gradio version; verify before
# upgrading.
gr.Interface(
    fn=predict,
    inputs=[
        gr.inputs.Textbox(lines=1, label="Question:"),
        gr.inputs.Textbox(lines=1, label="Answer:"),
    ],
    outputs=[
        gr.outputs.Textbox(label="Predicted sentence-level functional roles"),
    ],
    theme="peach",
    title="Discourse structure of long-form answer",
    description="Input a question with its long-form answer to see the predicted discourse structure by our role classifier.",
    article=article,
    examples=[
    #['', '']
    ]
).launch(enable_queue=True)