"""Gradio demo: sentence-level discourse-role classification for long-form answers.

Loads a T5(large)-based role classifier and a stanza sentence segmenter, then
serves a Gradio interface that labels each sentence of an answer with its
functional role (Answer / Summary / Example / ...).
"""
import os  # NOTE(review): unused here, kept in case the deployment environment reads it
import re

import gradio as gr
import stanza
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# One-time startup cost: fetch the stanza English tokenizer and the
# fine-tuned role-classification checkpoint.
stanza.download('en', processors='tokenize')
model = AutoModelForSeq2SeqLM.from_pretrained("fangyuan/lfqa_role_classification")
tokenizer = AutoTokenizer.from_pretrained("fangyuan/lfqa_role_classification")
en_nlp = stanza.Pipeline('en', processors='tokenize')

article = '''
## About
This is a demo for our paper: [How Do We Answer Complex Questions: Discourse Structure of Long-form Answers](https://aclanthology.org/2022.acl-long.249/). Fangyuan Xu, Junyi Jessy Li, Eunsol Choi. 2022.
## Model
The model served here is a T5(large)-based role classification model trained on functional roles of ELI5 answers.
## Resources
Please see more information (paper/code/data/datasheet) at our [website](https://www.cs.utexas.edu/~fxu/lfqa_discourse/index.html).
## Contact
[Fangyuan Xu](https://www.cs.utexas.edu/~fxu/) via firstname@utexas.edu
'''

# Maps the raw role labels emitted by the T5 model to display names.
# The ' ' entry handles sentences the model produced no prediction for.
role_mappings = {
    'Answer': 'Answer',
    'Answer (Summary)': 'Summary',
    'Auxiliary Information': 'Auxiliary Information',
    'Answer - Example': 'Example',
    'Miscellaneous': 'Miscellaneous',
    'Answer - Organizational sentence': 'Organizational sentence',
    ' ': ' ',
}


def get_ans_sentence_with_stanza(answer_paragraph, pipeline, is_offset=False):
    """Segment an answer paragraph into sentences with stanza.

    Args:
        answer_paragraph: raw answer text.
        pipeline: an initialized ``stanza.Pipeline`` with the tokenize processor.
        is_offset: when True, return ``(start_char, end_char)`` tuples instead
            of the sentence strings.

    Returns:
        A list of stripped sentence strings, or of character-offset tuples.
    """
    processed = pipeline(answer_paragraph)
    sentences = []
    for sent in processed.sentences:
        start = sent.tokens[0].start_char
        end = sent.tokens[-1].end_char
        if is_offset:
            sentences.append((start, end))
        else:
            # end_char is inclusive in this slicing convention, hence the +1.
            sentences.append(answer_paragraph[start:end + 1].strip())
    return sentences


def create_input_to_t5(question, answer):
    """Build the T5 input string: the question followed by '[i] sentence' pairs."""
    input_line = [question]
    for idx, answer_sent in enumerate(get_ans_sentence_with_stanza(answer, en_nlp)):
        input_line.append('[{}]'.format(idx))
        input_line.append(answer_sent)
    return ' '.join(input_line)


def process_t5_output(input_txt, output_txt):
    """Align the roles predicted by T5 with the input sentences.

    Both ``input_txt`` and ``output_txt`` interleave '[i]' markers with text;
    sentences whose index is missing from the model output get the blank
    role ' '. Returns one 'sentence (role)' line per input sentence.
    """
    # Raw strings: '\[' and '\d' are invalid escape sequences in plain
    # string literals (SyntaxWarning on modern Python).
    answer_sentences = re.split(r'\[\d+\] ', input_txt)[1:]
    sentence_idx = re.findall(r'\[\d+\]', input_txt)
    pred_role_parts = re.split(r'\[\d+\] ', output_txt)[1:]
    pred_idx = re.findall(r'\[\d+\]', output_txt)
    idx_to_role = {
        idx: role.strip()
        for idx, role in zip(pred_idx, pred_role_parts)
    }
    pred_roles = []
    for idx, sentence in zip(sentence_idx, answer_sentences):
        pred_role = idx_to_role.get(idx, ' ')
        # .get guards against the model emitting a label outside role_mappings,
        # which would otherwise crash the demo with a KeyError.
        mapped_pred_role = role_mappings.get(pred_role, pred_role)
        pred_roles.append('{} ({})'.format(sentence, mapped_pred_role))
    print(input_txt, output_txt)
    return '\n'.join(pred_roles)


def predict(question, answer):
    """End-to-end inference: segment the answer, run T5, format per-sentence roles."""
    input_txt = create_input_to_t5(question, answer)
    input_ids = tokenizer(input_txt, return_tensors='pt').input_ids
    outputs = model.generate(input_ids, max_length=512)
    output_txt = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return process_t5_output(input_txt, output_txt)


gr.Interface(
    fn=predict,
    inputs=[
        gr.inputs.Textbox(lines=1, label="Question:"),
        gr.inputs.Textbox(lines=1, label="Answer:"),
    ],
    outputs=[
        gr.outputs.Textbox(label="Predicted sentence-level functional roles"),
    ],
    theme="peach",
    title="Discourse structure of long-form answer",
    description="Input a question with its long-form answer to see the predicted discourse structure by our role classifier.",
    article=article,
    examples=[
        #['', '']
    ]
).launch(enable_queue=True)