"""Gradio demo: sentence-level discourse-role classification for long-form QA.

Loads a fine-tuned T5 role classifier and a stanza sentence segmenter, then
serves an interface that maps each sentence of a long-form answer to its
predicted functional role.
"""
import os
import re

import gradio as gr
import stanza
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# One-time resource setup (network downloads happen here at import time).
stanza.download('en', processors='tokenize')
model = AutoModelForSeq2SeqLM.from_pretrained("fangyuan/lfqa_role_classification")
tokenizer = AutoTokenizer.from_pretrained("fangyuan/lfqa_role_classification")
en_nlp = stanza.Pipeline('en', processors='tokenize')

# The T5 input/output interleaves '[i]' sentence markers; compile the marker
# patterns once instead of on every request.
_MARKER_SPLIT_RE = re.compile(r'\[\d+\] ')
_MARKER_FIND_RE = re.compile(r'\[\d+\]')


def get_ans_sentence_with_stanza(answer_paragraph, pipeline, is_offset=False):
    """Segment *answer_paragraph* into sentences with a stanza pipeline.

    Args:
        answer_paragraph: Raw answer text.
        pipeline: A stanza ``Pipeline`` that includes the 'tokenize' processor.
        is_offset: If True, return ``(start_char, end_char)`` pairs instead of
            the sentence strings.

    Returns:
        List of stripped sentence strings, or of character-offset pairs.
    """
    processed = pipeline(answer_paragraph)
    sentences = []
    for sent in processed.sentences:
        start = sent.tokens[0].start_char
        end = sent.tokens[-1].end_char
        if is_offset:
            sentences.append((start, end))
        else:
            # stanza's end_char is exclusive, so [start:end] covers the
            # sentence exactly; the original's end+1 pulled in one extra
            # character (usually hidden by .strip()).
            sentences.append(answer_paragraph[start:end].strip())
    return sentences


def create_input_to_t5(question, answer):
    """Build the T5 input: the question followed by '[i] <sentence>' chunks."""
    parts = [question]
    for idx, answer_sent in enumerate(get_ans_sentence_with_stanza(answer, en_nlp)):
        parts.append('[{}]'.format(idx))
        parts.append(answer_sent)
    return ' '.join(parts)


def process_t5_output(input_txt, output_txt):
    """Align the model's predicted roles with the answer sentences.

    Both *input_txt* and *output_txt* carry '[i]' markers; each sentence index
    found in the input is mapped to the role predicted for the same index in
    the output. Indices the model dropped get a blank role.

    Returns:
        One role per answer sentence, newline-joined.
    """
    # input_txt is "<question> [0] sent0 [1] sent1 ..." — drop the question.
    answer_sentences = _MARKER_SPLIT_RE.split(input_txt)[1:]
    sentence_idx = _MARKER_FIND_RE.findall(input_txt)
    pred_role = _MARKER_SPLIT_RE.split(output_txt)[1:]
    pred_idx = _MARKER_FIND_RE.findall(output_txt)
    idx_to_role = {
        idx: role.strip() for idx, role in zip(pred_idx, pred_role)
    }
    # The model may skip indices; emit ' ' for sentences with no prediction.
    pred_roles = [idx_to_role.get(idx, ' ') for idx in sentence_idx[:len(answer_sentences)]]
    return '\n'.join(pred_roles)


def predict(question, answer):
    """Run the role classifier on (question, answer); return newline-joined roles."""
    input_txt = create_input_to_t5(question, answer)
    input_ids = tokenizer(input_txt, return_tensors='pt').input_ids
    outputs = model.generate(input_ids, max_length=512)
    output_txt = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return process_t5_output(input_txt, output_txt)


gr.Interface(
    fn=predict,
    inputs=[
        gr.inputs.Textbox(lines=1, label="Question:"),
        gr.inputs.Textbox(lines=1, label="Answer:"),
    ],
    outputs=[
        gr.outputs.Textbox(label="Predicted sentence-level functional roles"),
    ],
    theme="peach",
    title="Discourse structure of long-form answer",
    description="Input a question with its long-form answer to see the predicted discourse structure by our role classifier.",
    examples=[],
).launch(enable_queue=True)