"""Gradio demo: sentence-level discourse-role classification for long-form answers.

Loads a T5(large)-based role classifier and a stanza sentence segmenter, then
serves a Gradio interface that labels each sentence of an answer with its
functional role (Answer / Summary / Example / ...).
"""
import os  # NOTE(review): unused here, kept in case the deployment environment reads it
import re

import gradio as gr
import stanza
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# One-time startup cost: fetch the stanza English tokenizer and the
# fine-tuned role-classification checkpoint.
stanza.download('en', processors='tokenize')
model = AutoModelForSeq2SeqLM.from_pretrained("fangyuan/lfqa_role_classification")
tokenizer = AutoTokenizer.from_pretrained("fangyuan/lfqa_role_classification")
en_nlp = stanza.Pipeline('en', processors='tokenize')

article = '''
## About
This is a demo for our paper: [How Do We Answer Complex Questions: Discourse Structure of Long-form Answers](https://aclanthology.org/2022.acl-long.249/). Fangyuan Xu, Junyi Jessy Li, Eunsol Choi. 2022.
## Model
The model served here is a T5(large)-based role classification model trained on functional roles of ELI5 answers.
## Resources
Please see more information (paper/code/data/datasheet) at our [website](https://www.cs.utexas.edu/~fxu/lfqa_discourse/index.html).
## Contact
[Fangyuan Xu](https://www.cs.utexas.edu/~fxu/) via firstname@utexas.edu
'''

# Maps the raw role labels emitted by the T5 model to display names.
# The ' ' entry handles sentences the model produced no prediction for.
role_mappings = {
    'Answer': 'Answer',
    'Answer (Summary)': 'Summary',
    'Auxiliary Information': 'Auxiliary Information',
    'Answer - Example': 'Example',
    'Miscellaneous': 'Miscellaneous',
    'Answer - Organizational sentence': 'Organizational sentence',
    ' ': ' ',
}


def get_ans_sentence_with_stanza(answer_paragraph, pipeline, is_offset=False):
    """Segment an answer paragraph into sentences with stanza.

    Args:
        answer_paragraph: raw answer text.
        pipeline: an initialized ``stanza.Pipeline`` with the tokenize processor.
        is_offset: when True, return ``(start_char, end_char)`` tuples instead
            of the sentence strings.

    Returns:
        A list of stripped sentence strings, or of character-offset tuples.
    """
    processed = pipeline(answer_paragraph)
    sentences = []
    for sent in processed.sentences:
        start = sent.tokens[0].start_char
        end = sent.tokens[-1].end_char
        if is_offset:
            sentences.append((start, end))
        else:
            # end_char is inclusive in this slicing convention, hence the +1.
            sentences.append(answer_paragraph[start:end + 1].strip())
    return sentences


def create_input_to_t5(question, answer):
    """Build the T5 input string: the question followed by '[i] sentence' pairs."""
    input_line = [question]
    for idx, answer_sent in enumerate(get_ans_sentence_with_stanza(answer, en_nlp)):
        input_line.append('[{}]'.format(idx))
        input_line.append(answer_sent)
    return ' '.join(input_line)


def process_t5_output(input_txt, output_txt):
    """Align the roles predicted by T5 with the input sentences.

    Both ``input_txt`` and ``output_txt`` interleave '[i]' markers with text;
    sentences whose index is missing from the model output get the blank
    role ' '. Returns one 'sentence (role)' line per input sentence.
    """
    # Raw strings: '\[' and '\d' are invalid escape sequences in plain
    # string literals (SyntaxWarning on modern Python).
    answer_sentences = re.split(r'\[\d+\] ', input_txt)[1:]
    sentence_idx = re.findall(r'\[\d+\]', input_txt)
    pred_role_parts = re.split(r'\[\d+\] ', output_txt)[1:]
    pred_idx = re.findall(r'\[\d+\]', output_txt)
    idx_to_role = {
        idx: role.strip()
        for idx, role in zip(pred_idx, pred_role_parts)
    }
    pred_roles = []
    for idx, sentence in zip(sentence_idx, answer_sentences):
        pred_role = idx_to_role.get(idx, ' ')
        # .get guards against the model emitting a label outside role_mappings,
        # which would otherwise crash the demo with a KeyError.
        mapped_pred_role = role_mappings.get(pred_role, pred_role)
        pred_roles.append('{} ({})'.format(sentence, mapped_pred_role))
    print(input_txt, output_txt)
    return '\n'.join(pred_roles)


def predict(question, answer):
    """End-to-end inference: segment the answer, run T5, format per-sentence roles."""
    input_txt = create_input_to_t5(question, answer)
    input_ids = tokenizer(input_txt, return_tensors='pt').input_ids
    outputs = model.generate(input_ids, max_length=512)
    output_txt = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return process_t5_output(input_txt, output_txt)


gr.Interface(
    fn=predict,
    inputs=[
        gr.inputs.Textbox(lines=1, label="Question:"),
        gr.inputs.Textbox(lines=1, label="Answer:"),
    ],
    outputs=[
        gr.outputs.Textbox(label="Predicted sentence-level functional roles"),
    ],
    theme="peach",
    title="Discourse structure of long-form answer",
    description="Input a question with its long-form answer to see the predicted discourse structure by our role classifier.",
    article=article,
    examples=[
        #['', '']
    ]
).launch(enable_queue=True)