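"""AutoSumm: a Streamlit app that generates query-driven abstractive summaries.

Pipeline (detailed in the sidebar text in main()): optionally translate a
Portuguese query to English, build a query-related corpus, narrow it with an
exhaustive keyword search followed by semantic search over documents and
paragraphs, then produce an abstractive summary of the best paragraphs with
T5, translating the result back to Portuguese when requested.
"""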
import streamlit as st
from extractor import extract, FewDocumentsError
from summarizer import summarize
from translation import translate
from utils.timing import Timer
import cProfile
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from os import environ

@st.cache(allow_output_mutation=True)
def init():
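    """Download NLTK data and load all models once.

    The st.cache decorator (legacy API, predating st.cache_resource) memoizes
    the return value so the models survive Streamlit's script reruns;
    allow_output_mutation suppresses hash checks on the mutable model objects.
    """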
    # Download required NLTK resources (tokenizer and stopword lists)
    from nltk import download
    download('punkt')
    download('stopwords')

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Model for semantic searches
    search_model = SentenceTransformer('msmarco-distilbert-base-v4', device=device)
    # Model for abstraction
    summ_model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')
    tokenizer = AutoTokenizer.from_pretrained('t5-base')

    return search_model, summ_model, tokenizer
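
# Illustrative-only sketches, not called by the app: a minimal picture of how
# the models loaded in init() are typically used. The real logic lives in
# extractor.py and summarizer.py (not shown here); the helper names, the
# "summarize:" task prefix, and the generation settings below are assumptions.
def _rank_paragraphs_sketch(search_model, query, paragraphs, top_k=3):
    # Embed the query and the paragraphs, then rank the paragraphs by cosine
    # similarity and keep the top_k best matches.
    from sentence_transformers import util
    query_emb = search_model.encode(query, convert_to_tensor=True)
    para_emb = search_model.encode(paragraphs, convert_to_tensor=True)
    hits = util.semantic_search(query_emb, para_emb, top_k=top_k)[0]
    return [paragraphs[hit['corpus_id']] for hit in hits]

def _t5_summarize_sketch(summ_model, tokenizer, text, max_length=150):
    # T5 is a text-to-text model; prefixing the input with "summarize: "
    # selects its summarization task. The beam search width is a placeholder.
    inputs = tokenizer('summarize: ' + text, return_tensors='pt', truncation=True)
    ids = summ_model.generate(inputs['input_ids'], max_length=max_length, num_beams=4)
    return tokenizer.decode(ids[0], skip_special_tokens=True)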

def main():
    search_model, summ_model, tokenizer = init()
    Timer.reset()

    _, col2, _ = st.columns([1,1,1])
    col2.image('AutoSumm.png', width=250)
    st.subheader("Lucas Antunes & Matheus Vieira")

    portuguese = st.checkbox('Traduzir para o português.')  # "Translate to Portuguese."

    st.sidebar.markdown("""
    # Processing steps
    #### Translation
    Step where the system translates the user's query from Portuguese to English and the summary from English to Portuguese.

    #### Corpus generation
    Step where the system builds the complete corpus: query-related web pages plus documents (PDFs and text files) from the knowledge area related to the query. The corpus for this model was built to gather documents related to the Blue Amazon, a maritime region in South America.

    #### Exhaustive search
    Step where the system filters the corpus down to the texts that contain keywords from the query.

    #### Semantic search over documents
    Step in which the system selects documents related to the query through semantic search.

    #### Semantic search over paragraphs
    Step in which the system breaks documents into paragraphs and selects those related to the query through semantic search.

    #### Abstraction
    Step in which the system generates an abstractive summary of the query topic from the three best paragraphs selected in the previous step.
    """)

    if portuguese:
        environ['PORTUGUESE'] = 'true'  # workaround: expose the language choice to the other modules through an environment variable
        query_pt = st.text_input('Digite o tópico sobre o qual você deseja gerar um resumo')  # "Type the topic you want a summary about"
        button = st.button('Gerar resumo')  # "Generate summary"
    else:
        environ['PORTUGUESE'] = 'false'  # workaround: see above
        query = st.text_input('Type the desired topic to generate the summary')
        button = st.button('Generate summary')

    result = st.container()

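    # Streamlit reruns this whole script on every widget interaction;
    # session_state persists the few-documents flag and the partially
    # extracted documents across those reruns.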
    if 'few_documents' not in st.session_state:
        st.session_state['few_documents'] = False
        few_documents = False
    else:
        few_documents = st.session_state['few_documents']

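    # Main path: translate the query to English if needed, extract the most
    # relevant paragraphs from the corpus, and summarize them.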
    if button:
        query = translate(query_pt, 'pt', 'en') if portuguese else query
        try:
            text = extract(query, search_model=search_model)
        except FewDocumentsError as e:
            few_documents = True
            st.session_state['few_documents'] = True
            st.session_state['documents'] = e.documents
            st.session_state['msg'] = e.msg
        else:
            summary = summarize(text, summ_model, tokenizer)

            if portuguese:
                result.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}')
                with result.expander('Parágrafos usados na geração do resumo'):  # "Paragraphs used to generate the summary"
                    st.markdown(translate(text, "en", "pt").replace('\n', '\n\n'))
            else:
                result.markdown(f'Your summary for "{query}":\n\n> {summary}')
                with result.expander('Paragraphs used in summarization'):
                    st.markdown(text.replace('\n', '\n\n'))

            Timer.show_total()


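    # Recovery path: extract() raised FewDocumentsError on a previous run;
    # warn the user and let them proceed with the documents already gathered.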
    if few_documents:
        Timer.reset()
        error_msg = st.empty()
        error_msg.warning(st.session_state['msg'])
        proceed = st.empty()
        if portuguese:
            proceed_button = proceed.button('Prosseguir')  # "Proceed"
        else:
            proceed_button = proceed.button('Proceed')
        if proceed_button:
            error_msg.empty()
            proceed.empty()
            query = translate(query_pt, 'pt', 'en') if portuguese else query
            text = extract(query, search_model=search_model, extracted_documents=st.session_state['documents'])
            summary = summarize(text, summ_model, tokenizer)

            if portuguese:
                result.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}')
                with result.expander('Parágrafos usados na geração do resumo'):  # "Paragraphs used to generate the summary"
                    st.markdown(translate(text, "en", "pt").replace('\n', '\n\n'))
            else:
                result.markdown(f'Your summary for "{query}":\n\n> {summary}')
                with result.expander('Paragraphs used in summarization'):
                    st.markdown(text.replace('\n', '\n\n'))
            
            st.session_state['few_documents'] = False
            few_documents = False
            
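# Profile the whole app run. Note: cProfile dumps binary stats (load them with
# pstats) to the given path, despite the .txt extension.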
if __name__ == '__main__':
    cProfile.run('main()', 'stats.txt')