mhsvieira committed
Commit a9e7556
1 Parent(s): c16fec3
app.py CHANGED
@@ -2,11 +2,12 @@ import streamlit as st
 from extractor import extract, FewDocumentsError
 from summarizer import summarize
 from translation import translate
-import time
+from utils.timing import Timer
 import cProfile
 from sentence_transformers import SentenceTransformer
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
+from os import environ
 
 @st.cache(allow_output_mutation=True)
 def init():
@@ -26,6 +27,7 @@ def init():
 
 def main():
     search_model, summ_model, tokenizer = init()
+    Timer.reset()
 
     st.title("AutoSumm")
     st.subheader("Lucas Antunes & Matheus Vieira")
@@ -33,14 +35,18 @@ def main():
     portuguese = st.checkbox('Traduzir para o português.')
 
     if portuguese:
+        environ['PORTUGUESE'] = 'true'  # workaround (quick hack)
         st.subheader("Digite o tópico sobre o qual você deseja gerar um resumo")
         query_pt = st.text_input('Digite o tópico')  # text is stored in this variable
         button = st.button('Gerar resumo')
     else:
+        environ['PORTUGUESE'] = 'false'  # workaround (quick hack)
         st.subheader("Type the desired topic to generate the summary")
         query = st.text_input('Type your topic')  # text is stored in this variable
         button = st.button('Generate summary')
 
+    result = st.empty()
+
     if 'few_documents' not in st.session_state:
         st.session_state['few_documents'] = False
         few_documents = False
@@ -48,11 +54,9 @@ def main():
         few_documents = st.session_state['few_documents']
 
     if button:
-        start_time = time.time()
        query = translate(query_pt, 'pt', 'en') if portuguese else query
        try:
-            with st.spinner('Extraindo textos relevantes...'):
-                text = extract(query, search_model=search_model)
+            text = extract(query, search_model=search_model)
        except FewDocumentsError as e:
            few_documents = True
            st.session_state['few_documents'] = True
@@ -60,32 +64,26 @@ def main():
            st.session_state['msg'] = e.msg
        else:
 
-            st.info(f'(Extraction) Elapsed time: {time.time() - start_time:.2f}s')
-            with st.spinner('Gerando resumo...'):
-                summary = summarize(text, summ_model, tokenizer)
-            st.info(f'(Total) Elapsed time: {time.time() - start_time:.2f}s')
+            summary = summarize(text, summ_model, tokenizer)
 
            if portuguese:
-                st.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}')
+                result.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}')
            else:
-                st.markdown(f'Your summary for "{query}":\n\n> {summary}')
+                result.markdown(f'Your summary for "{query}":\n\n> {summary}')
+
+            Timer.show_total()
 
 
    if few_documents:
        st.warning(st.session_state['msg'])
        if st.button('Prosseguir'):
-            start_time = time.time()
-            with st.spinner('Extraindo textos relevantes...'):
-                text = extract(query, search_model=search_model, extracted_documents=st.session_state['documents'])
-            st.info(f'(Extraction) Elapsed time: {time.time() - start_time:.2f}s')
-            with st.spinner('Gerando resumo...'):
-                summary = summarize(text, summ_model, tokenizer)
-            st.info(f'(Total) Elapsed time: {time.time() - start_time:.2f}s')
+            text = extract(query, search_model=search_model, extracted_documents=st.session_state['documents'])
+            summary = summarize(text, summ_model, tokenizer)
 
            if portuguese:
-                st.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}')
+                result.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}')
            else:
-                st.markdown(f'Your summary for "{query}":\n\n> {summary}')
+                result.markdown(f'Your summary for "{query}":\n\n> {summary}')
 
            st.session_state['few_documents'] = False
            few_documents = False
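
The diff above leans on two new pieces of shared state: result = st.empty() reserves a single placeholder so that both the normal path and the 'Prosseguir' path render the summary into the same slot, and environ['PORTUGUESE'] carries the UI language down to Timer, which the commit's own comment labels a workaround. A minimal sketch of one alternative, not part of this commit: keep the language as a class attribute on Timer and set it once per rerun. The set_language helper below is hypothetical.

class Timer:
    portuguese = False  # display-language flag, set once per Streamlit rerun

    @classmethod
    def set_language(cls, portuguese: bool) -> None:
        # hypothetical replacement for writing to os.environ
        cls.portuguese = portuguese

# in main(), right after the checkbox, instead of environ['PORTUGUESE'] = ...:
# Timer.set_language(portuguese)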
extractor/extract.py CHANGED
@@ -1,10 +1,12 @@
 from ._utils import FewDocumentsError
 from ._utils import document_extraction, paragraph_extraction, semantic_search
+from utils.timing import Timer
 from corpora import gen_corpus
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 import string
 
+@Timer.time_it('extração', 'extraction')
 def extract(query: str, search_model, n: int=3, extracted_documents: list=None) -> str:
     """Extract n paragraphs from the corpus using the given query.
 
@@ -16,7 +18,8 @@ def extract(query: str, search_model, n: int=3, extracted_documents: list=None)
         str: String containing the n most relevant paragraphs joined by line breaks
     """
     # Open corpus
-    corpus = gen_corpus(query)
+    with Timer('geração do corpus', 'corpus generation'):
+        corpus = gen_corpus(query)
 
     # Setup query
     stop_words = set(stopwords.words('english'))
@@ -25,36 +28,37 @@ def extract(query: str, search_model, n: int=3, extracted_documents: list=None)
     keywords = [keyword for keyword in tokens_without_sw if keyword not in string.punctuation]
 
     # Gross search
-    if not extracted_documents:
-        extracted_documents, documents_empty, documents_sizes = document_extraction(
-            dataset=corpus,
-            query=query,
-            keywords=keywords,
-            min_document_size=0,
-            min_just_one_paragraph_size=0
-        )
+    with Timer('busca exaustiva', 'exhaustive search'):
+        if not extracted_documents:
+            extracted_documents, documents_empty, documents_sizes = document_extraction(
+                dataset=corpus,
+                query=query,
+                keywords=keywords,
+                min_document_size=0,
+                min_just_one_paragraph_size=0
+            )
 
     # First semantic search (over documents)
-    selected_documents, documents_distances = semantic_search(
-        model=search_model,
-        query=query,
-        files=extracted_documents,
-        number_of_similar_files=10
-    )
+    with Timer('busca semântica nos documentos', 'semantic search over documents'):
+        selected_documents, documents_distances = semantic_search(
+            model=search_model,
+            query=query,
+            files=extracted_documents,
+            number_of_similar_files=10
+        )
 
     # Second semantic search (over paragraphs)
-    paragraphs = paragraph_extraction(
-        documents=selected_documents,
-        min_paragraph_size=20,
-    )
-
-    # Model for the second semantic search
-    selected_paragraphs, paragraphs_distances = semantic_search(
-        model=search_model,
-        query=query,
-        files=paragraphs,
-        number_of_similar_files=10
-    )
+    with Timer('busca semântica nos parágrafos', 'semantic search over paragraphs'):
+        paragraphs = paragraph_extraction(
+            documents=selected_documents,
+            min_paragraph_size=20,
+        )
+        selected_paragraphs, paragraphs_distances = semantic_search(
+            model=search_model,
+            query=query,
+            files=paragraphs,
+            number_of_similar_files=10
+        )
 
     text = '\n'.join(selected_paragraphs[:n])
 
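Note that extraction is now timed twice over: the @Timer.time_it('extração', 'extraction') decorator measures the whole call, while the with Timer(...) blocks measure each phase inside it, and both paths add their elapsed time to Timer.total (see utils/timing.py below). The grand total reported by Timer.show_total() therefore counts the extraction phases roughly twice. A minimal sketch of one fix, assuming phase timers only ever run nested inside a decorated call; the _depth counter is hypothetical, not in the commit:

import time

class Timer:
    total = 0
    _depth = 0  # hypothetical nesting counter, not in the commit

    def __enter__(self):
        Timer._depth += 1
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        Timer._depth -= 1
        elapsed = time.time() - self.start_time
        if Timer._depth == 0:  # only the outermost timer feeds the total
            Timer.total += elapsed

The same guard would go into start()/stop(), which the time_it decorator uses.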
summarizer/summarize.py CHANGED
@@ -1,4 +1,6 @@
+from utils.timing import Timer
 
+@Timer.time_it('abstração', 'abstraction')
 def summarize(text: str, model, tokenizer) -> str:
     """
     Generate a summary based on the given text
translation/translation.py CHANGED
@@ -1,6 +1,8 @@
 from deep_translator import GoogleTranslator
 from easynmt import EasyNMT
+from utils.timing import Timer
 
+@Timer.time_it('tradução', 'translation')
 def translate(text, source_language, target_language):
     try:
         print("Trying to use Google Translator...")
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (132 Bytes)
utils/__pycache__/timing.cpython-39.pyc ADDED
Binary file (2.89 kB)
utils/timing.py ADDED
@@ -0,0 +1,79 @@
+import time
+import streamlit as st
+from os import environ
+
+class Timer():
+    total = 0
+    expander = None
+    def __init__(self, pt_name, en_name):
+        self.pt_name = pt_name
+        self.en_name = en_name
+        if environ['PORTUGUESE'] == 'true':
+            self.portuguese = True
+        elif environ['PORTUGUESE'] == 'false':
+            self.portuguese = False
+        else:
+            raise EnvironmentError
+        if not Timer.expander:
+            if self.portuguese:
+                Timer.expander = st.expander('Ver progresso')
+            else:
+                Timer.expander = st.expander('See progress')
+        self.display = Timer.expander.empty()
+
+    def __enter__(self):
+        if self.portuguese:
+            self.display.info(f'Executando "{self.pt_name}"...')
+        else:
+            self.display.info(f'Running "{self.en_name}"...')
+        self.start_time = time.time()
+
+    def __exit__(self, type, value, traceback):
+        end_time = time.time()
+        elapsed_time = end_time - self.start_time
+        Timer.total += elapsed_time
+        self.display.empty()
+        if self.portuguese:
+            Timer.expander.info(f'"{self.pt_name}" terminou em {elapsed_time:.2f} s')
+        else:
+            Timer.expander.info(f'"{self.en_name}" finished in {elapsed_time:.2f} s')
+
+    # for manually starting the timer
+    def start(self):
+        if self.portuguese:
+            self.display.warning(f'Executando "{self.pt_name}"...')
+        else:
+            self.display.warning(f'Running "{self.en_name}"...')
+        self.start_time = time.time()
+
+    # for manually stopping the timer
+    def stop(self):
+        end_time = time.time()
+        elapsed_time = end_time - self.start_time
+        Timer.total += elapsed_time
+        self.display.empty()
+        if self.portuguese:
+            Timer.expander.warning(f'"{self.pt_name}" terminou em {elapsed_time:.2f} s')
+        else:
+            Timer.expander.warning(f'"{self.en_name}" finished in {elapsed_time:.2f} s')
+
+    def reset():
+        Timer.total = 0
+        Timer.expander = None
+
+    def show_total():
+        if environ['PORTUGUESE'] == 'true':
+            Timer.expander.success(f'Tempo de execução total: {Timer.total:.2f} s')
+        elif environ['PORTUGUESE'] == 'false':
+            Timer.expander.success(f'Total elapsed time: {Timer.total:.2f} s')
+
+    def time_it(pt_name, en_name):
+        def decorator(func):
+            def wrapper(*args, **kwargs):
+                timer = Timer(pt_name, en_name)
+                timer.start()
+                result = func(*args, **kwargs)
+                timer.stop()
+                return result
+            return wrapper
+        return decorator
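
Two details of this new module are worth noting. First, Timer reads environ['PORTUGUESE'] in __init__, raising KeyError if the variable is unset and EnvironmentError if it holds anything other than 'true' or 'false', which is why app.py must set it before any timed function runs. Second, reset, show_total, and time_it take no self and are only ever called on the class itself; in Python 3 that works as written, though @staticmethod would make the intent explicit. A minimal usage sketch, mirroring how the rest of the commit uses the class; the phase names are examples:

from os import environ
from utils.timing import Timer

environ['PORTUGUESE'] = 'false'  # Timer reads this flag at construction time
Timer.reset()                    # clear total and expander for this rerun

# decorator form: times the whole function call
@Timer.time_it('exemplo', 'example')
def slow_step():
    ...

# context-manager form: times a single block
with Timer('outra fase', 'another phase'):
    pass

slow_step()
Timer.show_total()  # success box with the accumulated total inside the expander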