Paula Leonova committed on
Commit
e452a5c
1 Parent(s): 1f1805f

Add keyword extraction model and clean up custom models import reference

Browse files
Files changed (3) hide show
  1. app.py +9 -8
  2. models.py +18 -0
  3. requirements.txt +1 -0
app.py CHANGED
@@ -8,7 +8,8 @@ import streamlit as st
8
  from sklearn.metrics import classification_report
9
 
10
 
11
- from models import create_nest_sentences, load_summary_model, summarizer_gen, load_model, classifier_zero
 
12
  from utils import plot_result, plot_dual_bar_chart, examples_load, example_long_text_load
13
  import json
14
 
@@ -46,12 +47,12 @@ with st.form(key='my_form'):
46
 
47
  with st.spinner('Loading pretrained summarizer mnli model...'):
48
  start = time.time()
49
- summarizer = load_summary_model()
50
  st.success(f'Time taken to load summarizer mnli model: {round(time.time() - start,4)} seconds')
51
 
52
  with st.spinner('Loading pretrained classifier mnli model...'):
53
  start = time.time()
54
- classifier = load_model()
55
  st.success(f'Time taken to load classifier mnli model: {round(time.time() - start,4)} seconds')
56
 
57
 
@@ -63,7 +64,7 @@ if submit_button:
63
  my_expander = st.expander(label='Expand to see summary generation details')
64
  with my_expander:
65
  # For each body of text, create text chunks of a certain token size required for the transformer
66
- nested_sentences = create_nest_sentences(document = text_input, token_max_length = 1024)
67
 
68
  summary = []
69
  # st.markdown("### Text Chunk & Summaries")
@@ -77,21 +78,21 @@ if submit_button:
77
  st.markdown(f"###### Original Text Chunk {n+1}/{len(nested_sentences)}" )
78
  st.markdown(text_chunk)
79
 
80
- chunk_summary = summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
81
  summary.append(chunk_summary)
82
  st.markdown(f"###### Partial Summary {n+1}/{len(nested_sentences)}")
83
  st.markdown(chunk_summary)
84
  # Combine all the summaries into a list and compress into one document, again
85
  final_summary = " \n\n".join(list(summary))
86
 
87
- # final_summary = summarizer_gen(summarizer, sequence=text_input, maximum_tokens = 30, minimum_tokens = 100)
88
  st.markdown("### Combined Summary")
89
  st.markdown(final_summary)
90
 
91
 
92
  st.markdown("### Top Label Predictions on Summary & Full Text")
93
  with st.spinner('Matching labels...'):
94
- topics, scores = classifier_zero(classifier, sequence=final_summary, labels=labels, multi_class=True)
95
  # st.markdown("### Top Label Predictions: Combined Summary")
96
  # plot_result(topics[::-1][:], scores[::-1][:])
97
  # st.markdown("### Download Data")
@@ -103,7 +104,7 @@ if submit_button:
103
  # unsafe_allow_html = True
104
  # )
105
 
106
- topics_ex_text, scores_ex_text = classifier_zero(classifier, sequence=example_text, labels=labels, multi_class=True)
107
  plot_dual_bar_chart(topics, scores, topics_ex_text, scores_ex_text)
108
 
109
  data_ex_text = pd.DataFrame({'label': topics_ex_text, 'scores_from_full_text': scores_ex_text})
 
8
  from sklearn.metrics import classification_report
9
 
10
 
11
+ # from models import create_nest_sentences, load_summary_model, summarizer_gen, load_model, classifier_zero
12
+ import models as md
13
  from utils import plot_result, plot_dual_bar_chart, examples_load, example_long_text_load
14
  import json
15
 
 
47
 
48
  with st.spinner('Loading pretrained summarizer mnli model...'):
49
  start = time.time()
50
+ summarizer = md.load_summary_model()
51
  st.success(f'Time taken to load summarizer mnli model: {round(time.time() - start,4)} seconds')
52
 
53
  with st.spinner('Loading pretrained classifier mnli model...'):
54
  start = time.time()
55
+ classifier = md.load_model()
56
  st.success(f'Time taken to load classifier mnli model: {round(time.time() - start,4)} seconds')
57
 
58
 
 
64
  my_expander = st.expander(label='Expand to see summary generation details')
65
  with my_expander:
66
  # For each body of text, create text chunks of a certain token size required for the transformer
67
+ nested_sentences = md.create_nest_sentences(document = text_input, token_max_length = 1024)
68
 
69
  summary = []
70
  # st.markdown("### Text Chunk & Summaries")
 
78
  st.markdown(f"###### Original Text Chunk {n+1}/{len(nested_sentences)}" )
79
  st.markdown(text_chunk)
80
 
81
+ chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
82
  summary.append(chunk_summary)
83
  st.markdown(f"###### Partial Summary {n+1}/{len(nested_sentences)}")
84
  st.markdown(chunk_summary)
85
  # Combine all the summaries into a list and compress into one document, again
86
  final_summary = " \n\n".join(list(summary))
87
 
88
+ # final_summary = md.summarizer_gen(summarizer, sequence=text_input, maximum_tokens = 30, minimum_tokens = 100)
89
  st.markdown("### Combined Summary")
90
  st.markdown(final_summary)
91
 
92
 
93
  st.markdown("### Top Label Predictions on Summary & Full Text")
94
  with st.spinner('Matching labels...'):
95
+ topics, scores = md.classifier_zero(classifier, sequence=final_summary, labels=labels, multi_class=True)
96
  # st.markdown("### Top Label Predictions: Combined Summary")
97
  # plot_result(topics[::-1][:], scores[::-1][:])
98
  # st.markdown("### Download Data")
 
104
  # unsafe_allow_html = True
105
  # )
106
 
107
+ topics_ex_text, scores_ex_text = md.classifier_zero(classifier, sequence=example_text, labels=labels, multi_class=True)
108
  plot_dual_bar_chart(topics, scores, topics_ex_text, scores_ex_text)
109
 
110
  data_ex_text = pd.DataFrame({'label': topics_ex_text, 'scores_from_full_text': scores_ex_text})
models.py CHANGED
@@ -1,6 +1,7 @@
1
  import torch
2
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
3
  import streamlit as st
 
4
 
5
 
6
  import spacy
@@ -29,6 +30,23 @@ def create_nest_sentences(document:str, token_max_length = 1024):
29
  nested.append(sent)
30
  return nested
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  # Reference: https://huggingface.co/facebook/bart-large-mnli
33
  @st.cache(allow_output_mutation=True)
34
  def load_summary_model():
 
1
  import torch
2
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
3
  import streamlit as st
4
+ from keybert import KeyBERT
5
 
6
 
7
  import spacy
 
30
  nested.append(sent)
31
  return nested
32
 
33
# Reference: https://github.com/MaartenGr/KeyBERT
@st.cache(allow_output_mutation=True)
def load_keyword_model():
    """Load and cache a KeyBERT keyword-extraction model.

    Returns:
        KeyBERT: a ready-to-use keyword extraction model instance.
    """
    kw_model = KeyBERT()
    # BUG FIX: original returned `ky_model` (undefined name), which would
    # raise NameError on first call; return the variable actually created.
    return kw_model
38
+
39
+ def keyword_gen(sequence:str):
40
+ keywords = kw_model.extract_keywords(sequence,
41
+ keyphrase_ngram_range=(1, 1),
42
+ stop_words='english',
43
+ use_mmr=True,
44
+ diversity=0.5,
45
+ top_n=10)
46
+ return keywords
47
+
48
+
49
+
50
  # Reference: https://huggingface.co/facebook/bart-large-mnli
51
  @st.cache(allow_output_mutation=True)
52
  def load_summary_model():
requirements.txt CHANGED
@@ -4,5 +4,6 @@ streamlit
4
  plotly
5
  torch
6
  sklearn
 
7
  spacy>=2.2.0,<3.0.0
8
  https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz#egg=en_core_web_sm
 
4
  plotly
5
  torch
6
  sklearn
7
+ KeyBERT
8
  spacy>=2.2.0,<3.0.0
9
  https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz#egg=en_core_web_sm