Paula Leonova committed
Commit: e452a5c
Parent(s): 1f1805f

Add keyword extraction model and clean up custom models import reference

Files changed:
- app.py            +9 -8
- models.py         +18 -0
- requirements.txt  +1 -0
app.py
CHANGED
@@ -8,7 +8,8 @@ import streamlit as st
 from sklearn.metrics import classification_report


-from models import create_nest_sentences, load_summary_model, summarizer_gen, load_model, classifier_zero
+# from models import create_nest_sentences, load_summary_model, summarizer_gen, load_model, classifier_zero
+import models as md
 from utils import plot_result, plot_dual_bar_chart, examples_load, example_long_text_load
 import json

@@ -46,12 +47,12 @@ with st.form(key='my_form'):

     with st.spinner('Loading pretrained summarizer mnli model...'):
         start = time.time()
-        summarizer = load_summary_model()
+        summarizer = md.load_summary_model()
         st.success(f'Time taken to load summarizer mnli model: {round(time.time() - start,4)} seconds')

     with st.spinner('Loading pretrained classifier mnli model...'):
         start = time.time()
-        classifier = load_model()
+        classifier = md.load_model()
         st.success(f'Time taken to load classifier mnli model: {round(time.time() - start,4)} seconds')


@@ -63,7 +64,7 @@ if submit_button:
     my_expander = st.expander(label='Expand to see summary generation details')
     with my_expander:
         # For each body of text, create text chunks of a certain token size required for the transformer
-        nested_sentences = create_nest_sentences(document = text_input, token_max_length = 1024)
+        nested_sentences = md.create_nest_sentences(document = text_input, token_max_length = 1024)

         summary = []
         # st.markdown("### Text Chunk & Summaries")
@@ -77,21 +78,21 @@ if submit_button:
             st.markdown(f"###### Original Text Chunk {n+1}/{len(nested_sentences)}" )
             st.markdown(text_chunk)

-            chunk_summary = summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
+            chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
             summary.append(chunk_summary)
             st.markdown(f"###### Partial Summary {n+1}/{len(nested_sentences)}")
             st.markdown(chunk_summary)
         # Combine all the summaries into a list and compress into one document, again
         final_summary = " \n\n".join(list(summary))

-        # final_summary = summarizer_gen(summarizer, sequence=text_input, maximum_tokens = 30, minimum_tokens = 100)
+        # final_summary = md.summarizer_gen(summarizer, sequence=text_input, maximum_tokens = 30, minimum_tokens = 100)
     st.markdown("### Combined Summary")
     st.markdown(final_summary)


     st.markdown("### Top Label Predictions on Summary & Full Text")
     with st.spinner('Matching labels...'):
-        topics, scores = classifier_zero(classifier, sequence=final_summary, labels=labels, multi_class=True)
+        topics, scores = md.classifier_zero(classifier, sequence=final_summary, labels=labels, multi_class=True)
         # st.markdown("### Top Label Predictions: Combined Summary")
         # plot_result(topics[::-1][:], scores[::-1][:])
         # st.markdown("### Download Data")
@@ -103,7 +104,7 @@ if submit_button:
         # unsafe_allow_html = True
         # )

-        topics_ex_text, scores_ex_text = classifier_zero(classifier, sequence=example_text, labels=labels, multi_class=True)
+        topics_ex_text, scores_ex_text = md.classifier_zero(classifier, sequence=example_text, labels=labels, multi_class=True)
         plot_dual_bar_chart(topics, scores, topics_ex_text, scores_ex_text)

         data_ex_text = pd.DataFrame({'label': topics_ex_text, 'scores_from_full_text': scores_ex_text})
models.py
CHANGED
@@ -1,6 +1,7 @@
 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import streamlit as st
+from keybert import KeyBERT


 import spacy
@@ -29,6 +30,23 @@ def create_nest_sentences(document:str, token_max_length = 1024):
     nested.append(sent)
   return nested

+# Reference: https://github.com/MaartenGr/KeyBERT
+@st.cache(allow_output_mutation=True)
+def load_keyword_model():
+  kw_model = KeyBERT()
+  return kw_model
+
+def keyword_gen(kw_model, sequence: str):
+  keywords = kw_model.extract_keywords(sequence,
+                                       keyphrase_ngram_range=(1, 1),
+                                       stop_words='english',
+                                       use_mmr=True,
+                                       diversity=0.5,
+                                       top_n=10)
+  return keywords
+
+
+
 # Reference: https://huggingface.co/facebook/bart-large-mnli
 @st.cache(allow_output_mutation=True)
 def load_summary_model():
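For orientation, here is a minimal sketch of how these new helpers could be wired into app.py in a follow-up change; this commit only adds them to models.py. The spinner block, timing message, and variable names below are assumptions that mirror the existing summarizer and classifier blocks in app.py (and rely on its existing st, time, md, and final_summary), not code from this commit.

# Hypothetical follow-up wiring in app.py (not part of this commit):
# load the cached KeyBERT model once, then extract keywords from the combined summary.
with st.spinner('Loading keyword extraction model...'):
    start = time.time()
    kw_model = md.load_keyword_model()
    st.success(f'Time taken to load keyword model: {round(time.time() - start,4)} seconds')

# keyword_gen returns up to top_n=10 (keyword, score) pairs, diversified via MMR
keywords = md.keyword_gen(kw_model, sequence=final_summary)
st.markdown("### Top Keywords from Summary")
st.markdown(", ".join(kw for kw, score in keywords))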
requirements.txt
CHANGED
@@ -4,5 +4,6 @@ streamlit
 plotly
 torch
 sklearn
+KeyBERT
 spacy>=2.2.0,<3.0.0
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz#egg=en_core_web_sm
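A quick sanity check that the new dependency installs and imports cleanly, assuming a standard pip environment after installing requirements.txt; the example sentence is arbitrary, and the first KeyBERT() call downloads a default sentence-transformers model.

# Quick import and extraction check for the new KeyBERT requirement
from keybert import KeyBERT

kw_model = KeyBERT()  # downloads its default embedding model on first use
print(kw_model.extract_keywords("streamlit app for summarizing and labeling long documents"))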