alpertml committed
Commit 06b4325
1 Parent(s): 1d4fc05

Upload 4 files

Files changed (4)
  1. app.py +112 -5
  2. config.py +51 -0
  3. pipeline.py +132 -0
  4. requirements.txt +9 -2
app.py CHANGED
@@ -1,9 +1,116 @@
+ # external libraries
  import streamlit as st
  from transformers import pipeline
-
- pipe = pipeline('sentiment-analysis')
- text = st.text_area('enter some text:')
-
- if text:
-     out = pipe(text)
-     st.json(out)
+
+ import pandas as pd
+
+ # internal libraries
+ from config import config
+ import pipeline
+
+
+ def main():
+
+     st.set_page_config(
+         layout="centered",             # Can be "centered" or "wide". In the future also "dashboard", etc.
+         initial_sidebar_state="auto",  # Can be "auto", "expanded", "collapsed"
+         page_title=config.main_title,  # String or None. Strings get appended with "• Streamlit".
+         page_icon=config.logo_path,    # String, anything supported by st.image, or None.
+     )
+
+     if "output" not in st.session_state:
+         st.session_state['data'] = pd.read_csv(config.sample_texts_path)
+         st.session_state['sample_text'] = None
+         generate_text()
+         st.session_state["output"] = False
+         st.session_state["output_text"] = ""
+         st.session_state['inputs'] = {}
+
+     col1, col2, col3 = st.columns(3)
+     col1.write(' ')
+     col2.image(config.logo_path)
+     col3.write(' ')
+
+     st.markdown(f"<h1 style='text-align: center;'>{config.main_title}</h1>", unsafe_allow_html=True)
+     st.markdown(f"<h3 style='text-align: center;'>{config.lecture_title}</h3>", unsafe_allow_html=True)
+
+     # topic modelling radio bar
+     input_topic_modelling = st.radio(
+         config.topic_modelling_title,
+         config.topic_modelling_answers,
+         horizontal=True)
+     st.session_state['inputs']['input_topic_modelling'] = input_topic_modelling
+
+     # input text area
+     input_text = st.text_area(config.input_text, st.session_state['sample_text'], height=300)
+     st.session_state['inputs']['input_text'] = input_text
+
+     # generate sample text button
+     st.button(config.button_text, on_click=generate_text)
+
+     # choosing segmenter radio bar
+     input_segmenter = st.radio(
+         config.segmenter_title,
+         config.segmenter_answers,
+         horizontal=True)
+     st.session_state['inputs']['input_segmenter'] = input_segmenter
+
+     # choosing summarizer algorithm radio bar
+     input_summarizer = st.radio(
+         config.summarizer_title,
+         config.summarizer_answers,
+         horizontal=True)
+     st.session_state['inputs']['input_summarizer'] = input_summarizer
+
+     # generating summary button
+     col1, col2, col3 = st.columns(3)
+     col1.header(' ')
+     col2.button(config.generate_text, on_click=generate_summary)
+     col3.header(' ')
+
+     if st.session_state["output"]:
+
+         TOPICS = [key for key in st.session_state["output_text"] if key != '#']
+
+         if config.filter_threshold_summaries:
+             TOPICS = [key for key in TOPICS if st.session_state["output_text"][key]['summary'] != config.threshold_error]
+
+         st.write(config.output_title)
+         options = {}
+         for topic in TOPICS:
+             option = st.checkbox(topic)
+             options[topic] = option
+
+         if len(options) == 0:
+             st.warning(config.warning_len_input_text, icon="⚠️")
+
+         for topic, option in options.items():
+             if option:
+                 st.text_area(topic,
+                              st.session_state["output_text"][topic]['summary'],
+                              disabled=True)
+
+ def generate_text():
+     # sample a sufficiently long, non-null row from the sample-text CSV
+     df = st.session_state['data']
+     df = df[~df['data'].isnull()]
+     df = df[df['data'].str.len().gt(100)]
+     st.session_state['sample_text'] = df.sample(1)['data'].values[0]
+
+ def generate_summary():
+     st.session_state["output"] = True
+
+     MODELS = {
+         'summarizer': st.session_state['inputs']['input_summarizer'],
+         'topic_modelling': st.session_state['inputs']['input_topic_modelling'],
+         'segmentizer': st.session_state['inputs']['input_segmenter']
+     }
+
+     with st.spinner('Generating the output of Topic Modeling for Summarization...'):
+         OUTPUT = pipeline.run(st.session_state['inputs']['input_text'], MODELS)
+
+     st.session_state["output_text"] = OUTPUT
+     st.success('Done!')
+
+
+ if __name__ == "__main__":
+     main()
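The output block in main() assumes st.session_state["output_text"] holds the dict returned by pipeline.run(): one entry per topic label, each carrying the source sentences and their summary. A minimal sketch of that shape (the topic labels and texts below are invented for illustration):

# Illustrative shape only; topic labels and sentences are made up.
st.session_state["output_text"] = {
    'space': {
        'source': ['NASA launched a new probe.', 'The mission will study the outer planets.'],
        'summary': 'NASA launched a probe to study the outer planets.'
    },
    'hockey': {
        'source': ['The home team won.'],
        # entries carrying the threshold message are hidden from the
        # checkbox list when config.filter_threshold_summaries is True
        'summary': 'X -> not possible to generate a summary due to threshold'
    }
}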
config.py ADDED
@@ -0,0 +1,51 @@
+ class config:
+
+     sample_texts_path = 'data/sample_text.csv'
+
+     logo_path = 'images/tum-logo.png'
+     main_title = 'Topic Modeling for Summarization'
+     lecture_title = 'Machine Learning for Natural Language Processing Applications (IN2106)'
+
+     input_text = 'Enter Some Text:'
+     button_text = 'Generate new sample text'
+
+     topic_modelling_title = 'Topic Modelling:'
+     topic_modelling_answers = ('BERTopic', 'LDA', 'CTM', 'NMF', 'Top2Vec')
+
+     segmenter_title = 'Segmentizer:'
+     segmenter_answers = ('Nltk', 'Spacy', 'Stanza')
+
+     summarizer_title = 'Summarizer:'
+     summarizer_answers = ('Bart', 'T5-base', 'Prophetnet', 'Pegasus')
+
+     generate_text = 'Generate topic based summaries'
+
+     output_title = 'Select topics you want to see summaries of:'
+
+     warning_len_input_text = 'The input text is not long enough to create topic-aware summaries. Try a longer text or different parameters!'
+     filter_threshold_summaries = True
+     threshold_error = 'X -> not possible to generate a summary due to threshold'
+
+     # model parameters
+     MIN_NUM_SENTENCES_FOR_SUMMARY_CREATION = 2
+     PATH_20_NEWS_CLUSTERID_LABEL_WORDS = 'data/_20news_df_output_clusterId_label_words.csv'
+
+     PATH_20_NEWS_CLUSTERID_LABEL_WORDS_CTM = 'data/_20news_df_output_doc_topic_CTM_LIST.csv'
+     PATH_20_NEWS_CLUSTERID_LABEL_WORDS_LDA = 'data/_20news_df_output_doc_topic_LDA_LIST.csv'
+     PATH_20_NEWS_CLUSTERID_LABEL_WORDS_NMF = 'data/_20news_df_output_doc_topic_NMF_LIST.csv'
+     PATH_20_NEWS_CLUSTERID_LABEL_WORDS_TOP2VEC = 'data/_20news_df_output_doc_topic_Top2Vec_LIST.csv'
+
+     # model paths
+     nltk_path = 'models/nltkUtilsObj.pkl'
+     sent_trans_path = 'models/sentTransfModelUtilsObj.pkl'
+
+     pegasus_model_path = 'models/pegasus_model'
+     bart_model_path = 'models/bart_model'
+     t5_model_path = 'models/t5_model'
+     prophetnet_model_path = 'models/prophetnet_model'
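config is used as a plain namespace: its values are class attributes, read off the class without instantiating it. For example:

from config import config

config.main_title          # 'Topic Modeling for Summarization'
config.summarizer_answers  # ('Bart', 'T5-base', 'Prophetnet', 'Pegasus')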
pipeline.py ADDED
@@ -0,0 +1,132 @@
+ # external libraries
+ import pickle
+ import ast
+ import numpy as np
+ import pandas as pd
+ from transformers import BartForConditionalGeneration, BartTokenizer
+
+ # internal libraries
+ from config import config
+ from src.nltk_utilities import NltkSegmentizer
+ from src.stanza_utilities import StanzaSegmentizer
+ from src.spacy_utilities import SpacySegmentizer
+ from src.preprocessing import remove_patterns
+ from src.summarization_utilities import SummarizationUtilities, BARTSummarizer, T5Summarizer, ProphetNetSummarizer
+
+
+ # module-level state, reassigned per request in define_models()
+ nltkUtilsObj = None
+
+ sentTransfModelUtilsObj = pickle.load(open(config.sent_trans_path, 'rb'))
+ sentTransfModelUtilsObj.model = sentTransfModelUtilsObj.model.to('cpu')
+
+ TopicModelling = ''
+
+ summUtilsObj = None
+
+
+ def text_to_sentences(data):
+     # split the raw text into sentences with the selected segmenter
+     list_sentences = [*nltkUtilsObj.segment_into_sentences(data)]
+     return list_sentences
+
+
+ def preprocess(list_sentences, sentTransfModelUtilsObj):
+     # clean each sentence, then drop the ones that end up empty so the
+     # embedding list and the sentence list stay index-aligned
+     list_sentences = [remove_patterns(x) for x in list_sentences]
+     list_sentences = [x for x in list_sentences if len(x) > 0]
+     list_sentences_per_doc_embeddings = [sentTransfModelUtilsObj.get_embeddings(x) for x in list_sentences]
+     return list_sentences_per_doc_embeddings, list_sentences
+
+
+ def get_emb_cluster_topic(sentTransfModelUtilsObj):
+     # embed each topic's word list as a single pseudo-sentence
+     df_latVectorRep = pd.read_csv(TopicModelling)
+     df_latVectorRep["sentence_from_words"] = df_latVectorRep["list_topic_words"].map(lambda x: " ".join(ast.literal_eval(x)))
+     list_embeddings_cluster_sentences = list()
+
+     for index, row in df_latVectorRep.iterrows():
+         list_embeddings_cluster_sentences.append(sentTransfModelUtilsObj.get_embeddings(row["sentence_from_words"]))
+
+     return list_embeddings_cluster_sentences, df_latVectorRep
+
+
+ def compute_similarity_matrix(list_sentences_per_doc_embeddings, list_sentences, sentTransfModelUtilsObj):
+     # assign every sentence to the topic whose embedding it is most similar to
+     list_embeddings_cluster_sentences, df_latVectorRep = get_emb_cluster_topic(sentTransfModelUtilsObj)
+
+     similarity_matrix = np.zeros((len(list_embeddings_cluster_sentences), len(list_sentences_per_doc_embeddings)))
+
+     for i, cluster_embedding in enumerate(list_embeddings_cluster_sentences):
+         for j, sentence_embedding in enumerate(list_sentences_per_doc_embeddings):
+             similarity_matrix[i][j] = sentTransfModelUtilsObj.compute_cosine_similarity(cluster_embedding, sentence_embedding)
+
+     list_index_topics_within_matrix = np.argmax(similarity_matrix, axis=0)
+
+     dict_topic_sentences = dict()
+
+     for index_sentence, index_id_topic in enumerate(list_index_topics_within_matrix):
+         label_class = df_latVectorRep.iloc[index_id_topic]["label_class"]
+
+         if label_class not in dict_topic_sentences:
+             dict_topic_sentences[label_class] = list()
+         dict_topic_sentences[label_class].append(list_sentences[index_sentence])
+
+     return dict_topic_sentences
+
+
+ def summarize(dict_topic_sentences):
+     summaries_report = dict()
+     for class_label in dict_topic_sentences:
+
+         summaries_report[class_label] = {}
+
+         if len(dict_topic_sentences[class_label]) >= config.MIN_NUM_SENTENCES_FOR_SUMMARY_CREATION:
+             summaries_report[class_label]["source"] = dict_topic_sentences[class_label]
+             summaries_report[class_label]["summary"] = summUtilsObj.summarize(" ".join(dict_topic_sentences[class_label]))
+         else:
+             summaries_report[class_label]["summary"] = "X -> not possible to generate a summary due to threshold"
+             summaries_report[class_label]["source"] = dict_topic_sentences[class_label]
+
+     return summaries_report
+
+
+ def define_models(MODELS):
+     global TopicModelling
+     global summUtilsObj
+     global nltkUtilsObj
+
+     if MODELS['summarizer'] == 'Pegasus':
+         summUtilsObj = SummarizationUtilities()
+     elif MODELS['summarizer'] == 'Bart':
+         summUtilsObj = BARTSummarizer()
+     elif MODELS['summarizer'] == 'T5-base':
+         summUtilsObj = T5Summarizer()
+     elif MODELS['summarizer'] == 'Prophetnet':
+         summUtilsObj = ProphetNetSummarizer()
+
+     if MODELS['topic_modelling'] == 'BERTopic':
+         TopicModelling = config.PATH_20_NEWS_CLUSTERID_LABEL_WORDS
+     elif MODELS['topic_modelling'] == 'LDA':
+         TopicModelling = config.PATH_20_NEWS_CLUSTERID_LABEL_WORDS_LDA
+     elif MODELS['topic_modelling'] == 'CTM':
+         TopicModelling = config.PATH_20_NEWS_CLUSTERID_LABEL_WORDS_CTM
+     elif MODELS['topic_modelling'] == 'NMF':
+         TopicModelling = config.PATH_20_NEWS_CLUSTERID_LABEL_WORDS_NMF
+     elif MODELS['topic_modelling'] == 'Top2Vec':
+         TopicModelling = config.PATH_20_NEWS_CLUSTERID_LABEL_WORDS_TOP2VEC
+
+     if MODELS['segmentizer'] == 'Nltk':
+         nltkUtilsObj = NltkSegmentizer()
+     elif MODELS['segmentizer'] == 'Spacy':
+         nltkUtilsObj = SpacySegmentizer()
+     elif MODELS['segmentizer'] == 'Stanza':
+         nltkUtilsObj = StanzaSegmentizer()
+
+
+ def run(data, MODELS):
+     define_models(MODELS)
+
+     data_sentences = text_to_sentences(data)
+     data_embed, list_sentences = preprocess(data_sentences, sentTransfModelUtilsObj)
+
+     dict_topic_sentences = compute_similarity_matrix(data_embed, list_sentences, sentTransfModelUtilsObj)
+     summaries_report = summarize(dict_topic_sentences)
+
+     return summaries_report
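run() is the only entry point app.py needs. A minimal sketch of driving the pipeline outside Streamlit, assuming the pickled models under models/ and the CSVs under data/ are in place (the input text here is invented):

# Hypothetical standalone usage; option strings must match the tuples in config.py.
import pipeline

MODELS = {
    'summarizer': 'Bart',
    'topic_modelling': 'BERTopic',
    'segmentizer': 'Nltk',
}

report = pipeline.run("A long, multi-sentence text to segment, assign to topics, and summarize ...", MODELS)
for topic, entry in report.items():
    print(topic, '->', entry['summary'])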
requirements.txt CHANGED
@@ -1,2 +1,9 @@
- torch
- transformers
+ nltk==3.8.1
+ numpy==1.24.3
+ pandas==2.0.0
+ sentence_transformers==2.2.2
+ spacy==3.5.2
+ stanza==1.5.0
+ streamlit==1.23.1
+ torch==2.0.1
+ transformers==4.29.1