Spaces:

EE21
/

ToS-Summarization

Running

File size: 7,046 Bytes

c0ae847
af90ec4
5f89cc0
 
 
 
 
 
cc0d652
2d2d28b
50d3158
7a228d7
d90f05d
4b1dcc8
5f89cc0
46193fd
5f89cc0
 
2d2d28b
 
 
845351c
5f89cc0
20d6d68
 
 
 
 
 
 
 
03ddbfd
20d6d68
 
 
 
99ba17a
20d6d68
 
1a0e07b
20d6d68
03ddbfd
20d6d68
5f89cc0
d90f05d
1918f01
1df149b
2115e8c
20d6d68
 
75d2ced
20d6d68
2d2d28b
 
fdcbe89
2d2d28b
20d6d68
71f1303
2a61e91
 
 
03ddbfd
20d6d68
 
b87bcef
 
c7464b2
2d2d28b
20d6d68
2d2d28b
b87bcef
 
5f89cc0
 
 
 
 
b87bcef
94d59bf
f497f7f
2a61e91
20d6d68
94d59bf
 
 
 
 
5f89cc0
fb5ed70
1918f01
da1f55e
 
5f89cc0
da1f55e
7a228d7
ccd2173
97f7d3e
5f89cc0
8cff1a5
b3bc472
 
 
5f89cc0
eea15da
b3bc472
 
 
d7485e8
d90f05d
 
 
432c28d
d7485e8
c0ade28
13e8889
d7485e8
2115e8c
20d6d68
 
1df149b
cce90fc
 
97f7d3e
6f4ddaf
af8a888
6f4ddaf
 
 
 
 
 
 
 
0eb61e5
 
2115e8c
1a0e07b
2115e8c
 
 
 
 
6349b5a
6a4b9ce
 
6349b5a
6a4b9ce
6349b5a
6a4b9ce
6349b5a
2115e8c
20d6d68
b87bcef

import streamlit as st
import re
import PyPDF2
import matplotlib.pyplot as plt
import io
from wordcloud import WordCloud
from PIL import Image

from rouge import Rouge
from datasets import load_dataset
from extractive_summarization import summarize_with_textrank, summarize_with_lsa
from abstractive_summarization import summarize_with_bart_cnn, summarize_with_bart_ft, summarize_with_led, summarize_with_t5
#from keyword_extraction import extract_keywords
from keyphrase_extraction import extract_sentences_with_obligations
from hybrid_summarization import summarize_hybrid

#-------------------------------------------------------------------#
# Load the EE21/ToS-Summaries dataset through `datasets.load_dataset`
dataset = load_dataset("EE21/ToS-Summaries")

# Build one generic display label ("Document 0", "Document 1", ...) per row
# of the train split; these labels feed the document dropdown
tos_titles = list(map("Document {}".format, range(len(dataset['train']))))

# Switch the Streamlit page to the wide layout
st.set_page_config(layout="wide")

# Extract the text of an uploaded PDF so it can be summarized
def load_pdf(file) -> str:
    """Return the concatenated text of every page of a PDF.

    Args:
        file: The PDF source handed to ``PyPDF2.PdfReader``. In this file the
            caller passes the object returned by ``st.file_uploader``.

    Returns:
        The text of all pages joined in page order with no separator. A page
        whose ``extract_text()`` result is falsy contributes an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # Join once instead of growing a string with ``+=`` page by page
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Main app

# Maps each radio-button label to the function that produces its summary.
# Insertion order is the order in which the options are shown in the UI.
_SUMMARIZERS = {
    "Hybrid (RAKE + BART Fine-tuned)": summarize_hybrid,
    "Abstractive (LongT5)": summarize_with_t5,
    "Abstractive (LED)": summarize_with_led,
    "Abstractive (BART Fine-tuned)": summarize_with_bart_ft,
    "Abstractive (BART-large-CNN)": summarize_with_bart_cnn,
    "Extractive (TextRank)": summarize_with_textrank,
    "Extractive (Latent Semantic Analysis)": summarize_with_lsa,
    "Keyphrase Extraction (RAKE)": extract_sentences_with_obligations,
}

# (display label, key in the result returned by Rouge.get_scores)
_ROUGE_METRICS = (
    ("ROUGE-1", "rouge-1"),
    ("ROUGE-2", "rouge-2"),
    ("ROUGE-L", "rouge-l"),
)

# Inline CSS shared by the three ROUGE score boxes
_ROUGE_BOX_STYLE = (
    "text-align: center; color: black; border: 1px solid #cccccc; "
    "padding: 5px; border-radius: 4px;"
)


def _show_word_cloud(summary):
    """Render a word cloud of *summary* in the current Streamlit container."""
    try:
        wordcloud = WordCloud(
            width=800, height=400, background_color='white', max_words=20
        ).generate(summary)
    except ValueError:
        # WordCloud raises ValueError when the text yields no words to plot
        # (e.g. an empty summary); show a note instead of crashing the page.
        st.info("The summary contains no words to build a word cloud from.")
        return

    # Convert the rendered PIL image to PNG bytes for st.image
    buf = io.BytesIO()
    wordcloud.to_image().save(buf, format='PNG')
    st.image(buf.getvalue(), caption='Word Cloud of Summary', use_column_width=True)


def _show_rouge_scores(summary, reference_summary):
    """Render ROUGE-1/2/L F-scores of *summary* against *reference_summary*."""
    try:
        scores = Rouge().get_scores(summary, reference_summary)
    except ValueError:
        # The rouge package raises ValueError for an empty hypothesis or
        # reference; there is nothing meaningful to display in that case.
        st.info("ROUGE scores could not be computed for this summary.")
        return

    # Dedicated names for the score columns so the page-level layout columns
    # are not shadowed.
    for score_col, (label, key) in zip(st.columns(3), _ROUGE_METRICS):
        with score_col:
            st.markdown(
                f"<p style='{_ROUGE_BOX_STYLE}'>{label}: {scores[0][key]['f']:.4f}</p>",
                unsafe_allow_html=True,
            )


def main() -> None:
    """Build the QuickToS page: pick a summarizer, pick an input, show results.

    The page has three columns: the summarizer choice, the input widgets
    (free text, PDF upload, or a document from the dataset) with the
    "Summarize" button, and the output (summary, word cloud and, for dataset
    documents, ROUGE scores against the dataset's reference summary).

    The summary is kept in ``st.session_state.summary`` so it survives
    Streamlit reruns. ``st.session_state.summary_reference_index`` records
    which dataset document the summary was produced from (``None`` for text
    or PDF input), so ROUGE is always computed against the matching
    reference rather than whatever happens to be selected at render time.
    """
    st.title("QuickToS - Terms of Service Summarizer")

    # Layout: 3 columns
    col1, col2, col3 = st.columns([1, 3, 2], gap="large")

    # Left column: Radio buttons for summarizer choice
    with col1:
        radio_selection = st.radio("Choose type of summarizer", list(_SUMMARIZERS))

    # Middle column: Text input and File uploader
    with col2:
        user_input = st.text_area("Enter a text")
        uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

        # Dropdown for selecting the document
        tos_selection_index = st.selectbox(
            "Select a Terms of Service Document (only for testing purposes)",
            range(len(tos_titles)),
            format_func=lambda x: tos_titles[x],
        )

        if st.button("Summarize"):
            # The dropdown index must not take part in this check: index 0 is
            # falsy and would silently disable the warning.
            if uploaded_file and user_input:
                st.warning("Please provide either text input or a PDF file, not both.")
                return

            reference_index = None
            if uploaded_file:
                # Extract text from PDF
                file_content = load_pdf(uploaded_file)
                st.write("PDF uploaded successfully.")
            elif user_input:
                file_content = user_input
            elif tos_selection_index is not None:
                file_content = dataset['train'][tos_selection_index]['plain_text']
                reference_index = tos_selection_index
            else:
                st.warning("Please upload a PDF, enter some text, or select a document to summarize.")
                return

            # Run the summarizer that belongs to the selected radio option
            st.session_state.summary = _SUMMARIZERS[radio_selection](file_content)
            st.session_state.summary_reference_index = reference_index

    # Right column: Displaying text after pressing 'Summarize'
    with col3:
        st.write("Summary")
        if 'summary' in st.session_state:
            summary = st.session_state.summary
            st.write(summary)
            _show_word_cloud(summary)

            # ROUGE only applies when the summary came from a dataset
            # document that carries a reference summary.
            reference_index = st.session_state.get("summary_reference_index")
            if reference_index is not None:
                reference_row = dataset['train'][reference_index]
                if 'summary' in reference_row:
                    _show_rouge_scores(summary, reference_row['summary'])

# Build the page when this file is executed as a script
if __name__ == "__main__":
    main()