Spaces:

Seyirex
/

hugginface_pretrained_article_summerizer

Runtime error

App Files Files Community

Gbadamosi_oluwaseyi committed on Apr 8, 2022

Commit

c20a951

•

1 Parent(s): d210bc0

v1

Browse files

Files changed (5) hide show

README.md +45 -2
app.py +119 -0
function.py +106 -0
requirements.txt +6 -0
summarizer_database.db +0 -0

README.md CHANGED Viewed

	@@ -1,2 +1,45 @@
1	- ~~# hugginface_pretrained_article_summerizer~~
2	-

+---
+title: Article Text Summarizer
+emoji: 💻
+colorFrom: green
+colorTo: green
+sdk: streamlit
+app_file: app.py
+pinned: false
+---
+# Configuration
+`title`: _string_
+Display title for the Space
+`emoji`: _string_
+Space emoji (only emoji characters are allowed)
+`colorFrom`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+`colorTo`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+`sdk`: _string_
+Can be either `gradio`, `streamlit`, or `static`
+`sdk_version`: _string_
+Only applicable for `streamlit` SDK.
+See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+`app_file`: _string_
+Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
+Path is relative to the root of the repository.
+`models`: _List[string]_
+HF model IDs (like "gpt2" or "deepset/roberta-base-squad2") used in the Space.
+Will be parsed automatically from your code if not specified here.
+`datasets`: _List[string]_
+HF dataset IDs (like "common_voice" or "oscar-corpus/OSCAR-2109") used in the Space.
+Will be parsed automatically from your code if not specified here.
+`pinned`: _boolean_
+Whether the Space stays on top of your list.

app.py ADDED Viewed

	@@ -0,0 +1,119 @@

+# Core Pkgs
+import streamlit as st
+from function import *
+# EDA Pkgs
+import pandas as pd
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+# Utils
+from datetime import datetime
+# Silence all Python warnings. NOTE(review): `warnings` is not among this file's visible imports;
+# it appears to be in scope only through `from function import *` -- confirm before narrowing that import.
+warnings.filterwarnings("ignore")
+# main() calls st.pyplot() with no figure argument; this option presumably hides the related
+# Streamlit deprecation notice -- TODO confirm against the installed Streamlit version.
+st.set_option('deprecation.showPyplotGlobalUse', False)
+def main():
+    """Render the Streamlit app: a sidebar menu with Home, Storage and About pages.
+
+    On the Home page the user uploads a file or types text, selects a model
+    ("Bart" or "T5") and presses the button. The cleaned text is summarized,
+    the (text, summary, timestamp) row is stored with add_data(), and the
+    summary is displayed next to a word cloud and a download link.
+    """
+    menu = ["Home", "Storage", "About"]
+    create_table()
+    choice = st.sidebar.selectbox("Menu", menu)
+    if choice == "Home":
+        st.title("Demo")
+        st.sidebar.subheader("Tuning/Settings")
+        # TODO: expose max_length / top_k / temperature as sidebar sliders.
+        model_type = st.sidebar.selectbox("Model type", options=["Bart", "T5"])
+        upload_doc = st.file_uploader("Upload a .txt, .pdf, .docx file for summarization")
+        st.markdown("<h3 style='text-align: center; color: red;'>OR</h3>", unsafe_allow_html=True)
+        plain_text = st.text_area("Type your Message...", height=200)
+        # An uploaded file takes precedence over the text area.
+        if upload_doc:
+            clean_text = preprocess_plain_text(extract_text_from_file(upload_doc))
+        else:
+            clean_text = preprocess_plain_text(plain_text)
+        summarize = st.button("Summarize...")
+        if summarize:
+            # Nothing to summarize: tell the user instead of sending blank text to the model.
+            if not clean_text.strip():
+                st.warning("Please upload a file or type some text to summarize.")
+                return
+            # Both models follow the same flow; only the loader function differs.
+            # TODO: add a Pegasus option once a pegasus() loader is enabled.
+            model_loaders = {"Bart": bart, "T5": t5}
+            text_to_summarize = clean_text
+            with st.spinner(
+                text=f"Loading {model_type} Model and Extracting summary. This might take a few seconds depending on the length of your text..."):
+                summarizer_model = model_loaders[model_type]()
+                # truncation=True keeps inputs longer than the model's limit from raising.
+                results = summarizer_model(text_to_summarize, max_length=100, min_length=30, truncation=True)
+                summarized_text = ' '.join(item['summary_text'] for item in results)
+                postdate = datetime.now()
+                # Save first, then report success, so the message is never shown for a failed insert.
+                add_data(text_to_summarize, summarized_text, postdate)
+                st.success("Data Submitted for model retraining")
+            res_col1, res_col2 = st.columns(2)
+            with res_col1:
+                st.subheader("Generated Text Visualization")
+                wordcloud = WordCloud().generate(summarized_text)
+                # Draw on an explicit figure rather than pyplot's global state, and hand it to Streamlit.
+                fig, ax = plt.subplots()
+                ax.imshow(wordcloud, interpolation='bilinear')
+                ax.axis("off")
+                st.pyplot(fig)
+                summary_downloader(summarized_text)
+            with res_col2:
+                st.subheader("Summarized Text Output")
+                st.success("Summarized Text")
+                st.write(summarized_text)
+    elif choice == "Storage":
+        st.title("Manage & Monitor Results")
+        # TODO: show the stored rows, e.g.:
+        # stored_data =  view_all_data()
+        # new_df = pd.DataFrame(stored_data,columns=["text_to_summarize","summarized_text","postdate"])
+        # st.dataframe(new_df)
+        # new_df['postdate'] = pd.to_datetime(new_df['postdate'])
+    else:
+        st.subheader("About")
+        # TODO: add the About page content, e.g.:
+        # html_temp ="""<div>
+        #          <p></p>
+        #          <p></p>
+        #          </div>"""
+        # st.markdown(html_temp, unsafe_allow_html=True)
+# Start the app only when this file is run as a script, not when it is imported.
+if __name__ == '__main__':
+	main()

function.py ADDED Viewed

	@@ -0,0 +1,106 @@

+# Core Pkgs
+import streamlit as st
+from transformers import pipeline
+from PyPDF2 import PdfFileReader
+import docx2txt
+import base64
+import  re
+import sqlite3
+import time
+from io import StringIO
+import warnings
+# Silence all Python warnings for this process.
+warnings.filterwarnings("ignore")
+# Timestamp string (YYYYMMDD-HHMMSS) captured once, when this module is imported -- not per call.
+time_str = time.strftime("%Y%m%d-%H%M%S")
+# Loader functions that build summarization pipelines from pretrained Hugging Face models
+@st.cache(allow_output_mutation=True)
+def bart():
+    """Load and return the BART summarization pipeline.
+
+    The result is cached by Streamlit, so the model is built once and then
+    reused. The returned object is called with the text to summarize.
+    """
+    # The previous checkpoint, 'amazon/bort', is a different model (BORT, not BART);
+    # use a BART checkpoint that is fine-tuned for summarization.
+    summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
+    return summarizer
+@st.cache(allow_output_mutation=True)
+def t5():
+    """Load and return the T5 summarization pipeline (t5-base, TensorFlow backend).
+
+    Cached with the same decorator as bart() so the model is not rebuilt on
+    every button press.
+    """
+    summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")
+    return summarizer
+# def pegasus():
+#     ''' Loading pegasus model using pipeline api '''
+#     summarizer = pipeline('summarization',model='google/pegasus-xsum')
+#     return summarizer
+def preprocess_plain_text(x):
+    """Clean raw text before it is summarized.
+
+    Drops non-ASCII characters, URLs, @mentions and #hashtags, then replaces
+    every run of whitespace or of characters other than letters, digits and
+    . , ! ? with a single space. Leading and trailing spaces are removed, so
+    blank input yields "".
+
+    x: the text to clean (str). Returns the cleaned str.
+    """
+    x = x.encode("ascii", "ignore").decode()  # drop non-ASCII characters
+    x = re.sub(r"https*\S+", " ", x)  # URLs
+    x = re.sub(r"@\S+", " ", x)  # mentions
+    x = re.sub(r"#\S+", " ", x)  # hashtags
+    # Whitespace is outside the allowed set, so this one pass also collapses repeated spaces.
+    x = re.sub(r"[^.,!?A-Za-z0-9]+", " ", x)  # special characters except .,!?
+    return x.strip()
+def extract_pdf(file):
+    '''Return the text of every page of a PDF, concatenated in page order.'''
+    reader = PdfFileReader(file)
+    # Pull the text of each page and join the pieces with no separator.
+    return "".join(
+        reader.getPage(page_no).extractText()
+        for page_no in range(reader.numPages)
+    )
+def extract_text_from_file(file):
+    '''Extract text from an uploaded .txt, .pdf or .docx file.
+
+    file: uploaded-file object exposing a MIME type in `.type`; plain-text
+    files are read through `.getvalue()` and decoded as UTF-8.
+
+    Returns the extracted text. For any other file type an error is shown in
+    the app and "" is returned (previously this raised UnboundLocalError).
+    '''
+    if file.type == "text/plain":
+        return file.getvalue().decode("utf-8")
+    if file.type == "application/pdf":
+        return extract_pdf(file)
+    if (
+        file.type
+        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    ):
+        return docx2txt.process(file)
+    st.error(f"Unsupported file type: {file.type}. Please upload a .txt, .pdf or .docx file.")
+    return ""
+def summary_downloader(raw_text):
+    """Render a link in the app that downloads `raw_text` as a .txt file.
+
+    The text is base64-encoded into a data: URL; the suggested filename
+    contains the time at which this function was called.
+    """
+    b64 = base64.b64encode(raw_text.encode()).decode()
+    # Take the timestamp now: the module-level time_str is fixed at import time,
+    # so it would give every download in the same process an identical filename.
+    timestamp = time.strftime("%Y%m%d-%H%M%S")
+    new_filename = "new_text_file_{}_.txt".format(timestamp)
+    st.markdown("#### Download Summary as a File ###")
+    href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
+    st.markdown(href, unsafe_allow_html=True)
+# Storage in A Database
+# Module-level SQLite connection to summarizer_database.db, opened when this module is imported.
+# check_same_thread=False disables sqlite3's check that the connection is used only by the creating thread.
+conn = sqlite3.connect('summarizer_database.db',check_same_thread=False)
+# Single cursor reused by create_table(), add_data() and view_all_data().
+c = conn.cursor()
+# Helper functions wrapping the SQL statements
+def create_table():
+    """Create the TextTable table unless it already exists."""
+    schema_sql = 'CREATE TABLE IF NOT EXISTS TextTable(text_to_summarize TEXT,summarized_text TEXT,postdate DATE)'
+    c.execute(schema_sql)
+def add_data(text_to_summarize,summarized_text,postdate):
+    """Insert one (text, summary, date) row into TextTable and commit it."""
+    insert_sql = 'INSERT INTO TextTable(text_to_summarize,summarized_text,postdate) VALUES (?,?,?)'
+    row = (text_to_summarize, summarized_text, postdate)
+    # Values are bound through placeholders rather than formatted into the SQL text.
+    c.execute(insert_sql, row)
+    conn.commit()
+def view_all_data():
+    """Return every row currently stored in TextTable."""
+    return c.execute("SELECT * FROM TextTable").fetchall()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+docx2txt==0.8
+pandas==1.3.5
+PyPDF2==1.26.0
+regex==2021.8.28
+transformers==4.17.0
+wordcloud==1.8.1

summarizer_database.db ADDED Viewed

Binary file (16.4 kB). View file