Gbadamosi_oluwaseyi committed on
Commit
c20a951
1 Parent(s): d210bc0
Files changed (5) hide show
  1. README.md +45 -2
  2. app.py +119 -0
  3. function.py +106 -0
  4. requirements.txt +6 -0
  5. summarizer_database.db +0 -0
README.md CHANGED
@@ -1,2 +1,45 @@
1
- # hugginface_pretrained_article_summerizer
2
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Article Text Summarizer
3
+ emoji: 💻
4
+ colorFrom: green
5
+ colorTo: green
6
+ sdk: streamlit
7
+ app_file: app.py
8
+ pinned: false
9
+ ---
10
+
11
+ # Configuration
12
+
13
+ `title`: _string_
14
+ Display title for the Space
15
+
16
+ `emoji`: _string_
17
+ Space emoji (emoji-only character allowed)
18
+
19
+ `colorFrom`: _string_
20
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
21
+
22
+ `colorTo`: _string_
23
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
24
+
25
+ `sdk`: _string_
26
+ Can be either `gradio`, `streamlit`, or `static`
27
+
28
+ `sdk_version` : _string_
29
+ Only applicable for `streamlit` SDK.
30
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
31
+
32
+ `app_file`: _string_
33
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
34
+ Path is relative to the root of the repository.
35
+
36
+ `models`: _List[string]_
37
+ HF model IDs (like "gpt2" or "deepset/roberta-base-squad2") used in the Space.
38
+ Will be parsed automatically from your code if not specified here.
39
+
40
+ `datasets`: _List[string]_
41
+ HF dataset IDs (like "common_voice" or "oscar-corpus/OSCAR-2109") used in the Space.
42
+ Will be parsed automatically from your code if not specified here.
43
+
44
+ `pinned`: _boolean_
45
+ Whether the Space stays on top of your list.
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Pkgs
2
+ import streamlit as st
3
+ from function import *
4
+ # EDA Pkgs
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+ from wordcloud import WordCloud
8
+ # Utils
9
+ from datetime import datetime
10
# `warnings` is imported explicitly here: previously the name was only
# available because `from function import *` happened to re-export it.
import warnings

# Suppress all Python warnings for the app process.
warnings.filterwarnings("ignore")

# NOTE(review): presumably disables Streamlit's deprecation notice for calling
# st.pyplot() without an explicit figure -- confirm against the installed
# Streamlit version.
st.set_option('deprecation.showPyplotGlobalUse', False)
13
+
14
def main():
    """Render the Streamlit app.

    The sidebar menu selects one of three pages:

    * "Home"    -- accept an uploaded file or typed text, summarize it with the
                   selected model, store the result in the database, and show
                   a word cloud, a download link and the summary text.
    * "Storage" -- placeholder page for browsing stored results.
    * "About"   -- placeholder page.
    """
    menu = ["Home", "Storage", "About"]
    create_table()

    choice = st.sidebar.selectbox("Menu", menu)

    if choice == "Home":
        st.title("Demo")

        st.sidebar.subheader("Tuning/Settings")
        # max_length= st.sidebar.slider("Maximum length of the generated text ",30,100)
        # top_k= st.sidebar.slider(" limits the sampled tokens to the top k values ",1,100)
        # temperature= st.sidebar.slider("Controls the craziness of the text ",0.7,100.0)
        model_type = st.sidebar.selectbox("Model type", options=["Bart", "T5"])

        upload_doc = st.file_uploader("Upload a .txt, .pdf, .docx file for summarization")

        st.markdown("<h3 style='text-align: center; color: red;'>OR</h3>", unsafe_allow_html=True,)

        plain_text = st.text_area("Type your Message...", height=200)

        # An uploaded document takes precedence over the text area.
        if upload_doc:
            clean_text = preprocess_plain_text(extract_text_from_file(upload_doc))
        else:
            clean_text = preprocess_plain_text(plain_text)

        summarize = st.button("Summarize...")

        # Runs only on the rerun triggered by pressing the button.
        if summarize:
            # Nothing to summarize: tell the user instead of sending an empty
            # string to the model.
            if not clean_text.strip():
                st.warning("Please upload a document or type some text to summarize.")
                return

            # One code path for every model; the loaders live in function.py.
            # Pegasus ('google/pegasus-xsum') is currently disabled there; add
            # it to this table once pegasus() is re-enabled.
            model_loaders = {"Bart": bart, "T5": t5}
            text_to_summarize = clean_text

            with st.spinner(
                text=f"Loading {model_type} Model and Extracting summary. This might take a few seconds depending on the length of your text..."):
                summarizer_model = model_loaders[model_type]()
                # truncation=True clips inputs longer than the model's maximum
                # sequence length instead of failing on long documents.
                summaries = summarizer_model(
                    text_to_summarize, max_length=100, min_length=30, truncation=True
                )
                summarized_text = ' '.join(summ['summary_text'] for summ in summaries)

            st.success("Data Submitted for model retraining")
            postdate = datetime.now()
            # Add Data To Database
            add_data(text_to_summarize, summarized_text, postdate)

            res_col1, res_col2 = st.columns(2)
            with res_col1:
                st.subheader("Generated Text Visualization")
                # Create and generate a word cloud image:
                wordcloud = WordCloud().generate(summarized_text)
                # Draw on an explicit figure and hand that figure to Streamlit,
                # rather than relying on pyplot's global state and plt.show().
                fig, ax = plt.subplots()
                ax.imshow(wordcloud, interpolation='bilinear')
                ax.axis("off")
                st.pyplot(fig)
                summary_downloader(summarized_text)

            with res_col2:
                st.subheader("Summarized Text Output")
                st.success("Summarized Text")
                st.write(summarized_text)

    elif choice == "Storage":
        st.title("Manage & Monitor Results")
        # stored_data = view_all_data()
        # new_df = pd.DataFrame(stored_data,columns=["text_to_summarize","summarized_text","postdate"])
        # st.dataframe(new_df)
        # new_df['postdate'] = pd.to_datetime(new_df['postdate'])

    else:
        st.subheader("About")
        # html_temp ="""<div>
        # <p></p>
        # <p></p>
        # </div>"""
        # st.markdown(html_temp, unsafe_allow_html=True)


if __name__ == '__main__':
    main()
function.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Pkgs
2
+ import streamlit as st
3
+ from transformers import pipeline
4
+ from PyPDF2 import PdfFileReader
5
+ import docx2txt
6
+ import base64
7
+ import re
8
+ import sqlite3
9
+ import time
10
+ from io import StringIO
11
+ import warnings
12
# Silence every warning category for the whole process.
warnings.simplefilter("ignore")

# Local-time stamp (YYYYMMDD-HHMMSS) captured once, when this module is imported.
time_str = time.strftime("%Y%m%d-%H%M%S", time.localtime())
15
+ # Functions that load the summarization pipelines from Hugging Face models
16
@st.cache(allow_output_mutation=True)
def bart():
    """Load the BART summarization model through the pipeline API.

    Decorated with ``st.cache`` so the pipeline is built once and reused on
    later calls instead of being re-created each time.

    Returns:
        A ``transformers`` summarization pipeline.
    """
    # 'facebook/bart-large-cnn' is a BART checkpoint fine-tuned for
    # summarization. The previous value, 'amazon/bort', is a BERT-style
    # encoder and not a BART summarization model.
    summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
    return summarizer
21
+
22
@st.cache(allow_output_mutation=True)
def t5():
    """Load the T5 summarization model through the pipeline API.

    Cached with ``st.cache`` in the same way as ``bart()`` so the pipeline is
    built once and reused rather than being re-created on every call.

    Returns:
        A ``transformers`` summarization pipeline backed by ``t5-base``.
    """
    # framework="tf" selects the TensorFlow implementation of the model.
    summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")
    return summarizer
26
+
27
+ # def pegasus():
28
+ # ''' Loading pegasus model using pipeline api '''
29
+ # summarizer = pipeline('summarization',model='google/pegasus-xsum')
30
+ # return summarizer
31
+
32
def preprocess_plain_text(x):
    """Clean raw text before it is passed to a summarization model.

    Steps, in order: drop non-ASCII characters, remove URLs, @mentions and
    #hashtags, collapse whitespace runs, and replace every run of characters
    other than letters, digits and ``.,!?`` with a single space.

    Args:
        x: The raw input string.

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    x = x.encode("ascii", "ignore").decode()  # drop non-ASCII characters
    # Match only real http:// or https:// URLs. The earlier pattern
    # (``https*\S+``) also deleted ordinary words that merely start with
    # "http", such as "httpd".
    x = re.sub(r"https?://\S+", " ", x)  # urls
    x = re.sub(r"@\S+", " ", x)  # mentions
    x = re.sub(r"#\S+", " ", x)  # hashtags
    x = re.sub(r"\s{2,}", " ", x)  # repeated whitespace
    x = re.sub(r"[^.,!?A-Za-z0-9]+", " ", x)  # special characters except .,!?

    return x
42
+
43
def extract_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: The PDF source handed to ``PdfFileReader``.

    Returns:
        One string holding the extracted text of all pages.
    """
    reader = PdfFileReader(file)
    page_texts = [
        reader.getPage(page_index).extractText()
        for page_index in range(reader.numPages)
    ]
    return "".join(page_texts)
55
+
56
+
57
def extract_text_from_file(file):
    """Extract the text content of an uploaded file.

    Supported MIME types (read from ``file.type``):

    * ``text/plain`` -- the bytes from ``file.getvalue()`` decoded as UTF-8
    * ``application/pdf`` -- delegated to ``extract_pdf``
    * ``application/vnd.openxmlformats-officedocument.wordprocessingml.document``
      -- delegated to ``docx2txt.process``

    Args:
        file: An uploaded-file object exposing ``type`` and, for plain text,
            ``getvalue()``.

    Returns:
        The extracted text as a string.

    Raises:
        ValueError: If ``file.type`` is not one of the supported types.
            Previously this case failed with an ``UnboundLocalError``.
    """
    file_type = file.type

    # read text file
    if file_type == "text/plain":
        return file.getvalue().decode("utf-8")

    # read pdf file
    if file_type == "application/pdf":
        return extract_pdf(file)

    # read docx file
    if file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return docx2txt.process(file)

    raise ValueError(
        f"Unsupported file type {file_type!r}; expected a .txt, .pdf or .docx file"
    )
81
+
82
def summary_downloader(raw_text):
    """Render a link that downloads ``raw_text`` as a ``.txt`` file.

    The text is base64-encoded into a ``data:`` URI, so the download needs no
    server-side file.

    Args:
        raw_text: The summary text to offer for download.
    """
    b64 = base64.b64encode(raw_text.encode()).decode()
    # Stamp the filename at call time. The module-level ``time_str`` is fixed
    # when the module is first imported, so it would give every download in a
    # long-running app the same stale timestamp.
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    new_filename = "new_text_file_{}_.txt".format(timestamp)
    st.markdown("#### Download Summary as a File ###")
    href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
    st.markdown(href, unsafe_allow_html=True)
89
+
90
+
91
+ # Storage in A Database
92
# One module-wide connection and cursor, shared by the helpers below.
# check_same_thread=False lets the connection be used from threads other than
# the one that created it.
conn = sqlite3.connect('summarizer_database.db', check_same_thread=False)
c = conn.cursor()


def create_table():
    """Create the ``TextTable`` table if it does not exist yet."""
    ddl = 'CREATE TABLE IF NOT EXISTS TextTable(text_to_summarize TEXT,summarized_text TEXT,postdate DATE)'
    c.execute(ddl)


def add_data(text_to_summarize, summarized_text, postdate):
    """Insert one (source text, summary, date) row into ``TextTable`` and commit."""
    row = (text_to_summarize, summarized_text, postdate)
    c.execute('INSERT INTO TextTable(text_to_summarize,summarized_text,postdate) VALUES (?,?,?)', row)
    conn.commit()


def view_all_data():
    """Return every row of ``TextTable`` as a list of tuples."""
    rows = c.execute("SELECT * FROM TextTable").fetchall()
    return rows
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
1
+ docx2txt==0.8
2
+ pandas==1.3.5
3
+ PyPDF2==1.26.0
4
+ regex==2021.8.28
5
+ transformers==4.17.0
6
+ wordcloud==1.8.1
summarizer_database.db ADDED
Binary file (16.4 kB). View file