Spaces:
Sleeping
Sleeping
Commit
•
e1ed456
1
Parent(s):
80ef9b7
Upload app.py (#1)
Browse files
- Upload app.py (3ac5220e974708c6f12fe52b2a4b5427181eefe4)
Co-authored-by: Dawit Andebrhan Teklay <Dada-Andat@users.noreply.huggingface.co>
app.py
CHANGED
@@ -7,22 +7,19 @@ import numpy as np
|
|
7 |
import transformers
|
8 |
import streamlit as st
|
9 |
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
10 |
-
#from transformers import sentencepiece
|
11 |
-
import sentencepiece as spm
|
12 |
import webbrowser
|
13 |
|
14 |
|
15 |
-
activities
|
16 |
-
|
17 |
-
choice = st.sidebar.selectbox("OPTIONS", activities)
|
18 |
|
19 |
|
20 |
##################################
|
21 |
-
model = AutoModelForSeq2SeqLM.from_pretrained("
|
22 |
|
23 |
-
#
|
24 |
|
25 |
-
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
|
26 |
|
27 |
##################################
|
28 |
|
@@ -31,15 +28,16 @@ tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
|
|
31 |
|
32 |
#text_input = st.text_input("Enter some text:")
|
33 |
|
34 |
-
#
|
35 |
-
|
36 |
-
#tokenizer = AutoTokenizer.from_pretrained("tuner007/pegasus_paraphrase")
|
37 |
-
|
38 |
|
39 |
-
#
|
40 |
#tokenizer = AutoTokenizer.from_pretrained("Nattiman/CHATSUMMARY")
|
41 |
|
42 |
|
|
|
43 |
def generate_summary(text, max_length=100, min_length=30):
|
44 |
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
|
45 |
summary = summarizer(text, max_length=max_length,
|
@@ -48,46 +46,42 @@ def generate_summary(text, max_length=100, min_length=30):
|
|
48 |
return summary[0]["summary_text"]
|
49 |
|
50 |
|
|
|
|
|
51 |
####################################################################
|
52 |
def main():
|
|
|
53 |
|
54 |
-
if choice
|
55 |
-
|
56 |
-
st.
|
57 |
-
st.markdown(
|
58 |
-
"<h1 style='color: #FF9900; font-size: 3em;'>SAMSUNG INNOVATION CAMPUS</h1>", unsafe_allow_html=True)
|
59 |
st.subheader("Welcome to Our Dialogue Summarizer App!")
|
60 |
-
st.markdown(
|
61 |
-
">*This is a capstone project developed by Group-6 under the supervision of SIC team*.")
|
62 |
st.markdown("---")
|
63 |
#txt = st.text_area('Enter your long dialogue below please')
|
64 |
-
#txt_out = st.text_area('Output summary')
|
65 |
|
66 |
-
elif choice
|
67 |
-
st.markdown(
|
68 |
-
|
69 |
-
input_dialogue = st.text_area("Enter Your Dialogue Below", "Type here")
|
70 |
if st.button("Summarize"):
|
71 |
summary = generate_summary(input_dialogue)
|
72 |
-
st.markdown(
|
73 |
-
"*<h1 style='color: #9925be; font-size: 1.2em;'>Here is your summarized dialogue*</h1>", unsafe_allow_html=True)
|
74 |
st.write(summary)
|
75 |
|
76 |
-
elif choice
|
77 |
st.subheader("")
|
78 |
-
st.markdown(
|
79 |
-
"<h1 style='color: #FF9900; font-size: 3em;'>TRAINING DATASET Info</h1>", unsafe_allow_html=True)
|
80 |
st.header("Dataset Card for SAMSum Corpus")
|
81 |
-
st.markdown("> *Dataset Summary\n The SAMSum dataset contains about 16k messenger-like conversations with summaries. Conversations were created and written down by linguists fluent in English. Linguists were asked to create conversations similar to those they write on a daily basis, reflecting the proportion of topics of their real-life messenger convesations. The style and register are diversified - conversations could be informal, semi-formal or formal, they may contain slang words, emoticons and typos. Then, the conversations were annotated with summaries. It was assumed that summaries should be a concise brief of what people talked about in the conversation in third person. The SAMSum dataset was prepared by Samsung R&D Institute Poland and is distributed for research purposes* (non-commercial licence: CC BY-NC-ND 4.0)",
|
82 |
#st.button("Read more")
|
83 |
url = 'https://huggingface.co/datasets/samsum'
|
84 |
|
85 |
# Create a button with the label 'Read More'
|
86 |
if st.button('Read More'):
|
87 |
webbrowser.open_new_tab(url)
|
88 |
-
elif choice
|
89 |
-
st.markdown(
|
90 |
-
"<h1 style='color: #FF9900; font-size: 3em;'>PEGASUS MODEL Info</h1>", unsafe_allow_html=True)
|
91 |
st.markdown(">*The Pegasus model was proposed in PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019. According to the abstract, Pegasus’ pretraining task is intentionally similar to summarization: important sentences are removed/masked from an input document and are generated together as one output sequence from the remaining sentences, similar to an extractive summary. Pegasus achieves SOTA summarization performance on all 12 downstream tasks, as measured by ROUGE and human eval. This model was contributed by sshleifer. The Authors’ code can be found here. Tips: Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pretraining objective, called Gap Sentence Generation (GSG). MLM: encoder input tokens are randomly replaced by a mask tokens and have to be predicted by the encoder (like in BERT) GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, but which has a causal mask to hide the future words like a regular auto-regressive transformer decoder.*")
|
92 |
url = 'https://huggingface.co/google/pegasus-cnn_dailymail'
|
93 |
|
@@ -95,16 +89,16 @@ def main():
|
|
95 |
if st.button('Read More'):
|
96 |
webbrowser.open_new_tab(url)
|
97 |
|
98 |
-
elif choice
|
99 |
-
st.markdown(
|
100 |
-
"<h1 style='color: #FF9900; font-size: 3em;'>ABOUT US</h1>", unsafe_allow_html=True)
|
101 |
st.markdown("> *Welcome to our website! We are a team of passionate individuals dedicated to providing new NLP based services to our customers. Our goal is to create a positive impact in the world by leveraging our expertise and innovative solutions. With passion and resilence and through experience, we strive to exceed expectations and build lasting relationships with our clients. We, the developers of this capstone project are from Fujairah Emirate. We proudly own this project as it was the product of our hectic crash course that was offered by Samsung Innovation Campus. Thank you for choosing us, and we look forward to serving you!*")
|
102 |
-
|
103 |
-
st.markdown(
|
104 |
-
">*<h1 style='color: #EA8770; font-size: 2em;'>Developers Name List</h1>*", unsafe_allow_html=True)
|
105 |
st.markdown("*<h1 style='color: #EA8790; font-size: 1.2em;'>This project was developed by: Nathan Berhe, Smon Fitwi, Dawit Andebrhan, Bereket Kibreab, Eyasu Tesfamichael, Milkias Butsuamlak</h1>*", unsafe_allow_html=True)
|
106 |
st.markdown("*<h1 style='color: #EA8790; font-size: 1.2em;'>This project was developed under the supervision of Mrs. Rabab, Mr.Mrad and Mr. Marc, honourable staffs of SIC program</h1>*", unsafe_allow_html=True)
|
107 |
|
108 |
|
109 |
-
|
|
|
|
|
110 |
main()
|
|
|
7 |
import transformers
|
8 |
import streamlit as st
|
9 |
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
|
|
|
|
10 |
import webbrowser
|
11 |
|
12 |
|
13 |
+
activities=["Home","Summarize","Training Dataset","Model info","About Us"]
|
14 |
+
choice=st.sidebar.selectbox("OPTIONS",activities)
|
|
|
15 |
|
16 |
|
17 |
##################################
|
18 |
+
#model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-cnn_dailymail")
|
19 |
|
20 |
+
#model.save_pretrained("my_model_checkpoint")
|
21 |
|
22 |
+
#tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
|
23 |
|
24 |
##################################
|
25 |
|
|
|
28 |
|
29 |
#text_input = st.text_input("Enter some text:")
|
30 |
|
31 |
+
#with open('src/fine_tuned_model.pkl', 'rb') as f:
|
32 |
+
model = AutoModelForSeq2SeqLM.from_pretrained("SmonF/YTFineTunePegasus")
|
33 |
+
#tokenizer = AutoTokenizer.from_pretrained("tuner007/pegasus_paraphrase")
|
34 |
+
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
|
35 |
|
36 |
+
#model=AutoModelForSeq2SeqLM.from_pretrained('Nattiman/CHATSUMMARY')
|
37 |
#tokenizer = AutoTokenizer.from_pretrained("Nattiman/CHATSUMMARY")
|
38 |
|
39 |
|
40 |
+
|
41 |
def generate_summary(text, max_length=100, min_length=30):
|
42 |
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
|
43 |
summary = summarizer(text, max_length=max_length,
|
|
|
46 |
return summary[0]["summary_text"]
|
47 |
|
48 |
|
49 |
+
|
50 |
+
|
51 |
####################################################################
|
52 |
def main():
|
53 |
+
|
54 |
|
55 |
+
if choice=="Home":
|
56 |
+
st.image("https://s3.amazonaws.com/moonup/production/uploads/641c752b353524fe41ed8e37/HkSAqww4dCUtVJLpRkHie.jpeg",width=680)
|
57 |
+
st.markdown("<h1 style='color: #FF9900; font-size: 3em;'>SAMSUNG INNOVATION CAMPUS</h1>", unsafe_allow_html=True)
|
|
|
|
|
58 |
st.subheader("Welcome to Our Dialogue Summarizer App!")
|
59 |
+
st.markdown(">*This is a capstone project developed by Group-6 under the supervision of SIC team*.")
|
|
|
60 |
st.markdown("---")
|
61 |
#txt = st.text_area('Enter your long dialogue below please')
|
62 |
+
#txt_out = st.text_area('Output summary')
|
63 |
|
64 |
+
elif choice=="Summarize":
|
65 |
+
st.markdown("<h1 style='color: #FF9900; font-size: 2em;'>Dialog Summarizing Tool</h1>", unsafe_allow_html=True)
|
66 |
+
input_dialogue=st.text_area("Enter Your Dialogue Below","Type here")
|
|
|
67 |
if st.button("Summarize"):
|
68 |
summary = generate_summary(input_dialogue)
|
69 |
+
st.markdown("*<h1 style='color: #9925be; font-size: 1.2em;'>Here is your summarized dialogue*</h1>", unsafe_allow_html=True)
|
|
|
70 |
st.write(summary)
|
71 |
|
72 |
+
elif choice=="Training Dataset":
|
73 |
st.subheader("")
|
74 |
+
st.markdown("<h1 style='color: #FF9900; font-size: 3em;'>TRAINING DATASET Info</h1>", unsafe_allow_html=True)
|
|
|
75 |
st.header("Dataset Card for SAMSum Corpus")
|
76 |
+
st.markdown("> *Dataset Summary\n The SAMSum dataset contains about 16k messenger-like conversations with summaries. Conversations were created and written down by linguists fluent in English. Linguists were asked to create conversations similar to those they write on a daily basis, reflecting the proportion of topics of their real-life messenger convesations. The style and register are diversified - conversations could be informal, semi-formal or formal, they may contain slang words, emoticons and typos. Then, the conversations were annotated with summaries. It was assumed that summaries should be a concise brief of what people talked about in the conversation in third person. The SAMSum dataset was prepared by Samsung R&D Institute Poland and is distributed for research purposes* (non-commercial licence: CC BY-NC-ND 4.0)",unsafe_allow_html=True)
|
77 |
#st.button("Read more")
|
78 |
url = 'https://huggingface.co/datasets/samsum'
|
79 |
|
80 |
# Create a button with the label 'Read More'
|
81 |
if st.button('Read More'):
|
82 |
webbrowser.open_new_tab(url)
|
83 |
+
elif choice=="Model info":
|
84 |
+
st.markdown("<h1 style='color: #FF9900; font-size: 3em;'>PEGASUS MODEL Info</h1>", unsafe_allow_html=True)
|
|
|
85 |
st.markdown(">*The Pegasus model was proposed in PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019. According to the abstract, Pegasus’ pretraining task is intentionally similar to summarization: important sentences are removed/masked from an input document and are generated together as one output sequence from the remaining sentences, similar to an extractive summary. Pegasus achieves SOTA summarization performance on all 12 downstream tasks, as measured by ROUGE and human eval. This model was contributed by sshleifer. The Authors’ code can be found here. Tips: Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pretraining objective, called Gap Sentence Generation (GSG). MLM: encoder input tokens are randomly replaced by a mask tokens and have to be predicted by the encoder (like in BERT) GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, but which has a causal mask to hide the future words like a regular auto-regressive transformer decoder.*")
|
86 |
url = 'https://huggingface.co/google/pegasus-cnn_dailymail'
|
87 |
|
|
|
89 |
if st.button('Read More'):
|
90 |
webbrowser.open_new_tab(url)
|
91 |
|
92 |
+
elif choice=="About Us":
|
93 |
+
st.markdown("<h1 style='color: #FF9900; font-size: 3em;'>ABOUT US</h1>", unsafe_allow_html=True)
|
|
|
94 |
st.markdown("> *Welcome to our website! We are a team of passionate individuals dedicated to providing new NLP based services to our customers. Our goal is to create a positive impact in the world by leveraging our expertise and innovative solutions. With passion and resilence and through experience, we strive to exceed expectations and build lasting relationships with our clients. We, the developers of this capstone project are from Fujairah Emirate. We proudly own this project as it was the product of our hectic crash course that was offered by Samsung Innovation Campus. Thank you for choosing us, and we look forward to serving you!*")
|
95 |
+
|
96 |
+
st.markdown(">*<h1 style='color: #EA8770; font-size: 2em;'>Developers Name List</h1>*", unsafe_allow_html=True)
|
|
|
97 |
st.markdown("*<h1 style='color: #EA8790; font-size: 1.2em;'>This project was developed by: Nathan Berhe, Smon Fitwi, Dawit Andebrhan, Bereket Kibreab, Eyasu Tesfamichael, Milkias Butsuamlak</h1>*", unsafe_allow_html=True)
|
98 |
st.markdown("*<h1 style='color: #EA8790; font-size: 1.2em;'>This project was developed under the supervision of Mrs. Rabab, Mr.Mrad and Mr. Marc, honourable staffs of SIC program</h1>*", unsafe_allow_html=True)
|
99 |
|
100 |
|
101 |
+
|
102 |
+
|
103 |
+
if __name__=='__main__':
|
104 |
main()
|