Spaces:
Running
Running
Upload 2 files
Browse files- app.py +122 -0
- requirements.txt.txt +7 -0
app.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from random import random
|
3 |
+
from spacy.lang.en.stop_words import STOP_WORDS
|
4 |
+
#import en_core_web_sm
|
5 |
+
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
|
6 |
+
import torch
|
7 |
+
import re
|
8 |
+
from string import punctuation
|
9 |
+
from heapq import nlargest
|
10 |
+
#import spacy_streamlit
|
11 |
+
import configparser
|
12 |
+
import random
|
13 |
+
import spacy
|
14 |
+
import gtts
|
15 |
+
import os
|
16 |
+
# --- Model and resource setup -------------------------------------------------
# Load the small English spaCy pipeline, downloading it first if it is not
# installed. The previous line here,
# `os.exec(python -m spacy download en_core_web_sm)`, was not valid Python
# (and `os.exec` does not exist), so the module could not be imported at all.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spaCy raises OSError when the requested pipeline package cannot be found.
    # NOTE(review): in some environments a freshly pip-installed package is only
    # importable after a process restart - confirm on the deployment target.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Stop words and punctuation characters used to filter tokens when counting
# word frequencies. A newline is added so bare line-break tokens are skipped.
stopwords = list(STOP_WORDS)
punctuation = punctuation + "\n"

# Pegasus checkpoint used for abstractive summarisation; the model runs on the
# GPU when one is available, otherwise on the CPU.
# NOTE(review): Streamlit re-executes this script on every interaction, so the
# model is reloaded each time - consider Streamlit's resource caching.
model_name = 'google/pegasus-xsum'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
|
25 |
+
def abst_summary(src_text):
    """Generate an abstractive summary of ``src_text`` with the Pegasus model.

    Args:
        src_text: The text to summarise (a string, or a list of strings to
            summarise as a batch).

    Returns:
        The list of decoded summaries produced by ``tokenizer.batch_decode``;
        callers in this file read element ``0``.
    """
    # Call the tokenizer directly instead of the deprecated
    # `prepare_seq2seq_batch`, and move the encoded tensors to the same device
    # as the model - otherwise generation fails with a device mismatch when the
    # model has been placed on CUDA.
    batch = tokenizer(
        src_text,
        truncation=True,
        padding='longest',
        return_tensors='pt',
    ).to(torch_device)
    translated = model.generate(**batch)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)
|
30 |
+
|
31 |
+
def word_frequency(doc):
    """Count how often each content word occurs in ``doc``.

    Tokens whose lower-cased text is in the module-level ``stopwords`` list, or
    is found inside the module-level ``punctuation`` string, are ignored.

    Args:
        doc: An iterable of tokens exposing a ``.text`` attribute.

    Returns:
        dict: lower-cased token text mapped to its occurrence count.
    """
    word_frequencies = {}
    for word in doc:
        # Key by the lower-cased form: filtering already uses it, and
        # `sentence_score` looks words up via `word.text.lower()`. Keying by the
        # original casing meant capitalised words were counted but never
        # matched when scoring sentences.
        token = word.text.lower()
        # `punctuation` is a string, so this is a substring test.
        if token in stopwords or token in punctuation:
            continue
        word_frequencies[token] = word_frequencies.get(token, 0) + 1
    return word_frequencies
|
41 |
+
|
42 |
+
def sentence_score(sentence_tokens, word_frequencies):
    """Score each sentence by summing the frequencies of its known words.

    Args:
        sentence_tokens: Iterable of sentences; each sentence is iterable over
            tokens exposing ``.text`` and is itself used as a dictionary key.
        word_frequencies: Mapping from lower-cased word to its weight.

    Returns:
        dict: sentence mapped to the sum of the weights of its tokens whose
        lower-cased text is a key of ``word_frequencies``. Sentences with no
        such token are left out of the result.
    """
    scores = {}
    for sent in sentence_tokens:
        for token in sent:
            key = token.text.lower()
            if key not in word_frequencies:
                continue
            if sent in scores:
                scores[sent] += word_frequencies[key]
            else:
                scores[sent] = word_frequencies[key]
    return scores
|
52 |
+
|
53 |
+
def get_summary(text):
    """Build an extractive summary of ``text``.

    The text is cleaned, parsed with the module-level spaCy pipeline ``nlp``,
    and each sentence is scored by the normalised frequencies of its words.
    The top-scoring 10% of sentences (at least one) are joined with spaces.

    Args:
        text: The raw input text.

    Returns:
        str: the selected sentences joined by single spaces, in descending
        score order; an empty string when no sentence received a score.
    """
    # Strip HTML-like tags, bracketed spans, URLs and standalone numbers.
    text = re.sub(r"<.*?>", "", text)
    # The brackets must be escaped: the unescaped form `[.*?]` is a character
    # class that deletes every '.', '*' and '?', which removed the sentence
    # terminators the sentence segmenter relies on.
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"https?://\S+", " ", text)
    text = re.sub(r"\b[0-9]+\b\s*", " ", text)
    doc = nlp(text)

    word_frequencies = word_frequency(doc)

    # Normalise against the maximum computed once. Recomputing max() inside the
    # loop while overwriting values made the result depend on dict order (the
    # maximum shrinks as soon as the largest entry has been divided).
    max_frequency = max(word_frequencies.values(), default=0)
    if max_frequency:
        word_frequencies = {
            word: count / max_frequency
            for word, count in word_frequencies.items()
        }

    sentence_tokens = list(doc.sents)
    sentence_scores = sentence_score(sentence_tokens, word_frequencies)

    # Keep roughly 10% of the sentences, but never fewer than one.
    select_length = max(1, int(len(sentence_tokens) * 0.10))
    best_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    return " ".join(sent.text for sent in best_sentences)
|
78 |
+
|
79 |
+
# --- Streamlit user interface -------------------------------------------------
st.set_page_config(
    page_title="Audio summarizer Web App",
    layout="wide",
    initial_sidebar_state="expanded"
)
st.title("Audio Summaries")
col1, col2 = st.columns(2)


def _render_summary(kind, summary, source_length):
    """Show one summary in a text area and, when non-blank, an audio player.

    Args:
        kind: Label prefix for the text area ("Extractive" or "Abstractive").
        summary: The summary text to display and speak.
        source_length: Character length of the original input, shown in the label.
    """
    import io

    st.text_area(
        label="{} Text Summarization (Summary length :{}, Actual Text :{})".format(
            kind, len(summary), source_length
        ),
        value=summary,
        height=350,
    )
    if summary.strip():
        # Synthesize into memory rather than a fixed file on disk: nothing is
        # left open, and concurrent sessions cannot overwrite each other's audio.
        audio_buffer = io.BytesIO()
        gtts.gTTS(summary).write_to_fp(audio_buffer)
        # gTTS produces MP3 data, so it must not be labelled as WAV.
        # `sample_rate` is left at its default; NOTE(review): older Streamlit
        # releases may not accept that keyword - confirm against the pinned version.
        st.audio(audio_buffer.getvalue(), format="audio/mp3", start_time=0)


with col1:
    text_ = st.text_area(label="Enter Your Text or story", height=350, placeholder="Enter Your Text or story or your article iit can be of any length")
    summarize_clicked = st.button("Get Summary")

# Summaries are only computed after an explicit click, so there is no need to
# swallow NameError for variables that were never assigned.
if summarize_clicked:
    with col2:
        if not text_.strip():
            st.warning("Please enter some text to summarize.")
        else:
            ex_summary = get_summary(text_)
            ab_summary = abst_summary(text_)
            _render_summary("Extractive", ex_summary, len(text_))
            _render_summary("Abstractive", ab_summary[0], len(text_))
|
requirements.txt.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit==1.16.0
|
2 |
+
transformers
|
3 |
+
torchaudio==0.13.1
|
4 |
+
torchvision==0.14.1
|
5 |
+
spacy
|
6 |
+
gTTS==2.3.0
|
7 |
+
gensim==4.2.0
|