pat229988 committed on
Commit
012154d
1 Parent(s): 1f50fc6

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +122 -0
  2. requirements.txt.txt +7 -0
app.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from random import random
3
+ from spacy.lang.en.stop_words import STOP_WORDS
4
+ #import en_core_web_sm
5
+ from transformers import PegasusForConditionalGeneration, PegasusTokenizer
6
+ import torch
7
+ import re
8
+ from string import punctuation
9
+ from heapq import nlargest
10
+ #import spacy_streamlit
11
+ import configparser
12
+ import random
13
+ import spacy
14
+ import gtts
15
+ import os
16
# Load the small English spaCy pipeline. spacy.load raises OSError when the
# model package is not installed, in which case it is downloaded once and
# loaded again. (The previous `os.exec(python -m spacy download ...)` line was
# not valid Python and `os.exec` does not exist.)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download as _download_spacy_model

    _download_spacy_model("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Words and characters ignored when counting word frequencies.
stopwords = list(STOP_WORDS)
punctuation = punctuation + "\n"

# Abstractive summarisation model; runs on the GPU when one is available.
# NOTE(review): Streamlit re-executes the script on each interaction, so these
# loads are repeated on every rerun - consider wrapping them in a cached loader.
model_name = 'google/pegasus-xsum'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
25
def abst_summary(src_text):
    """Generate an abstractive summary of ``src_text`` with the Pegasus model.

    Args:
        src_text: The text to summarise (a string or a list of strings).
            Over-long input is truncated by the tokenizer (``truncation=True``).

    Returns:
        list[str]: The decoded summaries, with special tokens removed.
    """
    # Call the tokenizer directly instead of the deprecated
    # ``prepare_seq2seq_batch`` helper, and move the encoded tensors to the
    # device the model lives on so generation also works on CUDA.
    batch = tokenizer(
        src_text,
        truncation=True,
        padding='longest',
        return_tensors='pt',
    ).to(model.device)
    translated = model.generate(**batch)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)
30
+
31
def word_frequency(doc, stop_words=None, punct=None):
    """Count occurrences of each non-stop-word, non-punctuation token.

    Counts are keyed by the *lowercased* token text. ``sentence_score`` looks
    words up by ``word.text.lower()``, so keying by the original casing (as
    this function used to) made every capitalised word invisible to scoring.

    Args:
        doc: Iterable of tokens, each exposing a ``text`` attribute.
        stop_words: Container of lowercase words to ignore. Defaults to the
            module-level ``stopwords``.
        punct: String of characters to ignore; a token is skipped when its
            lowercased text is a substring of it. Defaults to the module-level
            ``punctuation``.

    Returns:
        dict: Lowercased word -> number of occurrences.
    """
    if stop_words is None:
        stop_words = stopwords
    if punct is None:
        punct = punctuation

    word_frequencies = {}
    for word in doc:
        key = word.text.lower()
        if key in stop_words or key in punct:
            continue
        word_frequencies[key] = word_frequencies.get(key, 0) + 1
    return word_frequencies
41
+
42
def sentence_score(sentence_tokens, word_frequencies):
    """Score each sentence by summing the weights of the words it contains.

    Args:
        sentence_tokens: Iterable of sentences. Each sentence is an iterable
            of tokens exposing ``text`` and must be hashable, because it is
            used as a dictionary key.
        word_frequencies: Mapping of lowercase word -> weight.

    Returns:
        dict: Sentence -> summed weight. Sentences with no word present in
        ``word_frequencies`` do not appear in the result.
    """
    # Named ``scores`` so the local no longer shadows the function itself.
    scores = {}
    for sent in sentence_tokens:
        for word in sent:
            weight = word_frequencies.get(word.text.lower())
            if weight is not None:
                scores[sent] = scores.get(sent, 0) + weight
    return scores
52
+
53
def _clean_text(text):
    """Strip markup, bracketed spans, URLs and standalone numbers from text."""
    text = re.sub(r"<.*?>", "", text)
    # Escaped brackets: the previous pattern r"[.*?]" was a character class
    # that deleted every '.', '*' and '?', which removed the sentence
    # boundaries the summariser depends on.
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"https?://\S+", " ", text)
    text = re.sub(r"\b[0-9]+\b\s*", " ", text)
    return text


def _normalize_frequencies(word_frequencies):
    """Scale counts into the range (0, 1] by dividing by the largest count."""
    if not word_frequencies:
        return {}
    # The maximum is taken once, before any value is rescaled; recomputing it
    # inside the loop changed the divisor after the top word became 1.0.
    highest = max(word_frequencies.values())
    return {word: count / highest for word, count in word_frequencies.items()}


def get_summary(text, ratio=0.10):
    """Build an extractive summary from the highest-scoring sentences of text.

    The text is cleaned, parsed with the module-level ``nlp`` pipeline, and
    each sentence is scored by the normalised frequency of its words.

    Args:
        text: The raw input text.
        ratio: Fraction of the sentences to keep. At least one sentence is
            always requested.

    Returns:
        str: The selected sentences, highest score first, joined by a single
        space. Empty when no sentence could be scored.
    """
    doc = nlp(_clean_text(text))

    word_frequencies = _normalize_frequencies(word_frequency(doc))
    sentence_tokens = list(doc.sents)
    sentence_scores = sentence_score(sentence_tokens, word_frequencies)

    select_length = max(1, int(len(sentence_tokens) * ratio))
    best_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    return " ".join(sent.text for sent in best_sentences)
78
+
79
st.set_page_config(
    page_title="Audio summarizer Web App",
    layout="wide",
    initial_sidebar_state="expanded"
)
st.title("Audio Summaries")
col1, col2 = st.columns(2)


def _text_to_mp3_bytes(text):
    """Synthesise ``text`` with gTTS and return the audio as MP3 bytes.

    The audio is written to an in-memory buffer rather than a fixed file
    name, so concurrent sessions cannot overwrite each other's audio and no
    file handle is left open.
    """
    from io import BytesIO

    buffer = BytesIO()
    gtts.gTTS(text).write_to_fp(buffer)
    return buffer.getvalue()


def _show_summary(kind, summary, source_length):
    """Render one summary text box and, when non-empty, its audio player."""
    st.text_area(
        label="{} Text Summarization (Summary length :{}, Actual Text :{})".format(
            kind, len(summary), source_length),
        value=summary,
        height=350,
    )
    if summary:
        # gTTS produces MP3 data, so it is served as audio/mpeg, not audio/wav.
        st.audio(_text_to_mp3_bytes(summary), format="audio/mpeg")


with col1:
    text_ = st.text_area(label="Enter Your Text or story", height=350, placeholder="Enter Your Text or story or your article iit can be of any length")

    # NOTE(review): the original indentation was lost; the button is placed
    # under the input box in the left column - confirm the intended layout.
    if st.button("Get Summary"):
        if not text_.strip():
            st.warning("Please enter some text to summarize.")
        else:
            ex_summary = get_summary(text_)
            # abst_summary returns a list; a single input gives one summary.
            ab_summary = abst_summary(text_)[0]
            with col2:
                _show_summary("Extractive", ex_summary, len(text_))
                _show_summary("Abstractive", ab_summary, len(text_))
requirements.txt.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit==1.16.0
2
+ transformers
3
+ torchaudio==0.13.1
4
+ torchvision==0.14.1
5
+ spacy
6
+ gTTS==2.3.0
7
+ gensim==4.2.0