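# Streamlit app that summarises user-provided text in two ways: extractive
# (spaCy word-frequency sentence scoring) and abstractive (google/pegasus-xsum),
# then reads each summary aloud with gTTS.
# Requires: streamlit, spacy, torch, transformers, gtts.
# Run with: streamlit run <path-to-this-file>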
import os
import re
from heapq import nlargest
from string import punctuation

import gtts
import spacy
import streamlit as st
import torch
from spacy.lang.en.stop_words import STOP_WORDS
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Load the small English spaCy pipeline, downloading it first if it is missing.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    os.system("python -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

stopwords = list(STOP_WORDS)
punctuation = punctuation + "\n"
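# Pegasus abstractive summarisation model; generation runs on the GPU when available.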
model_name = 'google/pegasus-xsum'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
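# The first run downloads and caches the Pegasus weights via from_pretrained.
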
def abst_summary(src_text):
    """Abstractive summary: tokenize the text, generate with Pegasus, and decode."""
    # Move the encoded batch to the same device as the model before generating.
    batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(torch_device)
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text

def word_frequency(doc):
    """Count how often each non-stopword, non-punctuation token occurs (lowercased)."""
    word_frequencies = {}
    for word in doc:
        if word.text.lower() not in stopwords and word.text.lower() not in punctuation:
            if word.text.lower() not in word_frequencies:
                word_frequencies[word.text.lower()] = 1
            else:
                word_frequencies[word.text.lower()] += 1
    return word_frequencies

def sentence_score(sentence_tokens, word_frequencies):
    """Score each sentence by summing the normalised frequencies of its words."""
    sentence_scores = {}
    for sent in sentence_tokens:
        for word in sent:
            if word.text.lower() in word_frequencies:
                if sent not in sentence_scores:
                    sentence_scores[sent] = word_frequencies[word.text.lower()]
                else:
                    sentence_scores[sent] += word_frequencies[word.text.lower()]
    return sentence_scores

def get_summary(text):
    """Extractive summary: keep the highest-scoring ~10% of sentences."""
    # Strip HTML tags, bracketed fragments, URLs, and standalone numbers.
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"https?://\S+", " ", text)
    text = re.sub(r"\b[0-9]+\b\s*", " ", text)
    doc = nlp(text)

    word_frequencies = word_frequency(doc)

    # Normalise counts by the most frequent word so every score lies in (0, 1].
    max_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = word_frequencies[word] / max_frequency

    sentence_tokens = list(doc.sents)
    sentence_scores = sentence_score(sentence_tokens, word_frequencies)

    # Keep roughly 10% of the sentences, but always at least one.
    select_length = max(1, int(len(sentence_tokens) * 0.10))
    summary = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    return " ".join(sent.text for sent in summary)

st.set_page_config(
    page_title="Audio Summarizer Web App",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.title("Audio Summaries")
col1, col2 = st.columns(2)

with col1:
    text_ = st.text_area(label="Enter your text or story", height=350, placeholder="Enter your text, story, or article; it can be of any length")

# On click, build both summaries and render each with a text box and an audio player.
if st.button("Get Summary"):
    ex_summary = get_summary(text_)
    ab_summary = abst_summary(text_)
    with col2:
        st.text_area(label="Extractive Text Summarization (Summary length: {}, Actual Text: {})".format(len(ex_summary), len(text_)),
                     value=ex_summary,
                     height=350)
        if len(ex_summary) > 0:
            # gTTS always produces MP3-encoded audio, so save and serve it as MP3.
            ex_tts = gtts.gTTS(ex_summary)
            ex_tts.save("ex_summary.mp3")
            with open("ex_summary.mp3", "rb") as ex_audio_file:
                ex_audio_bytes = ex_audio_file.read()
            st.audio(ex_audio_bytes, format="audio/mp3", start_time=0)

        st.text_area(label="Abstractive Text Summarization (Summary length: {}, Actual Text: {})".format(len(ab_summary[0]), len(text_)),
                     value=ab_summary[0],
                     height=350)
        if len(ab_summary[0]) > 0:
            ab_tts = gtts.gTTS(ab_summary[0])
            ab_tts.save("ab_summary.mp3")
            with open("ab_summary.mp3", "rb") as ab_audio_file:
                ab_audio_bytes = ab_audio_file.read()
            st.audio(ab_audio_bytes, format="audio/mp3", start_time=0)