File size: 4,029 Bytes
4d940bf
96d3c78
4d940bf
 
 
 
 
b1eaf99
 
4d940bf
 
 
96d3c78
 
4d940bf
 
 
96d3c78
 
 
06e6ec2
24cb4f9
 
96d3c78
8e3a4c7
4d940bf
 
 
 
 
 
7272581
4d940bf
 
 
 
 
 
 
 
 
 
 
 
7272581
4d940bf
7272581
4d940bf
 
 
695cbcc
 
 
 
 
 
 
4d940bf
 
7272581
4d940bf
 
7272581
4d940bf
 
 
 
 
b2e388a
4d940bf
96d3c78
7f35581
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96d3c78
 
e7ac810
96d3c78
 
e7ac810
7f35581
 
77cd680
 
 
96d3c78
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import gradio as gr
from gradio.mix import Parallel, Series
import wikipedia
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
import nltk
# Fetch the NLTK 'wordnet' and 'punkt' data packages when this module is
# imported; quiet=True keeps the downloader from printing progress output.
# NOTE(review): presumably these back WordNetLemmatizer and the nltk
# tokenizers used by the summarizers below -- confirm against the NLTK docs.
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
from nltk.stem import WordNetLemmatizer
from heapq import nlargest
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Install a blanket "ignore" filter: from here on no warning of any category
# is reported by this process.
warnings.filterwarnings("ignore")

def get_wiki_original_text(inp):
    """Return the Wikipedia summary text for the topic ``inp``.

    The lookup is delegated to ``wikipedia.summary``; its result is handed
    back unchanged and any exception it raises propagates to the caller.
    """
    return wikipedia.summary(inp)
   


def get_wiki_summary_by_lem(inp):
    """Summarize the Wikipedia summary of ``inp`` using lemma frequencies.

    Words of the fetched text are lower-cased and lemmatized; tokens that are
    a single character, appear in ``string.punctuation`` or are spaCy stop
    words are skipped.  The remaining lemmas are counted, every sentence is
    scored with the sum of the counts of the lemmas it contains, and the
    highest-scoring sentences are returned in the order they appear in the
    text.

    Args:
        inp: Topic passed to ``wikipedia.summary``.

    Returns:
        The selected sentences concatenated into one string, each with its
        newlines removed and followed by a single space.  Roughly 20% of the
        distinct sentences are kept when there are more than five of them,
        otherwise roughly 50%; at least one sentence is kept whenever the
        text contains any.  An empty string is returned for empty text.

    Raises:
        Any exception raised by ``wikipedia.summary`` propagates unchanged.
    """
    text = wikipedia.summary(inp)
    lemmatizer = WordNetLemmatizer()

    def lemma_of(word):
        # Single normalisation used for both counting and scoring so the two
        # passes always agree on the dictionary key.
        return lemmatizer.lemmatize(str(word).lower())

    # Frequency of every content lemma in the whole text.
    word_counts = {}
    for token in nltk.word_tokenize(text):
        if len(token) <= 1 or token in punctuation or token.lower() in STOP_WORDS:
            continue
        lemma = lemma_of(token)
        word_counts[lemma] = word_counts.get(lemma, 0) + 1

    # Score per distinct sentence; dict insertion order keeps the sentences
    # in their order of first appearance.
    sentence_scores = {
        sentence: sum(
            word_counts.get(lemma_of(word), 0)
            for word in nltk.word_tokenize(sentence)
        )
        for sentence in nltk.sent_tokenize(text)
    }
    if not sentence_scores:
        return ''

    keep_ratio = 0.20 if len(sentence_scores) > 5 else 0.50
    # int() truncates, which would yield 0 for very short texts and therefore
    # an empty summary; always keep at least one sentence.
    summary_length = max(1, int(len(sentence_scores) * keep_ratio))

    # Rank once, then select by exact membership so each chosen sentence is
    # emitted exactly once, in document order.
    top_sentences = set(
        nlargest(summary_length, sentence_scores, key=sentence_scores.get)
    )
    return ''.join(
        sentence.replace('\n', '') + ' '
        for sentence in sentence_scores
        if sentence in top_sentences
    )


def get_wiki_summary_by_tfidf(inp):
    """Summarize the Wikipedia summary of ``inp`` using TF-IDF sentence scores.

    Each sentence of the fetched text is treated as one document and
    vectorized with a ``TfidfVectorizer`` over 1- to 3-grams.  A sentence's
    score is the sum of its TF-IDF weights, and the highest-scoring sentences
    are returned in the order they appear in the text.

    Args:
        inp: Topic passed to ``wikipedia.summary``.

    Returns:
        The selected sentences concatenated into one string, each with its
        newlines removed and followed by a single space.  Roughly 20% of the
        distinct sentences are kept when there are more than five of them,
        otherwise roughly 50%; at least one sentence is kept whenever the
        text contains any.  An empty string is returned for empty text.

    Raises:
        Any exception raised by ``wikipedia.summary`` or by the vectorizer
        propagates unchanged.
    """
    text = wikipedia.summary(inp)

    all_sentences = nltk.sent_tokenize(text)
    if not all_sentences:
        # Nothing to vectorize; fitting on an empty corpus would raise.
        return ''

    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
    sentence_vectors = tfidf_vectorizer.fit_transform(all_sentences)

    # Row sums give one score per sentence; flatten the (n, 1) result to 1-D.
    sentence_scores_vector = np.asarray(sentence_vectors.sum(axis=1)).ravel()

    # Keyed by sentence text, so repeated sentences collapse into one entry;
    # dict insertion order keeps the order of first appearance.
    sentence_scores = dict(zip(all_sentences, sentence_scores_vector))

    keep_ratio = 0.20 if len(sentence_scores) > 5 else 0.50
    # int() truncates, which would yield 0 for very short texts and therefore
    # an empty summary; always keep at least one sentence.
    summary_length = max(1, int(len(sentence_scores) * keep_ratio))

    # Rank once, then select by exact membership so each chosen sentence is
    # emitted exactly once, in document order.
    top_sentences = set(
        nlargest(summary_length, sentence_scores, key=sentence_scores.get)
    )
    return ''.join(
        sentence.replace('\n', '') + ' '
        for sentence in sentence_scores
        if sentence in top_sentences
    )



# Description text handed to the combined interface.
desc = """This interface allows you to summarize Wikipedia contents. Only requirement is to write the topic and it collects content by fetching from Wikipedia. For summarization this model uses 2 different extractive summarization methods and the number of sentences in the output depends on the length of the original text."""

# Example topics handed to the combined interface.
sample = [['Europe'], ['Great Depression'], ['Crocodile Dundee']]

# One (callback, description) pair per sub-interface, in display order.
_panels = [
    (get_wiki_original_text, 'Original Text'),
    (get_wiki_summary_by_lem, 'Summary 1'),
    (get_wiki_summary_by_tfidf, 'Summary 2'),
]

# Each panel takes its own "Text" textbox and returns plain text; Parallel
# combines them into a single interface with one shared textbox.
iface = Parallel(
    *[
        gr.Interface(
            fn=panel_fn,
            inputs=gr.inputs.Textbox(label="Text"),
            outputs="text",
            description=panel_description,
        )
        for panel_fn, panel_description in _panels
    ],
    title='Text Summarizer',
    description=desc,
    examples=sample,
    inputs=gr.inputs.Textbox(label="Text"),
)

iface.launch(inline=False)