from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pickle
import torch


import io


# Note: if the pickled model was saved on a GPU machine and is loaded on CPU-only
# hardware, `contents = pickle.load(f)` becomes `contents = CPU_Unpickler(f).load()`
# (see the sketch below).
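
# A minimal sketch of the CPU_Unpickler referenced above (an assumption about how it
# would be defined here; it is not wired into the load below): it overrides
# pickle.Unpickler.find_class so torch storages serialized on GPU are remapped to CPU.
class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        # Redirect torch storage deserialization to CPU-resident tensors
        if module == "torch.storage" and name == "_load_from_bytes":
            return lambda b: torch.load(io.BytesIO(b), map_location="cpu")
        return super().find_class(module, name)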


model_path = "finbert.sav"

# Load the pickled FinBERT embedding model from disk
with open(model_path, "rb") as f:
    model = pickle.load(f)

#tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


import nltk
from finbert_embedding.embedding import FinbertEmbedding
import pandas as pd
from nltk.cluster import KMeansClusterer
import numpy as np
import os
from scipy.spatial import distance_matrix
from tensorflow.python.lib.io import file_io

nltk.download('punkt')
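
# Assumption about newer NLTK releases (not part of the original app): recent versions
# of nltk.sent_tokenize also look for the 'punkt_tab' resource, so downloading it as
# well is a harmless safeguard.
# nltk.download('punkt_tab')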


def make_extractive_summary(word):
    # Create a scratch directory where each input text is written to disk
    data_path = "/tmp/"
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    input_ = "/tmp/input.txt"
    # Write file to disk so we can convert each datapoint to a txt file
    with open(input_, "w") as file:
        file.write(word)
    # read the written txt into a variable to start clustering
    with open(input_ , 'r') as f:
        text = f.read()
    # Split the text into sentence tokens
    tokens = nltk.sent_tokenize(text)
    # Strip leading and trailing whitespace from each sentence
    sentences = [sentence.strip() for sentence in tokens]
    # Create a DataFrame from the sentences
    data = pd.DataFrame(sentences)
    # Name the column containing the sentence text
    data.columns = ['Sentences']
    
    # Helper that creates a numerical embedding for each sentence in the dataframe
    def get_sentence_embeddings():
        # Create empty list for sentence embeddings
        sentence_list = []
        # Loop through all sentences and append sentence embeddings to list
        for i in tokens:
            sentence_embedding = model.sentence_vector(i)
            sentence_list.append(sentence_embedding)
        # Create empty list for ndarray
        sentence_array=[]
        # Loop through sentence list and change data type from tensor to array
        for i in sentence_list:
            sentence_array.append(i.numpy())
        # return sentence embeddings as list
        return sentence_array

    # Apply get_sentence_embeddings to dataframe to create column Embeddings
    data['Embeddings'] = get_sentence_embeddings()
    
    # Number of sentences to keep in the summary (one per cluster)
    NUM_CLUSTERS = 10
    iterations = 8
    # Convert Embeddings into an array and store in variable X
    X = np.array(data['Embeddings'].to_list())
    
    # Build the k-means clusterer using cosine distance between sentence embeddings
    Kclusterer = KMeansClusterer(
        NUM_CLUSTERS,
        distance=nltk.cluster.util.cosine_distance,
        repeats=iterations,
        avoid_empty_clusters=True)

    # If the text has fewer sentences than clusters, k-means raises an error;
    # in that case the try/except block simply returns the original text.
    try:
        assigned_clusters = Kclusterer.cluster(X, assign_clusters=True)

        # Apply the k-means result to the DataFrame, adding 'Cluster' and 'Centroid' columns
        data['Cluster'] = pd.Series(assigned_clusters, index = data.index)
        data['Centroid'] = data['Cluster'].apply(lambda x: Kclusterer.means()[x])
    
    # If the clustering step raises an exception (e.g. the text is too short), return the original text
    except ValueError:
        return text

    # function that computes the distance of each embeddings from the centroid of the cluster
    def distance_from_centroid(row):
        return distance_matrix([row['Embeddings']], [row['Centroid'].tolist()])[0][0]
    
    # apply distance_from_centroid function to data
    data['Distance_From_Centroid'] = data.apply(distance_from_centroid, axis =1)
    
    # Build the final summary: the sentence closest to each centroid, in original order
    summary = " ".join(data.sort_values(
                'Distance_From_Centroid',
                ascending = True).groupby('Cluster').head(1).sort_index()['Sentences'].tolist())
    return summary
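
# A hypothetical quick check of the summariser outside the Gradio UI (sample_report
# is any long multi-sentence string supplied by the caller; it is not defined here):
# sample_report = open("report.txt").read()
# print(make_extractive_summary(sample_report))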
    


import gradio as gr

# Build the web UI; gr.inputs.Textbox is the legacy (pre-3.x) Gradio input API.
iface = gr.Interface(
    fn=make_extractive_summary,
    inputs=gr.inputs.Textbox(lines=15, placeholder="Enter your text!"),
    outputs="text",
    title="Document Summarizer",
    description="An AI that makes your life easier by helping you summarise long texts.")
iface.launch()
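
# Running this file starts a local Gradio server and prints a local URL to open in a
# browser; passing share=True to iface.launch() would also create a temporary public
# link (optional, not enabled in the original app).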