Shivam29rathore committed on
Commit 4555368
1 Parent(s): bbb7fb4

Create new file

Files changed (1)
app.py +89 -0
app.py ADDED
@@ -0,0 +1,89 @@
+ import pickle
+
+ import gradio as gr
+ import nltk
+ import numpy as np
+ import pandas as pd
+ from nltk.cluster import KMeansClusterer
+ from scipy.spatial import distance_matrix
+
+ # FinbertEmbedding is the class of the pickled model; importing it
+ # keeps the dependency explicit (pickle imports the module on load).
+ from finbert_embedding.embedding import FinbertEmbedding
+
+ nltk.download('punkt')
+
+ # Load the pickled FinBERT embedding model from disk.
+ # Note: if the model was pickled on a GPU machine, loading it on a
+ # CPU-only host needs a custom unpickler that maps tensors to the
+ # CPU, i.e. contents = CPU_Unpickler(f).load() instead of
+ # contents = pickle.load(f).
+ model_path = "finbert.sav"
+ with open(model_path, "rb") as f:
+     model = pickle.load(f)
+
+
+ def make_summary(word):
+     # Split the input text into sentences
+     tokens = nltk.sent_tokenize(word)
+     # Strip leading and trailing whitespace from each sentence
+     sentences = [sentence.strip() for sentence in tokens]
+     # Create a DataFrame with one column holding the sentences
+     data = pd.DataFrame(sentences, columns=['Sentences'])
+
+     # Compute a numerical embedding for every sentence
+     def get_sentence_embeddings():
+         # Embed each sentence with FinBERT (returns torch tensors)
+         sentence_list = [model.sentence_vector(sentence) for sentence in tokens]
+         # Convert each tensor to a NumPy array and return the list
+         return [vector.numpy() for vector in sentence_list]
+
+     # Store the embeddings in a new 'Embeddings' column
+     data['Embeddings'] = get_sentence_embeddings()
+
+     # Number of summary sentences: roughly one per four input
+     # sentences, but always at least one
+     NUM_CLUSTERS = max(1, len(tokens) // 4)
+     iterations = 25
+
+     # Stack the embeddings into a 2-D array for clustering
+     X = np.array(data['Embeddings'].to_list())
+
+     # Build the k-means clusterer using cosine distance
+     Kclusterer = KMeansClusterer(
+         NUM_CLUSTERS,
+         distance=nltk.cluster.util.cosine_distance,
+         repeats=iterations,
+         avoid_empty_clusters=True)
+
+     # k-means raises a ValueError when the text is too short to
+     # cluster; in that case return the input text unchanged.
+     try:
+         assigned_clusters = Kclusterer.cluster(X, assign_clusters=True)
+
+         # Record each sentence's cluster and that cluster's centroid
+         data['Cluster'] = pd.Series(assigned_clusters, index=data.index)
+         data['Centroid'] = data['Cluster'].apply(lambda x: Kclusterer.means()[x])
+     except ValueError:
+         return word
+
+     # Distance of each sentence embedding from its cluster centroid
+     def distance_from_centroid(row):
+         return distance_matrix([row['Embeddings']], [row['Centroid'].tolist()])[0][0]
+
+     data['Distance_From_Centroid'] = data.apply(distance_from_centroid, axis=1)
+
+     # Final summary: the sentence closest to each cluster centroid,
+     # re-ordered by position in the original text
+     summary = " ".join(data.sort_values(
+         'Distance_From_Centroid',
+         ascending=True).groupby('Cluster').head(1).sort_index()['Sentences'].tolist())
+     return summary
+
+
+ interface1 = gr.Interface(
+     fn=make_summary,
+     inputs=gr.inputs.Textbox(lines=15, placeholder="Enter your text !!",
+                              label='Input-10k Sections'),
+     outputs=gr.outputs.Textbox(label='Output- Finbert'))
+ interface1.launch()
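Note: the CPU_Unpickler referenced in the comments of app.py is not defined in this file. A minimal sketch of such a helper (an assumption based on the common PyTorch recipe, not part of this commit) would remap GPU-saved tensor storages to the CPU so that finbert.sav can also be loaded on a CPU-only machine:

import io
import pickle

import torch

class CPU_Unpickler(pickle.Unpickler):
    # Hypothetical helper: torch routes tensor storages through
    # torch.storage._load_from_bytes when unpickling; intercept that
    # call and force map_location='cpu'.
    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)

# Usage, matching the hint in app.py:
# contents = pickle.load(f) becomes contents = CPU_Unpickler(f).load()
with open("finbert.sav", "rb") as f:
    model = CPU_Unpickler(f).load()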