Shivam29rathore committed on
Commit
725414f
1 Parent(s): 4fde81c

Create app.py

Files changed (1)
  1. app.py +104 -0
app.py ADDED
@@ -0,0 +1,104 @@
+ import nltk
+ from finbert_embedding.embedding import FinbertEmbedding
+ import pandas as pd
+ from nltk.cluster import KMeansClusterer
+ import numpy as np
+ import os
+ from scipy.spatial import distance_matrix
+ from tensorflow.python.lib.io import file_io
+ import pickle
+
+
+ nltk.download('punkt')
+
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+ checkpoint = "Shivam29rathore/finBert_10k"
+ model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+ # FinBERT embedder used to turn sentences into vectors for the clustering
+ # below (the seq2seq model above has no sentence_vector method)
+ finbert = FinbertEmbedding()
+
+
+ def make_extractive_summary(word):
+     # Path used to store each text datapoint as a file on disk
+     data_path = "/tmp/"
+     if not os.path.exists(data_path):
+         os.makedirs(data_path)
+     input_ = "/tmp/input.txt"
+     # Write the text to disk so each datapoint can be handled as a txt file
+     with open(input_, "w") as file:
+         file.write(word)
+     # Read the written txt back into a variable to start clustering
+     with open(input_, 'r') as f:
+         text = f.read()
+     # Split the text into sentence tokens
+     tokens = nltk.sent_tokenize(text)
+     # Strip leading and trailing whitespace from each token
+     sentences = [word.strip() for word in tokens]
+     # Create a DataFrame from the tokens, naming the text column Sentences
+     data = pd.DataFrame(sentences)
+     data.columns = ['Sentences']
+
+     # Create numerical embeddings for each sentence in the DataFrame
+     def get_sentence_embeddings():
+         # Embed every sentence with FinBERT and collect the vectors
+         sentence_list = []
+         for i in tokens:
+             sentence_embedding = finbert.sentence_vector(i)
+             sentence_list.append(sentence_embedding)
+         # Convert each embedding from a tensor to a NumPy array
+         sentence_array = []
+         for i in sentence_list:
+             sentence_array.append(i.numpy())
+         # Return the sentence embeddings as a list of arrays
+         return sentence_array
+
+     # Apply get_sentence_embeddings to create the Embeddings column
+     data['Embeddings'] = get_sentence_embeddings()
+
+     # Number of sentences expected in the summary
+     NUM_CLUSTERS = 15
+     iterations = 25
+     # Convert the Embeddings column into an array X for clustering
+     X = np.array(data['Embeddings'].to_list())
+
+     # Build the k-means clusterer using cosine distance
+     Kclusterer = KMeansClusterer(
+         NUM_CLUSTERS,
+         distance=nltk.cluster.util.cosine_distance,
+         repeats=iterations, avoid_empty_clusters=True)
+
+     # If the text is too short, k-means raises an error; the try/except
+     # block returns the text itself as the result in that case.
+     try:
+         assigned_clusters = Kclusterer.cluster(X, assign_clusters=True)
+
+         # Record each sentence's cluster and that cluster's centroid
+         data['Cluster'] = pd.Series(assigned_clusters, index=data.index)
+         data['Centroid'] = data['Cluster'].apply(lambda x: Kclusterer.means()[x])
+
+     # Return the raw text if the clustering algorithm raises an exception
+     except ValueError:
+         return text
+
+     # Compute the distance of each embedding from its cluster's centroid
+     def distance_from_centroid(row):
+         return distance_matrix([row['Embeddings']], [row['Centroid'].tolist()])[0][0]
+
+     # Apply distance_from_centroid to every row of the DataFrame
+     data['Distance_From_Centroid'] = data.apply(distance_from_centroid, axis=1)
+
+     # Final summary: for each cluster keep the sentence closest to the
+     # centroid, then restore the original sentence order
+     summary = " ".join(data.sort_values(
+         'Distance_From_Centroid',
+         ascending=True).groupby('Cluster').head(1).sort_index()['Sentences'].tolist())
+     return summary
+
+ import gradio as gr
+
+ iface = gr.Interface(fn=make_extractive_summary,
+                      inputs=gr.inputs.Textbox(lines=15, placeholder="Enter your text !!"),
+                      outputs="text", title="Document Summarizer",
+                      description="An AI that makes your life easier by helping you summarise long texts.")
+ iface.launch(auth=("hamoye", "docai"))
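A minimal sketch of how the summarizer could be exercised directly in Python, assuming the dependencies above are installed; the file name sample_10k.txt is hypothetical and this snippet is not part of the commit:

    # Hypothetical check of make_extractive_summary, outside the Gradio app
    with open("sample_10k.txt") as f:       # any long financial text
        report = f.read()
    # Prints up to NUM_CLUSTERS extracted sentences, or the raw text if too short
    print(make_extractive_summary(report))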