Shivam29rathore committed on
Commit
dfe4a4f
1 Parent(s): d196f8e

Update app.py

Files changed (1)
  1. app.py +85 -24
app.py CHANGED
@@ -10,53 +10,114 @@ import io
 #contents = CPU_Unpickler(f).load()


-checkpoint = "finbert.sav"
+model_path = "finbert.sav"

 #load model from drive
-with open(checkpoint, "rb") as f:
+with open(model_path, "rb") as f:
     model= pickle.load(f)

+
 #tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 #model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


-def summarize(word):
-    import os
+import nltk
+from finbert_embedding.embedding import FinbertEmbedding
+import pandas as pd
+from nltk.cluster import KMeansClusterer
+import numpy as np
+import os
+from scipy.spatial import distance_matrix
+from tensorflow.python.lib.io import file_io
+import pickle
+
+nltk.download('punkt')
+
+
+def make_extractive_summary(word):
+    # Instantiate path to store each text Datafile in dataframe
     data_path = "/tmp/"
     if not os.path.exists(data_path):
         os.makedirs(data_path)
     input_ = "/tmp/input.txt"
-
+    # Write file to disk so we can convert each datapoint to a txt file
     with open(input_, "w") as file:
         file.write(word)
-    # read the written txt into a variable
+    # read the written txt into a variable to start clustering
     with open(input_ , 'r') as f:
-        text_ = f.read()
+        text = f.read()
+    # Create tokens from the txt file
+    tokens = nltk.sent_tokenize(text)
+    # Strip out trailing and leading white spaces from tokens
+    sentences = [word.strip() for word in tokens]
+    #Create a DataFrame from the tokens
+    data = pd.DataFrame(sentences)
+    # Assign name Sentences to the column containing text tokens
+    data.columns = ['Sentences']
+
+    # Function to create numerical embeddings for each text tokens in dataframe
+    def get_sentence_embeddings():
+        # Create empty list for sentence embeddings
+        sentence_list = []
+        # Loop through all sentences and append sentence embeddings to list
+        for i in tokens:
+            sentence_embedding = model.sentence_vector(i)
+            sentence_list.append(sentence_embedding)
+        # Create empty list for ndarray
+        sentence_array=[]
+        # Loop through sentence list and change data type from tensor to array
+        for i in sentence_list:
+            sentence_array.append(i.numpy())
+        # return sentence embeddings as list
+        return sentence_array

-    def clean_data(texts):
-        import re
-        words = list()
-        for text in texts.split():
-            text = re.sub(r'\n','',text)
-            text = re.sub(r'\s$','',text)
-            words.append(text)
+    # Apply get_sentence_embeddings to dataframe to create column Embeddings
+    data['Embeddings'] = get_sentence_embeddings()
+
+    #Number of expected sentences
+    NUM_CLUSTERS =
+    iterations = 25
+    # Convert Embeddings into an array and store in variable X
+    X = np.array(data['Embeddings'].to_list())
+
+    #Build k-means cluster algorithm
+    Kclusterer = KMeansClusterer(
+        NUM_CLUSTERS,
+        distance = nltk.cluster.util.cosine_distance,
+        repeats = iterations, avoid_empty_clusters = True)

-        return "summarize " + " ".join(words)
-    text = clean_data(text_)
+    # if length of text is too short, K means would return an error
+    # use the try except block to return the text as result if it is too short.
+    try:
+
+        assigned_clusters = Kclusterer.cluster(X,assign_clusters=True)

-    final_summary = []
-    for x in range(0,len(text)-1256,1256):
-        text_to_summarize= text[x:x+1256]
-        final_summary.append(model.predict(text_to_summarize))
+        # Apply Kmean Cluster to DataFrame and create new columns Clusters and Centroid
+        data['Cluster'] = pd.Series(assigned_clusters, index = data.index)
+        data['Centroid'] = data['Cluster'].apply(lambda x: Kclusterer.means()[x])
+
+    # return the text if clustering algorithm catches an exceptiona and move to the next text file
+    except ValueError:
+        return text

-    final_list = list(itertools.chain.from_iterable(final_summary))
-    final_list = ''.join(final_list)
-    return final_list
+    # function that computes the distance of each embeddings from the centroid of the cluster
+    def distance_from_centroid(row):
+        return distance_matrix([row['Embeddings']], [row['Centroid'].tolist()])[0][0]

+    # apply distance_from_centroid function to data
+    data['Distance_From_Centroid'] = data.apply(distance_from_centroid, axis =1)

+    ## Return Final Summary
+    summary = " ".join(data.sort_values(
+        'Distance_From_Centroid',
+        ascending = True).groupby('Cluster').head(1).sort_index()['Sentences'].tolist())
+    return summary
+
+
+
 import gradio as gr

-iface = gr.Interface(fn= summarize,
+iface = gr.Interface(fn= make_extractive_summary,
                      inputs =gr.inputs.Textbox(lines=15,placeholder="Enter your text !!"),
                      outputs="text",title="Document Summarizer",description ="An AI that makes your life easier by helping you summarise long texts.")
 iface.launch(auth=("docai","ailabs"))
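
For readers skimming the diff, here is a minimal, self-contained sketch of the technique the new make_extractive_summary function implements: embed each sentence, cluster the embeddings with NLTK's KMeansClusterer using cosine distance, and keep one sentence per cluster (the one closest to its centroid), restored to document order. This is an illustration, not the committed code: random vectors stand in for the FinbertEmbedding sentence vectors, and NUM_CLUSTERS = 3 is an assumed value, since that line is left blank in the commit.

# Sketch of the centroid-based extractive summary used in the new app.py.
# Random vectors stand in for FinBERT sentence embeddings; NUM_CLUSTERS is assumed.
import numpy as np
import pandas as pd
from nltk.cluster import KMeansClusterer
from nltk.cluster.util import cosine_distance
from scipy.spatial import distance_matrix

sentences = [
    "Revenue grew 12% year over year.",
    "The board declared a quarterly dividend.",
    "Operating costs rose on higher fuel prices.",
    "Management expects margins to recover next quarter.",
]
rng = np.random.default_rng(0)
# Stand-in for model.sentence_vector(sentence) from finbert_embedding
embeddings = [rng.normal(size=8) for _ in sentences]

data = pd.DataFrame({"Sentences": sentences, "Embeddings": embeddings})
X = np.array(data["Embeddings"].to_list())

NUM_CLUSTERS = 3  # assumed; the committed line leaves this value blank
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=cosine_distance,
                             repeats=25, avoid_empty_clusters=True)
assigned = kclusterer.cluster(X, assign_clusters=True)

data["Cluster"] = pd.Series(assigned, index=data.index)
data["Centroid"] = data["Cluster"].apply(lambda c: kclusterer.means()[c])
# Distance of each sentence embedding from its cluster centroid
data["Distance_From_Centroid"] = data.apply(
    lambda row: distance_matrix([row["Embeddings"]], [row["Centroid"].tolist()])[0][0],
    axis=1)

# One representative sentence per cluster, back in original document order
summary = " ".join(data.sort_values("Distance_From_Centroid")
                       .groupby("Cluster").head(1)
                       .sort_index()["Sentences"].tolist())
print(summary)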