import io
import pickle

import nltk
import numpy as np
import pandas as pd
import torch
from finbert_embedding.embedding import FinbertEmbedding  # class of the pickled model
from nltk.cluster import KMeansClusterer
from scipy.spatial import distance_matrix
# A model pickled on a GPU machine fails to load on a CPU-only host with plain
# pickle.load, so `contents = pickle.load(f)` becomes `contents = CPU_Unpickler(f).load()`.
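# A minimal sketch of that CPU_Unpickler (the common recipe for remapping CUDA
# tensor storages onto the CPU during unpickling); the original file references
# the class but never defines it, so this definition is assumed:
class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        # torch routes tensor storages through torch.storage._load_from_bytes;
        # intercept that call and force map_location='cpu'
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)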
model_path = "finbert.sav"
# Load the pickled FinbertEmbedding model, remapping any CUDA tensors to CPU
with open(model_path, "rb") as f:
    model = CPU_Unpickler(f).load()
# Alternative (unused here): a transformers seq2seq summarizer, e.g.
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
nltk.download('punkt')  # sentence-tokenizer data used by nltk.sent_tokenize
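# Assumption about newer environments: NLTK 3.9+ looks for the 'punkt_tab'
# resource instead, so a Space pinned to a recent nltk may also need:
# nltk.download('punkt_tab')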
def make_summary(word):
    # Split the input text into sentences
    tokens = nltk.sent_tokenize(word)
    # Strip leading/trailing whitespace from each sentence
    # (use a separate loop variable so the `word` argument is not shadowed)
    sentences = [s.strip() for s in tokens]
    # Collect the sentences in a single-column DataFrame
    data = pd.DataFrame(sentences, columns=['Sentences'])
    # Compute a numerical embedding for each sentence in the DataFrame
    def get_sentence_embeddings():
        # FinbertEmbedding.sentence_vector returns one torch tensor per sentence
        sentence_list = [model.sentence_vector(sent) for sent in tokens]
        # Convert the tensors to numpy arrays
        return [t.numpy() for t in sentence_list]

    # Store the embeddings alongside the sentences
    data['Embeddings'] = get_sentence_embeddings()
    # Pick the number of clusters (= summary sentences): roughly one per four
    # input sentences, with a floor of one for very short texts
    if len(tokens) <= 4:
        NUM_CLUSTERS = 1
    else:
        NUM_CLUSTERS = len(tokens) // 4
    iterations = 25

    # Stack the embeddings into a single (n_sentences, dim) array
    X = np.array(data['Embeddings'].to_list())
    # Build the k-means clusterer over cosine distance
    Kclusterer = KMeansClusterer(
        NUM_CLUSTERS,
        distance=nltk.cluster.util.cosine_distance,
        repeats=iterations,
        avoid_empty_clusters=True)

    # Clustering can raise on texts that are too short, so fall back to
    # returning the input text unchanged in that case
    try:
        assigned_clusters = Kclusterer.cluster(X, assign_clusters=True)
        # Record each sentence's cluster and that cluster's centroid
        data['Cluster'] = pd.Series(assigned_clusters, index=data.index)
        data['Centroid'] = data['Cluster'].apply(lambda x: Kclusterer.means()[x])
    except ValueError:
        return word
    # Distance of each sentence embedding from its cluster centroid
    def distance_from_centroid(row):
        return distance_matrix([row['Embeddings']], [row['Centroid'].tolist()])[0][0]

    data['Distance_From_Centroid'] = data.apply(distance_from_centroid, axis=1)

    # Final summary: take the sentence closest to each centroid, then restore
    # the original sentence order
    summary = " ".join(data.sort_values(
        'Distance_From_Centroid',
        ascending=True).groupby('Cluster').head(1).sort_index()['Sentences'].tolist())
    return summary
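# Quick sanity check with a hypothetical five-sentence input (not part of the
# original Space); it should collapse to a single sentence, since 5 // 4 == 1:
# print(make_summary(
#     "Revenue grew 10% year over year. Margins expanded slightly. "
#     "Operating costs were flat. The board approved a share buyback. "
#     "Guidance for next year was raised."))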
import gradio as gr

# gr.inputs / gr.outputs are deprecated namespaces (removed in Gradio 4);
# the components now live at the top level as gr.Textbox
interface1 = gr.Interface(
    fn=make_summary,
    inputs=gr.Textbox(lines=15, placeholder="Enter your text !!", label='Input-10k Sections'),
    outputs=gr.Textbox(label='Output- Finbert'))
interface1.launch()