import io
import pickle

import nltk
import numpy as np
import pandas as pd
import torch
from finbert_embedding.embedding import FinbertEmbedding
from nltk.cluster import KMeansClusterer
from scipy.spatial import distance_matrix

# If the model was pickled on a GPU machine and this script runs on a
# CPU-only one, plain pickle.load(f) fails; use CPU_Unpickler(f).load()
# (defined below) in its place.
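# A minimal sketch of that helper, assuming the standard recipe of
# overriding pickle.Unpickler.find_class (this class is not part of the
# original file):
class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        # torch serializes tensor storage via torch.storage._load_from_bytes;
        # reroute that call through torch.load with map_location='cpu'
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)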


model_path = "finbert.sav"

# Load the pickled FinBERT model from disk
with open(model_path, "rb") as f:
    model = pickle.load(f)


# Download the Punkt sentence tokenizer used by nltk.sent_tokenize
nltk.download('punkt')


def make_summary(text):

    # Split the input text into sentences
    tokens = nltk.sent_tokenize(text)
    # Strip leading and trailing whitespace from each sentence
    sentences = [sentence.strip() for sentence in tokens]
    # Build a DataFrame with one row per sentence
    data = pd.DataFrame(sentences)
    # Name the column holding the sentence text
    data.columns = ['Sentences']
    
    # Compute a numerical embedding for each sentence in the DataFrame
    def get_sentence_embeddings():
        # Embed each sentence with the FinBERT model (returns torch tensors)
        sentence_list = [model.sentence_vector(sentence) for sentence in tokens]
        # Convert each tensor to a NumPy array
        return [embedding.numpy() for embedding in sentence_list]

    # Store the embeddings alongside the sentences
    data['Embeddings'] = get_sentence_embeddings()
    
    # Target roughly one summary sentence per four input sentences,
    # with a minimum of one cluster for very short texts
    if len(tokens) <= 4:
        NUM_CLUSTERS = 1
    else:
        NUM_CLUSTERS = len(tokens) // 4

    iterations = 25
    # Stack the embeddings into a (num_sentences, embedding_dim) array
    X = np.array(data['Embeddings'].to_list())
    
    # Build the k-means clusterer using cosine distance between embeddings
    Kclusterer = KMeansClusterer(
        NUM_CLUSTERS,
        distance=nltk.cluster.util.cosine_distance,
        repeats=iterations,
        avoid_empty_clusters=True)

    # If the text is too short, k-means raises an error; use the
    # try/except block to fall back to returning the text unchanged
    try:
        assigned_clusters = Kclusterer.cluster(X, assign_clusters=True)

        # Record each sentence's cluster and that cluster's centroid
        data['Cluster'] = pd.Series(assigned_clusters, index=data.index)
        data['Centroid'] = data['Cluster'].apply(lambda x: Kclusterer.means()[x])

    # Return the text if the clustering algorithm raises an exception
    except ValueError:
        return text

    # Distance of each sentence embedding from its cluster centroid
    def distance_from_centroid(row):
        return distance_matrix([row['Embeddings']], [row['Centroid'].tolist()])[0][0]

    data['Distance_From_Centroid'] = data.apply(distance_from_centroid, axis=1)

    # Final summary: the sentence closest to each centroid, in original order
    summary = " ".join(data.sort_values(
        'Distance_From_Centroid',
        ascending=True).groupby('Cluster').head(1).sort_index()['Sentences'].tolist())
    return summary
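
# Illustrative usage sketch (the sample text is invented, purely to show the
# expected input and output):
#
#   sample = ("Revenue grew 12% year over year. Gross margin improved. "
#             "The company repurchased shares. Management expects growth. "
#             "Key risks include supply chain disruption.")
#   print(make_summary(sample))  # prints one sentence per cluster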

import gradio as gr


# Wrap the summarizer in a simple text-in/text-out Gradio interface
interface1 = gr.Interface(
    fn=make_summary,
    inputs=gr.Textbox(lines=15, placeholder="Enter your text!", label='Input: 10-K Sections'),
    outputs=gr.Textbox(label='Output: FinBERT Summary'))
interface1.launch()