File size: 1,281 Bytes
dcfa2ec
 
 
 
92e1aef
 
dcfa2ec
 
 
92e1aef
 
 
 
 
 
dcfa2ec
 
92e1aef
 
dcfa2ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import numpy as np
import pandas as pd
import nltk
import re

import torch
import networkx as nx
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

# Fetch the Punkt tokenizer data required by nltk.sent_tokenize below.
nltk.download('punkt')

# Sentence-embedding model used to vectorize sentences for similarity ranking.
model = SentenceTransformer('all-mpnet-base-v2')
# Run on GPU when available; encoding falls back to CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def get_summary(text, num_words: int = 1000):
    """Return an extractive summary of *text* via TextRank over sentence embeddings.

    Sentences are embedded with the module-level SentenceTransformer, a
    cosine-similarity graph is built over them, and PageRank scores each
    sentence. Top-ranked sentences are kept greedily until the word budget
    is reached, then re-joined in original document order.

    Parameters
    ----------
    text : str
        Document to summarize.
    num_words : int
        Approximate word budget for the summary (default 1000).

    Returns
    -------
    str
        Selected sentences joined by single spaces; empty string for
        empty or whitespace-only input.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        # Guard: empty input would otherwise crash inside encode/similarity.
        return ""

    embeddings = model.encode(sentences, show_progress_bar=False)
    try:
        sim_matrix = cosine_similarity(embeddings)
    except Exception as e:
        # BUGFIX: the original swallowed the exception and then used an
        # undefined sim_matrix (NameError). Log diagnostics and re-raise.
        print(e, type(e))
        print(embeddings.shape)
        raise

    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    # Sort by (score, sentence, index) descending; the index lets us restore
    # document order after selection.
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True
    )

    # Greedily take the highest-scoring sentences until the budget is hit.
    final_sents = []
    total_length = 0
    for score, sent, idx in ranked_sentences:
        total_length += len(sent.split())
        if total_length < num_words:
            final_sents.append((score, sent, idx))
        else:
            break

    # Re-order the selection back into original document order before joining.
    top_k_sents = sorted(final_sents, key=lambda x: x[2])
    return " ".join(s[1] for s in top_k_sents)