import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import streamlit as st
from keybert import KeyBERT
import re


# Reference: https://discuss.huggingface.co/t/summarization-on-long-documents/920/7
def create_nest_sentences(document:str, token_max_length = 1024):
  # Group sentences into chunks that each stay under token_max_length tokens
  nested = []
  sent = []
  length = 0
  tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

  for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
    tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)['input_ids'] # Hugging Face tokenizer output (token ids)
    length += len(tokens_in_sentence)
    if length < token_max_length:
      sent.append(sentence)
    else:
      if sent:
        nested.append(sent)
      sent = [sentence]
      length = len(tokens_in_sentence) # the new chunk already contains this sentence
  if sent:
    nested.append(sent)
  return nested
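
# Note on the regex splitter above: it breaks on sentence-ending punctuation
# followed by whitespace and a capital letter, while leaving abbreviations
# like "Dr." intact:
#   re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', "Dr. Smith went home. He slept.")
#   -> ['Dr. Smith went home.', 'He slept.']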

# Reference: https://github.com/MaartenGr/KeyBERT
@st.cache_resource
def load_keyword_model():
  kw_model = KeyBERT()
  return kw_model

def keyword_gen(kw_model, sequence:str):
  keywords = kw_model.extract_keywords(sequence, 
    keyphrase_ngram_range=(1, 1),
    stop_words='english', 
    use_mmr=True, 
    diversity=0.5,
    top_n=10)
  return keywords
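
# For reference, extract_keywords returns (keyword, score) tuples sorted by
# relevance; the values below are illustrative, not real output:
#   [('transformers', 0.63), ('attention', 0.41), ...]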

# Reference: https://huggingface.co/facebook/bart-large-mnli
@st.cache_resource
def load_summary_model():
    model_name = "facebook/bart-large-cnn"
    summarizer = pipeline(task='summarization', model=model_name)
    return summarizer

# def load_summary_model():
#     model_name = "facebook/bart-large-mnli"
#     tokenizer = BartTokenizer.from_pretrained(model_name)
#     model = BartForConditionalGeneration.from_pretrained(model_name)
#     summarizer = pipeline(task='summarization', model=model, tokenizer=tokenizer, framework='pt')
#     return summarizer

def summarizer_gen(summarizer, sequence:str, maximum_tokens:int, minimum_tokens:int):
    output = summarizer(sequence,
        num_beams=4,
        length_penalty=2.0,
        max_length=maximum_tokens,
        min_length=minimum_tokens,
        do_sample=False,
        early_stopping=True,
        no_repeat_ngram_size=3)
    return output[0].get('summary_text')

# # Reference: https://www.datatrigger.org/post/nlp_hugging_face/
# # Custom summarization pipeline (to handle long articles)
# def summarize(text, minimum_length_of_summary = 100):
#     # Tokenize and truncate
#     inputs = tokenizer_bart([text], truncation=True, max_length=1024, return_tensors='pt').to('cuda')
#     # Generate summary 
#     summary_ids = model_bart.generate(inputs['input_ids'], num_beams=4, min_length = minimum_length_of_summary, max_length=400, early_stopping=True)
#     # Untokenize
#     return([tokenizer_bart.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0])

# Reference: https://huggingface.co/spaces/team-zero-shot-nli/zero-shot-nli/blob/main/utils.py
@st.cache_resource
def load_model():
    model_name = "facebook/bart-large-mnli"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    classifier = pipeline(task='zero-shot-classification', model=model, tokenizer=tokenizer, framework='pt')
    return classifier

def classifier_zero(classifier, sequence:str, labels:list, multi_class:bool):
    # transformers renamed the pipeline argument 'multi_class' to 'multi_label'
    outputs = classifier(sequence, labels, multi_label=multi_class)
    return outputs['labels'], outputs['scores']
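
# Illustrative wiring of the helpers above (a minimal sketch, not part of the
# original app): the sample text and candidate labels are placeholders, and
# running this will download the underlying models.
if __name__ == '__main__':
    sample_doc = (
        "Transformer models dominate modern NLP. They rely on self-attention. "
        "Libraries such as Hugging Face Transformers make them easy to use."
    )
    candidate_labels = ['machine learning', 'sports', 'finance'] # hypothetical labels

    summarizer = load_summary_model()
    classifier = load_model()
    kw_model = load_keyword_model()

    for chunk in create_nest_sentences(sample_doc, token_max_length=1024):
        text = ' '.join(chunk)
        summary = summarizer_gen(summarizer, text, maximum_tokens=60, minimum_tokens=20)
        labels, scores = classifier_zero(classifier, summary, candidate_labels, multi_class=True)
        keywords = keyword_gen(kw_model, summary)
        print(summary)
        print(list(zip(labels, scores)))
        print(keywords)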