import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import streamlit as st
from keybert import KeyBERT
import re


# Reference: https://discuss.huggingface.co/t/summarization-on-long-documents/920/7
def create_nest_sentences(document:str, token_max_length = 1024):
  # Group sentences into chunks that each stay under token_max_length tokens
  nested = []
  sent = []
  length = 0
  tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

  for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
    tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)['input_ids'] # Hugging Face tokenizer output (token ids)
    length += len(tokens_in_sentence)
    if length < token_max_length:
      sent.append(sentence)
    else:
      if sent:
        nested.append(sent)
      sent = [sentence]
      length = len(tokens_in_sentence) # the new chunk already contains this sentence
  if sent:
    nested.append(sent)
  return nested
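
# Note on the regex splitter above: it breaks on sentence-ending punctuation
# followed by whitespace and a capital letter, while leaving abbreviations
# like "Dr." intact:
#   re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', "Dr. Smith went home. He slept.")
#   -> ['Dr. Smith went home.', 'He slept.']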

# Reference: https://github.com/MaartenGr/KeyBERT
@st.cache_resource
def load_keyword_model():
  kw_model = KeyBERT()
  return kw_model

def keyword_gen(kw_model, sequence:str):
  keywords = kw_model.extract_keywords(sequence, 
    keyphrase_ngram_range=(1, 1),
    stop_words='english', 
    use_mmr=True, 
    diversity=0.5,
    top_n=10)
  return keywords
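
# For reference, extract_keywords returns (keyword, score) tuples sorted by
# relevance; the values below are illustrative, not real output:
#   [('transformers', 0.63), ('attention', 0.41), ...]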

# Reference: https://huggingface.co/facebook/bart-large-mnli
@st.cache_resource
def load_summary_model():
    model_name = "facebook/bart-large-cnn"
    summarizer = pipeline(task='summarization', model=model_name)
    return summarizer

# def load_summary_model():
#     model_name = "facebook/bart-large-mnli"
#     tokenizer = BartTokenizer.from_pretrained(model_name)
#     model = BartForConditionalGeneration.from_pretrained(model_name)
#     summarizer = pipeline(task='summarization', model=model, tokenizer=tokenizer, framework='pt')
#     return summarizer

def summarizer_gen(summarizer, sequence:str, maximum_tokens:int, minimum_tokens:int):
    output = summarizer(sequence,
        num_beams=4,
        length_penalty=2.0,
        max_length=maximum_tokens,
        min_length=minimum_tokens,
        do_sample=False,
        early_stopping=True,
        no_repeat_ngram_size=3)
    return output[0].get('summary_text')

# # Reference: https://www.datatrigger.org/post/nlp_hugging_face/
# # Custom summarization pipeline (to handle long articles)
# def summarize(text, minimum_length_of_summary = 100):
#     # Tokenize and truncate
#     inputs = tokenizer_bart([text], truncation=True, max_length=1024, return_tensors='pt').to('cuda')
#     # Generate summary 
#     summary_ids = model_bart.generate(inputs['input_ids'], num_beams=4, min_length = minimum_length_of_summary, max_length=400, early_stopping=True)
#     # Untokenize
#     return([tokenizer_bart.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0])

# Reference: https://huggingface.co/spaces/team-zero-shot-nli/zero-shot-nli/blob/main/utils.py
@st.cache_resource
def load_model():
    model_name = "facebook/bart-large-mnli"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    classifier = pipeline(task='zero-shot-classification', model=model, tokenizer=tokenizer, framework='pt')
    return classifier

def classifier_zero(classifier, sequence:str, labels:list, multi_class:bool):
    # transformers renamed the pipeline argument 'multi_class' to 'multi_label'
    outputs = classifier(sequence, labels, multi_label=multi_class)
    return outputs['labels'], outputs['scores']
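
# Illustrative wiring of the helpers above (a minimal sketch, not part of the
# original app): the sample text and candidate labels are placeholders, and
# running this will download the underlying models.
if __name__ == '__main__':
    sample_doc = (
        "Transformer models dominate modern NLP. They rely on self-attention. "
        "Libraries such as Hugging Face Transformers make them easy to use."
    )
    candidate_labels = ['machine learning', 'sports', 'finance'] # hypothetical labels

    summarizer = load_summary_model()
    classifier = load_model()
    kw_model = load_keyword_model()

    for chunk in create_nest_sentences(sample_doc, token_max_length=1024):
        text = ' '.join(chunk)
        summary = summarizer_gen(summarizer, text, maximum_tokens=60, minimum_tokens=20)
        labels, scores = classifier_zero(classifier, summary, candidate_labels, multi_class=True)
        keywords = keyword_gen(kw_model, summary)
        print(summary)
        print(list(zip(labels, scores)))
        print(keywords)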