# Importing the necessary libraries
import re
import math

import spacy
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

import gradio as gr

from transformers import BartTokenizer, BartForConditionalGeneration


# Initializing the summarization model, its tokenizer, and the spaCy sentence splitter
model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
nlp = spacy.load("en_core_web_sm")
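
# Optional: move inference to a GPU when one is available (a minimal sketch;
# the app runs on CPU as written, and torch is already required by the model):
# import torch
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# model = model.to(DEVICE)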
    
    
def clean_text(text):
    # remove non-ASCII characters (e.g. Chinese characters)
    text = text.encode("ascii", errors="ignore").decode("ascii")

    # replace newlines and tabs with spaces
    text = re.sub(r"[\n\t]", " ", text)

    # collapse multiple spaces into a single space and trim the ends
    text = re.sub(" +", " ", text).strip()
    return text
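
# A quick sanity check for clean_text (hypothetical input, illustrative only):
# >>> clean_text("Héllo\tworld\n\nfoo   bar")
# 'Hllo world foo bar'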


# Defining a function to get the summary of the article
def final_summary(text):
    # cleaning the raw text, then tokenizing it into sentences
    text = clean_text(text)
    bullet_points = 10

    # keep re-summarizing until the summary fits in fewer than 10 bullet points
    while bullet_points >= 10:

        chunks = []
        sentences = nlp(text)
        for sentence in sentences.sents:
            chunks.append(str(sentence))

        output = []
        sentences_remaining = len(chunks)
        i = 0

        # looping through the sentences in roughly equal batches and summarizing them
        while sentences_remaining > 0:
            chunks_remaining = math.ceil(sentences_remaining / 10.0)
            next_chunk_size = math.ceil(sentences_remaining / chunks_remaining)
            sentence = " ".join(chunks[i:i + next_chunk_size])

            i += next_chunk_size
            sentences_remaining -= next_chunk_size

            inputs = tokenizer(sentence, return_tensors="pt", padding="longest")
            original_input_length = len(inputs["input_ids"][0])

            # if the batch is shorter than 100 tokens, keep its sentences as-is
            if original_input_length < 100:
                split_sentences = nlp(sentence)
                for split_sentence in split_sentences.sents:
                    output.append(str(split_sentence).rstrip("."))

            # if the batch is longer than 1024 tokens (the model's input limit), split it in half
            elif original_input_length > 1024:
                sent = sent_tokenize(sentence)
                length_sent = len(sent)

                j = 0
                sent_remaining = math.ceil(length_sent / 2)

                # summarizing each half of the oversized batch separately
                while length_sent > 0:
                    halved_sentence = " ".join(sent[j:j + sent_remaining])
                    # truncate as a safeguard in case a half still exceeds the model limit
                    halved_inputs = tokenizer(halved_sentence, return_tensors="pt", truncation=True, max_length=1024)
                    halved_summary_ids = model.generate(halved_inputs["input_ids"])
                    j += sent_remaining
                    length_sent -= sent_remaining

                    # keep the summary only if it is shorter than its input
                    if len(halved_summary_ids[0]) < len(halved_inputs["input_ids"][0]):
                        halved_summary = tokenizer.batch_decode(halved_summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
                        output.append(halved_summary)

            # otherwise summarize the batch directly
            else:
                summary_ids = model.generate(inputs["input_ids"])

                # keep the summary only if it is shorter than its input
                if len(summary_ids[0]) < original_input_length:
                    summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
                    output.append(summary)

        # splitting the generated summaries into individual sentences
        final_output = []
        for paragraph in output:
            lines = paragraph.split(" . ")
            for line in lines:
                final_output.append(line.replace(" .", "").strip())
        text = ". ".join(final_output)
        bullet_points = len(final_output)

    # prefixing every sentence with a bullet marker
    for i in range(len(final_output)):
        final_output[i] = "* " + final_output[i] + "."

    # the summary sentences may be incoherent as prose, so we join them as bullet points
    summary_bullet = "\n".join(final_output)

    return summary_bullet
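
# Worked example of the batching arithmetic above (illustrative): with 47
# sentences, chunks_remaining = ceil(47 / 10) = 5 and next_chunk_size =
# ceil(47 / 5) = 10 on the first pass, so the batches come out as
# 10, 10, 9, 9, and 9 sentences.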
  
# creating an interface for the article summarizer using Gradio
demo = gr.Interface(
    fn=final_summary,
    inputs=[gr.Textbox(label="Drop your article here")],
    outputs=[gr.Textbox(label="Summary")],
    title="ARTICLE SUMMARIZER",
)

# launching the app
if __name__ == "__main__":
    demo.launch(debug=True)
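
# Example of calling the summarizer directly, without the Gradio UI
# (a sketch; "article.txt" is a hypothetical local file):
#
#   with open("article.txt") as f:
#       print(final_summary(f.read()))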