abdulmatinomotoso
committed on
Commit
•
924e74c
1
Parent(s):
0dea9aa
Update app.py
Browse files
app.py
CHANGED
@@ -36,65 +36,89 @@ from transformers import BartTokenizer, BartForConditionalGeneration
|
|
36 |
model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
|
37 |
tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
|
38 |
|
|
|
39 |
#Defining a function to get the summary of the article
|
40 |
def final_summary(file):
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
-
|
91 |
-
summary = ' '.join(output)
|
92 |
-
lines1 = sent_tokenize(summary)
|
93 |
-
for i in range(len(lines1)):
|
94 |
-
lines1[i] = "* " + lines1[i].strip().replace(' .', '.')
|
95 |
-
|
96 |
-
summ_bullet1 = "\n".join(lines1)
|
97 |
-
return summ_bullet1
|
98 |
|
99 |
#creating an interface for the headline generator using gradio
|
100 |
demo = gr.Interface(final_summary, inputs=[gr.inputs.Textbox(label="Drop your article here", optional=False)],
|
|
|
36 |
# Load the distilled BART CNN summarization checkpoint and its matching
# tokenizer; both are used by final_summary below. Runs on CPU — the
# .to(DEVICE) calls in final_summary are commented out.
model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
|
39 |
+
#Defining a function to get the summary of the article
|
40 |
#Defining a function to get the summary of the article
|
41 |
def final_summary(file):
    """Summarize an article into a bulleted list of summary sentences.

    The cleaned text is split into sentences with spaCy, the sentences are
    grouped into roughly equal batches (about a tenth of what remains per
    pass), and each batch is summarized with the BART model. Batches that
    tokenize to fewer than 100 tokens are kept verbatim; batches longer than
    1024 tokens (the model's input limit) are halved by sentence and each
    half summarized separately. The resulting summary sentences are returned
    one per line, each prefixed with "* ".

    Args:
        file: the raw article text (a string, despite the name — it comes
            from a gradio Textbox).

    Returns:
        A single string with one "* "-prefixed bullet per summary sentence,
        joined by newlines.
    """
    # Clean the text and tokenize it into sentence strings.
    text = clean_text(file)
    chunks = [str(sentence) for sentence in nlp(text).sents]

    output = []
    sentences_remaining = len(chunks)
    i = 0

    # Loop through the sentences in roughly equal batches and summarize each.
    while sentences_remaining > 0:
        chunks_remaining = math.ceil(sentences_remaining / 10.0)
        next_chunk_size = math.ceil(sentences_remaining / chunks_remaining)
        sentence = "".join(chunks[i:i + next_chunk_size])

        i += next_chunk_size
        sentences_remaining -= next_chunk_size

        inputs = tokenizer(sentence, return_tensors="pt", padding="longest")
        # inputs = inputs.to(DEVICE)
        original_input_length = len(inputs["input_ids"][0])

        if original_input_length < 100:
            # Too short to be worth summarizing — keep the batch as-is.
            output.append(sentence)

        elif original_input_length > 1024:
            # Batch exceeds the model's 1024-token input limit: split it in
            # half by sentence and summarize each half on its own.
            sent = sent_tokenize(sentence)
            length_sent = len(sent)

            j = 0
            sent_remaining = math.ceil(length_sent / 2)

            while length_sent > 0:
                halved_sentence = "".join(sent[j:j + sent_remaining])
                halved_inputs = tokenizer(halved_sentence, return_tensors="pt")
                # halved_inputs = halved_inputs.to(DEVICE)
                halved_summary_ids = model.generate(halved_inputs["input_ids"])
                j += sent_remaining
                length_sent -= sent_remaining

                # Keep the summary only if it is shorter than its input.
                if len(halved_summary_ids[0]) < len(halved_inputs["input_ids"][0]):
                    halved_summary = tokenizer.batch_decode(
                        halved_summary_ids,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=False,
                    )[0]
                    output.append(halved_summary)

        else:
            summary_ids = model.generate(inputs["input_ids"])

            # Keep the summary only if it is shorter than its input.
            if len(summary_ids[0]) < original_input_length:
                summary = tokenizer.batch_decode(
                    summary_ids,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False,
                )[0]
                output.append(summary)

    # Re-sentencize the batch summaries and normalize " ." spacing.
    lines = []
    for summary in output:
        for line in nlp(summary).sents:
            line = str(line)
            # Fix: the original only skipped the literal single-space string,
            # letting empty/whitespace-only sentences through as bare bullets;
            # skip every whitespace-only sentence instead.
            if line.strip():
                lines.append(line.replace(" .", ".").strip())

    # The batch summaries are not coherent prose, so present each sentence
    # as its own bullet.
    return "\n".join("* " + line for line in lines)
|
120 |
|
121 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
|
123 |
#creating an interface for the headline generator using gradio
|
124 |
demo = gr.Interface(final_summary, inputs=[gr.inputs.Textbox(label="Drop your article here", optional=False)],
|