abdulmatinomotoso committed
Commit 924e74c (1 parent: 0dea9aa)

Update app.py

Files changed (1): app.py (+80 -56)
app.py CHANGED
@@ -36,65 +36,89 @@ from transformers import BartTokenizer, BartForConditionalGeneration
  model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
  tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

  #Defining a function to get the summary of the article
  def final_summary(file):
-     #reading in the text and tokenizing it into sentence
-     text = clean_text(file)
-     chunks = sent_tokenize(text)
-     output = []
-     sentences_remaining = len(chunks)
-
-     #looping through the sentences in a batch of 10 and summarizing them
-     i=0
-     while (sentences_remaining > 0):
-
-         chunks_remaining = math.ceil(sentences_remaining / 10.0)
-         next_chunk_size = math.ceil(sentences_remaining / chunks_remaining)
-         sentence = ' '.join(chunks[i:i+(next_chunk_size)])
-
-         i += next_chunk_size
-         sentences_remaining -= next_chunk_size
-
-         inputs = tokenizer(sentence, return_tensors="pt", padding='longest')
-         original_input_length = len(inputs['input_ids'][0])
-
-         if original_input_length < 150:
-             output.append(sentence)
-
-         elif original_input_length > 1024:
-             sent = sent_tokenize(sentence)
-             length_sent = len(sent)
-
-             j=0
-             sent_remaining = math.ceil(length_sent / 2)
-             while length_sent >0:
-
-                 #next_sent_size = math.ceil(length_sent / sent_remaining)
-                 halved_sentence = ' '.join(sent[j:j+(sent_remaining)])
-                 halved_inputs = tokenizer(halved_sentence, return_tensors="pt")
-                 halved_summary_ids = model.generate(halved_inputs["input_ids"])
-                 j += sent_remaining
-                 length_sent -= sent_remaining
-
-                 if (len(halved_summary_ids[0])) < (len(halved_inputs['input_ids'][0])):
-                     halved_summary = tokenizer.batch_decode(halved_summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-                     output.append(halved_summary)
-
-         else:
-             summary_ids = model.generate(inputs["input_ids"])
-
-             if (len(summary_ids[0])) < original_input_length:
-                 summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-                 output.append(summary)

-     #joining all the summary output together
-     summary = ' '.join(output)
-     lines1 = sent_tokenize(summary)
-     for i in range(len(lines1)):
-         lines1[i] = "* " + lines1[i].strip().replace(' .', '.')
-
-     summ_bullet1 = "\n".join(lines1)
-     return summ_bullet1

  #creating an interface for the headline generator using gradio
  demo = gr.Interface(final_summary, inputs=[gr.inputs.Textbox(label="Drop your article here", optional=False)],
 
  model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
  tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

+ #Defining a function to get the summary of the article
  #Defining a function to get the summary of the article
  def final_summary(file):
+     #reading in the text and tokenizing it into sentences
+     text = clean_text(file)
+
+     chunks = []
+     sentences = nlp(text)
+     for sentence in sentences.sents:
+         chunks.append(str(sentence))
+
+     output = []
+     sentences_remaining = len(chunks)
+     i = 0
+
+     # looping through the sentences in equal batches based on their length and summarizing them
+     while sentences_remaining > 0:
+         chunks_remaining = math.ceil(sentences_remaining / 10.0)
+         next_chunk_size = math.ceil(sentences_remaining / chunks_remaining)
+         sentence = "".join(chunks[i:i+next_chunk_size])
+
+         i += next_chunk_size
+         sentences_remaining -= next_chunk_size
+
+         inputs = tokenizer(sentence, return_tensors="pt", padding="longest")
+         #inputs = inputs.to(DEVICE)
+         original_input_length = len(inputs["input_ids"][0])
+
+         # checking if the length of the input batch is less than 100
+         if original_input_length < 100:
+             output.append(sentence)
+
+
+         # checking if the length of the input batch is greater than 1024
+         elif original_input_length > 1024:
+             sent = sent_tokenize(sentence)
+             length_sent = len(sent)
+
+             j = 0
+             sent_remaining = math.ceil(length_sent / 2)
+
+             # going through the batch that is greater than 1024 and dividing it
+             while length_sent > 0:
+                 halved_sentence = "".join(sent[j:j+sent_remaining])
+                 halved_inputs = tokenizer(halved_sentence, return_tensors="pt")
+                 #halved_inputs = halved_inputs.to(DEVICE)
+                 halved_summary_ids = model.generate(halved_inputs["input_ids"])
+                 j += sent_remaining
+                 length_sent -= sent_remaining
+
+                 # checking if the output summary is shorter than the original text
+                 if len(halved_summary_ids[0]) < len(halved_inputs["input_ids"][0]):
+                     halved_summary = tokenizer.batch_decode(halved_summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+                     output.append(halved_summary)
+
+         else:
+             summary_ids = model.generate(inputs["input_ids"])
+
+             if len(summary_ids[0]) < original_input_length:
+                 summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+                 output.append(summary)
+
+     # joining all the summary output together
+     #summary = "".join(output)
+     #lines = summary.split(" . ")
+
+     lines = []
+     for summary in output:
+         summary = nlp(summary)
+         for line in summary.sents:
+             line = str(line)
+             if line != " ":
+                 lines.append(line.replace(" .", ".").strip())
+
+     for i in range(len(lines)):
+         lines[i] = "* " + lines[i]
+
+     # final sentences are incoherent, so we join them with a bullet separator
+     summary_bullet = "\n".join(lines)
+
+     return summary_bullet

+

  #creating an interface for the headline generator using gradio
  demo = gr.Interface(final_summary, inputs=[gr.inputs.Textbox(label="Drop your article here", optional=False)],
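
The substantive change in this commit is the switch from NLTK's sent_tokenize to spaCy sentence segmentation for the initial chunking (sent_tokenize is still used in the >1024-token fallback). Assuming nlp is a spaCy pipeline loaded elsewhere in app.py, e.g. en_core_web_sm, the new splitting reduces to this minimal sketch:

    import spacy

    nlp = spacy.load("en_core_web_sm")  # assumed; app.py defines nlp elsewhere

    def sentences_of(text):
        # spaCy sentence segmentation, as used by the updated final_summary
        return [str(sent) for sent in nlp(text).sents]

    print(sentences_of("First sentence. Second one!"))

One thing to watch: str(sent) drops the whitespace after each sentence, so the "".join(...) calls in the new code can glue adjacent sentences together; ' '.join(...) would preserve the word boundaries.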
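The batching math itself is unchanged by the commit: the loop targets ceil(remaining / 10) chunks and recomputes the chunk size on every pass, so the last chunk stays close in size to the others instead of collecting the leftovers. A standalone sketch of that logic (split_into_chunks is a hypothetical helper name, not defined in app.py):

    import math

    def split_into_chunks(sentences, max_per_chunk=10):
        # Split a sentence list into roughly equal chunks of at most
        # max_per_chunk items, mirroring the while loop in final_summary.
        chunks = []
        i = 0
        remaining = len(sentences)
        while remaining > 0:
            n_chunks = math.ceil(remaining / max_per_chunk)  # chunks still needed
            size = math.ceil(remaining / n_chunks)           # balanced chunk size
            chunks.append(sentences[i:i + size])
            i += size
            remaining -= size
        return chunks

    # 23 sentences -> chunks of 8, 8 and 7 rather than 10, 10 and 3
    print([len(c) for c in split_into_chunks(list(range(23)))])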