abdulmatinomotoso committed on
Commit
1527ab6
1 Parent(s): 721e406

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -9
app.py CHANGED
@@ -8,6 +8,7 @@ import gradio as gr
8
  from gradio.mix import Parallel
9
  from transformers import pipeline
10
  import numpy as np
 
11
 
12
  # Defining a function to read in the text file
13
  def read_in_text(url):
@@ -15,6 +16,21 @@ def read_in_text(url):
15
  article = file.read()
16
  return article
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  #initializing the model pipeline
19
  from transformers import BartTokenizer, BartForConditionalGeneration
20
 
@@ -24,19 +40,57 @@ tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
24
  #Defining a function to get the summary of the article
25
  def final_summary(file):
26
  #reading in the text and tokenizing it into sentences
27
- text = read_in_text(file.name)
28
  chunks = sent_tokenize(text)
29
  output = []
 
30
 
31
  #looping through the sentences in batches of 10 and summarizing them
32
- for i in range(0,len(chunks), 10):
33
- sentence = ' '.join(chunks[i:i+10])
34
- inputs = tokenizer(sentence, max_length=1024, return_tensors="pt")
35
- summary_ids = model.generate(inputs["input_ids"])
36
- summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
37
- output.append(summary)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- #joining all the summary output together
40
  summary = ' '.join(output)
41
  lines1 = sent_tokenize(summary)
42
  for i in range(len(lines1)):
@@ -46,7 +100,7 @@ def final_summary(file):
46
  return summ_bullet1
47
 
48
  #creating an interface for the article summarizer using gradio
49
- demo = gr.Interface(final_summary, inputs=[gr.inputs.File(label="Drop your .txt file here", optional=False)],
50
  title = "ARTICLE SUMMARIZER",
51
  outputs=[gr.outputs.Textbox(label="Summary")],
52
  theme= "darkhuggingface")
 
8
  from gradio.mix import Parallel
9
  from transformers import pipeline
10
  import numpy as np
11
+ import math
12
 
13
  # Defining a function to read in the text file
14
  def read_in_text(url):
 
16
  article = file.read()
17
  return article
18
 
19
def clean_text(url):
    """Normalize raw article text before sentence tokenization.

    Args:
        url: The article text to clean. The parameter keeps its historical
            name for backward compatibility; it is a plain string of text,
            not a URL.

    Returns:
        The text with non-ASCII characters removed, every run of whitespace
        (spaces, tabs, newlines, carriage returns) collapsed to a single
        space, and no leading or trailing whitespace.
    """
    # Drop anything that cannot be represented in ASCII (e.g. Chinese
    # characters, emoji, accented letters).
    ascii_text = url.encode("ascii", errors="ignore").decode("ascii")

    # split() with no arguments breaks on any run of whitespace and discards
    # empty leading/trailing pieces, so re-joining with one space collapses
    # and trims in a single pass (and also handles "\r" from pasted
    # Windows-style line endings).
    return " ".join(ascii_text.split())
33
+
34
  #initializing the model pipeline
35
  from transformers import BartTokenizer, BartForConditionalGeneration
36
 
 
40
  #Defining a function to get the summary of the article
41
  def final_summary(file):
42
  #cleaning the text and tokenizing it into sentences
43
+ text = clean_text(file)
44
  chunks = sent_tokenize(text)
45
  output = []
46
+ sentences_remaining = len(chunks)
47
 
48
  #looping through the sentences in evenly sized batches of at most 10 and summarizing them
49
+ i=0
50
+ while (sentences_remaining > 0):
51
+
52
+ chunks_remaining = math.ceil(sentences_remaining / 10.0)
53
+ next_chunk_size = math.ceil(sentences_remaining / chunks_remaining)
54
+ sentence = ' '.join(chunks[i:i+(next_chunk_size)])
55
+
56
+ i += next_chunk_size
57
+ sentences_remaining -= next_chunk_size
58
+
59
+ inputs = tokenizer(sentence, return_tensors="pt", padding='longest')
60
+ if (len(inputs['input_ids'][0])) < 150:
61
+ output.append(sentence)
62
+
63
+ elif (len(inputs['input_ids'][0])) > 1024:
64
+ sent = sent_tokenize(sentence)
65
+ length_sent = len(sent)
66
+
67
+ j=0
68
+ sent_remaining = math.ceil(length_sent / 2)
69
+ while length_sent >0:
70
+
71
+ #next_sent_size = math.ceil(length_sent / sent_remaining)
72
+ halved_sentence = ' '.join(sent[j:j+(sent_remaining)])
73
+ inputs = tokenizer(halved_sentence, return_tensors="pt")
74
+ summary_ids = model.generate(inputs["input_ids"])
75
+ j += sent_remaining
76
+ length_sent -= sent_remaining
77
+
78
+ if (len(summary_ids[0])) < (len(inputs['input_ids'][0])):
79
+ summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
80
+ output.append(summary)
81
+ else:
82
+ continue
83
+
84
+ else:
85
+ summary_ids = model.generate(inputs["input_ids"])
86
+
87
+ if (len(summary_ids[0])) < (len(inputs['input_ids'][0])):
88
+ summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
89
+ output.append(summary)
90
+ else:
91
+ continue
92
 
93
+ #joining all the summary output together
94
  summary = ' '.join(output)
95
  lines1 = sent_tokenize(summary)
96
  for i in range(len(lines1)):
 
100
  return summ_bullet1
101
 
102
  #creating an interface for the article summarizer using gradio
103
+ demo = gr.Interface(final_summary, inputs=[gr.inputs.Textbox(label="Drop your article here", optional=False)],
104
  title = "ARTICLE SUMMARIZER",
105
  outputs=[gr.outputs.Textbox(label="Summary")],
106
  theme= "darkhuggingface")