nickmuchi committed on
Commit
2fd9d6b
1 Parent(s): 1b5f436

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -2
app.py CHANGED
@@ -16,6 +16,7 @@ import docx2txt
16
  from io import StringIO
17
  from PyPDF2 import PdfFileReader
18
  import warnings
 
19
  warnings.filterwarnings("ignore")
20
 
21
 
@@ -63,6 +64,28 @@ def article_text_extractor(url: str):
63
  chunks[chunk_id] = " ".join(chunks[chunk_id])
64
 
65
  return article_header, chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  def preprocess_plain_text(x):
68
 
@@ -85,6 +108,7 @@ def extract_pdf(file):
85
  for i in range(count):
86
  page = pdfReader.getPage(i)
87
  all_text += page.extractText()
 
88
 
89
  return all_text
90
 
@@ -199,11 +223,11 @@ if is_url:
199
 
200
  elif upload_doc:
201
 
202
- clean_text = preprocess_plain_text(extract_text_from_file(upload_doc))
203
 
204
  else:
205
 
206
- clean_text = preprocess_plain_text(plain_text)
207
 
208
  summarize = st.button("Summarize")
209
 
 
16
  from io import StringIO
17
  from PyPDF2 import PdfFileReader
18
  import warnings
19
+ from nltk import sent_tokenize
20
  warnings.filterwarnings("ignore")
21
 
22
 
 
64
  chunks[chunk_id] = " ".join(chunks[chunk_id])
65
 
66
  return article_header, chunks
67
+
68
def chunk_clean_text(text, max_words=500):
    """Group the sentences of *text* into chunks of at most ``max_words`` words.

    The text is split into sentences with ``sent_tokenize``; consecutive
    sentences are packed into the same chunk until adding the next sentence
    would exceed ``max_words``, at which point a new chunk is started.
    Sentences are never split, so a single sentence longer than
    ``max_words`` still becomes one (oversized) chunk on its own.

    Args:
        text: The cleaned plain text to chunk.
        max_words: Maximum number of space-separated tokens per chunk.
            Defaults to 500, the limit previously hard-coded here.

    Returns:
        A list of chunk strings, each the space-joined words of one or more
        whole sentences. Empty when ``sent_tokenize`` yields no sentences.
    """
    chunks = []

    for sentence in sent_tokenize(text):
        # Words are counted by splitting on a single space, so runs of
        # spaces contribute empty tokens to the count (kept as-is to
        # preserve the existing chunk boundaries).
        words = sentence.split(" ")

        # Extend the current chunk only if one exists and the sentence fits;
        # otherwise (first sentence, or overflow) open a new chunk.
        if chunks and len(chunks[-1]) + len(words) <= max_words:
            chunks[-1].extend(words)
        else:
            chunks.append(words)

    return [" ".join(chunk_words) for chunk_words in chunks]
89
 
90
  def preprocess_plain_text(x):
91
 
 
108
  for i in range(count):
109
  page = pdfReader.getPage(i)
110
  all_text += page.extractText()
111
+
112
 
113
  return all_text
114
 
 
223
 
224
  elif upload_doc:
225
 
226
+ clean_text = chunk_clean_text(preprocess_plain_text(extract_text_from_file(upload_doc)))
227
 
228
  else:
229
 
230
+ clean_text = chunk_clean_text(preprocess_plain_text(plain_text))
231
 
232
  summarize = st.button("Summarize")
233