nickmuchi committed
Commit b1b5065
1 Parent(s): 85bdaf5

Update functions.py

Files changed (1)
  1. functions.py +26 -0
functions.py CHANGED
@@ -136,6 +136,32 @@ def process_corpus(corpus, _tokenizer, title, embedding_model, chunk_size=200, o
 
     return docsearch
 
+@st.experimental_singleton(suppress_st_warning=True)
+def chunk_and_preprocess_text(text,thresh=500):
+
+    """Chunk text longer than n tokens for summarization"""
+
+    sentences = sent_tokenize(clean_text(text))
+    #sentences = [i.text for i in list(article.sents)]
+
+    current_chunk = 0
+    chunks = []
+
+    for sentence in sentences:
+        if len(chunks) == current_chunk + 1:
+            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= thresh:
+                chunks[current_chunk].extend(sentence.split(" "))
+            else:
+                current_chunk += 1
+                chunks.append(sentence.split(" "))
+        else:
+            chunks.append(sentence.split(" "))
+
+    for chunk_id in range(len(chunks)):
+        chunks[chunk_id] = " ".join(chunks[chunk_id])
+
+    return chunks
+
 @st.experimental_singleton(suppress_st_warning=True)
 def gen_embeddings(embedding_model):
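
For reference, below is a minimal, self-contained sketch of the new chunk_and_preprocess_text helper so it can be tried outside the Streamlit app. It assumes NLTK's sent_tokenize for sentence splitting; the clean_text stub and the sample text are stand-ins (the real clean_text lives elsewhere in functions.py), and the low thresh value is only chosen to make the chunk split visible.

# Standalone sketch: no Streamlit here, so the @st.experimental_singleton
# decorator from the commit is omitted.
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt", quiet=True)  # sentence-splitting model for sent_tokenize


def clean_text(text):
    # Stand-in for the repo's own clean_text: just collapse whitespace.
    return " ".join(text.split())


def chunk_and_preprocess_text(text, thresh=500):
    """Chunk text longer than thresh whitespace-delimited words for summarization."""
    sentences = sent_tokenize(clean_text(text))

    current_chunk = 0
    chunks = []

    for sentence in sentences:
        words = sentence.split(" ")
        if len(chunks) == current_chunk + 1:
            # A chunk is already open: extend it while it stays under the
            # threshold, otherwise start a new chunk with this sentence.
            if len(chunks[current_chunk]) + len(words) <= thresh:
                chunks[current_chunk].extend(words)
            else:
                current_chunk += 1
                chunks.append(words)
        else:
            # No chunk open yet: seed the first one.
            chunks.append(words)

    # Join each word list back into a single string per chunk.
    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = " ".join(chunks[chunk_id])

    return chunks


if __name__ == "__main__":
    sample = "First sentence here. Second sentence follows it. A third sentence ends the sample."
    for i, chunk in enumerate(chunk_and_preprocess_text(sample, thresh=9)):
        print(i, "->", chunk)

Note that the len(chunks) == current_chunk + 1 check simply asks whether a chunk has been started yet, and that sizes are counted in whitespace words rather than model tokens, so thresh should stay comfortably below the summarizer's actual token limit.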