Update functions.py
functions.py  CHANGED  (+26 −0)
@@ -136,6 +136,32 @@ def process_corpus(corpus, _tokenizer, title, embedding_model, chunk_size=200, o
 
     return docsearch
 
+@st.experimental_singleton(suppress_st_warning=True)
+def chunk_and_preprocess_text(text,thresh=500):
+
+    """Chunk text longer than n tokens for summarization"""
+
+    sentences = sent_tokenize(clean_text(text))
+    #sentences = [i.text for i in list(article.sents)]
+
+    current_chunk = 0
+    chunks = []
+
+    for sentence in sentences:
+        if len(chunks) == current_chunk + 1:
+            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= thresh:
+                chunks[current_chunk].extend(sentence.split(" "))
+            else:
+                current_chunk += 1
+                chunks.append(sentence.split(" "))
+        else:
+            chunks.append(sentence.split(" "))
+
+    for chunk_id in range(len(chunks)):
+        chunks[chunk_id] = " ".join(chunks[chunk_id])
+
+    return chunks
+
 @st.experimental_singleton(suppress_st_warning=True)
 def gen_embeddings(embedding_model):
 
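Note (not part of the commit): a minimal usage sketch of the new helper, assuming `functions.py` is importable as `functions` and that a plain-text article is available on disk; the input file name is an illustrative assumption.

```python
# Usage sketch (assumption, not from this commit): split a long article into
# roughly 500-word chunks before summarization.
from functions import chunk_and_preprocess_text  # functions.py in this repo

with open("article.txt") as f:   # hypothetical input file
    article_text = f.read()

chunks = chunk_and_preprocess_text(article_text, thresh=500)
print(f"Produced {len(chunks)} chunks of up to ~500 words each")
```

The `st.experimental_singleton(suppress_st_warning=True)` decorator matches the caching pattern already applied to `gen_embeddings` in this file, so repeated Streamlit reruns with the same input reuse the cached chunks instead of re-tokenizing the text.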