Upload process_documents.py
Browse files- process_documents.py +4 -1
process_documents.py
CHANGED
@@ -10,12 +10,15 @@ deep_strip = lambda text: re.sub(r"\s+", " ", text or "").strip()
|
|
10 |
|
11 |
def process_documents(urls):
    """Run every URL through the PDF or web processor, gathering snippets.

    A URL whose text ends with ".pdf" is handed to ``process_pdf``; any
    other URL is handed to ``process_web``. Each processor receives the URL
    and its position in ``urls`` as the source id.

    As shown in this version, the gathered list is not returned.
    """
    collected = []
    for index, link in enumerate(urls):
        # Pick the processor first so the call itself is written once.
        handler = process_pdf if link.endswith(".pdf") else process_web
        collected.extend(handler(link, index))
|
18 |
-
|
|
|
19 |
|
20 |
|
21 |
def process_web(url, source_id):
|
|
|
10 |
|
11 |
def process_documents(urls):
    """Process each URL into snippets plus one joined text per URL.

    A URL ending in ".pdf" is passed to ``process_pdf``; every other URL is
    passed to ``process_web``. Each processor is called with the URL and its
    position in ``urls`` as the source id.

    Returns:
        A ``(snippets, documents)`` tuple. ``snippets`` is the flat list of
        all snippets in input order. ``documents`` holds one string per URL:
        the newline-joined ``page_content`` of that URL's own snippets.
    """
    snippets = []
    documents = []
    for source_id, url in enumerate(urls):
        if url.endswith(".pdf"):
            produced = process_pdf(url, source_id)
        else:
            produced = process_web(url, source_id)
        # Materialize once because the result is consumed twice below.
        doc_snippets = list(produced)
        snippets.extend(doc_snippets)
        # Join only this URL's snippets. Joining the running `snippets` list
        # would repeat the text of every earlier URL in each later document.
        documents.append("\n".join(snip.page_content for snip in doc_snippets))
    return snippets, documents
|
22 |
|
23 |
|
24 |
def process_web(url, source_id):
|