Spaces: Running on CPU Upgrade
Update auditqa/doc_process.py
Browse files
- auditqa/doc_process.py +23 -0
auditqa/doc_process.py
CHANGED
@@ -50,4 +50,27 @@ def process_pdf():
|
|
50 |
doc.metadata["year"] = file[-4:]
|
51 |
|
52 |
all_documents[category].append(doc_processed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
|
|
50 |
doc.metadata["year"] = file[-4:]
|
51 |
|
52 |
all_documents[category].append(doc_processed)
|
53 |
+
|
54 |
+
for key, docs_processed in all_documents.items():
|
55 |
+
docs_processed = [item for sublist in docs_processed for item in sublist]
|
56 |
+
all_documents[key] = docs_processed
|
57 |
+
|
58 |
+
embeddings = HuggingFaceEmbeddings(
|
59 |
+
model_kwargs = {'device': device},
|
60 |
+
encode_kwargs = {'normalize_embeddings': True},
|
61 |
+
model_name="BAAI/bge-small-en-v1.5"
|
62 |
+
)
|
63 |
+
|
64 |
+
qdrant_collections = {}
|
65 |
+
|
66 |
+
for file,value in all_documents.items():
|
67 |
+
print("emebddings for:",file)
|
68 |
+
qdrant_collections[file] = Qdrant.from_documents(
|
69 |
+
value,
|
70 |
+
embeddings,
|
71 |
+
location=":memory:",
|
72 |
+
collection_name=file,
|
73 |
+
)
|
74 |
+
print("done")
|
75 |
+
return qdrant_collections
|
76 |
|