Spaces:

lindsay-qu
/

protein-retrieval-multimodal

Sleeping

lindsay-qu committed on Jan 19

Commit

6ec8a6d

•

1 Parent(s): 6c8a93e

Update core/retriever/chroma_retriever.py

Files changed (1) hide show

core/retriever/chroma_retriever.py CHANGED Viewed

@@ -31,6 +31,7 @@ class ChromaRetriever(BaseRetriever):
         if not os.path.exists("persist"):
             os.mkdir("persist")
         client = PersistentClient(path="persist")
         try:
             collection = client.get_collection(name=collection_name)
@@ -41,8 +42,11 @@ class ChromaRetriever(BaseRetriever):
             docs = pdf_loader.load()
             text_splitter = RecursiveCharacterTextSplitter(chunk_size=split_args["size"], chunk_overlap=split_args["overlap"])
-            texts = text_splitter.split_documents(docs)
-            texts = [text.page_content for text in texts]
             collection = client.create_collection(name=collection_name)
             if embed_model is not None:
@@ -50,12 +54,14 @@ class ChromaRetriever(BaseRetriever):
                 collection.add(
                     embeddings=embeddings,
                     documents=texts,
-                    ids=[str(i+1) for i in range(len(texts))]
                 )
             else:
                 collection.add(
                     documents=texts,
-                    ids=[str(i+1) for i in range(len(texts))]
                 )
         self.collection = collection
@@ -82,4 +88,4 @@ class ChromaRetriever(BaseRetriever):
                 query_texts=[query],
                 n_results=k,
             )
-        return results['documents'][0]

         if not os.path.exists("persist"):
             os.mkdir("persist")
         client = PersistentClient(path="persist")
+        print(client.list_collections())
         try:
             collection = client.get_collection(name=collection_name)
             docs = pdf_loader.load()
             text_splitter = RecursiveCharacterTextSplitter(chunk_size=split_args["size"], chunk_overlap=split_args["overlap"])
+            split_docs = text_splitter.split_documents(docs)
+            texts = [doc.page_content for doc in split_docs]
+            # TODO
+            titles = [doc.metadata["title"] for doc in split_docs]
             collection = client.create_collection(name=collection_name)
             if embed_model is not None:
                 collection.add(
                     embeddings=embeddings,
                     documents=texts,
+                    ids=[str(i+1) for i in range(len(texts))],
+                    metadatas=[{"title": title} for title in titles]
                 )
             else:
                 collection.add(
                     documents=texts,
+                    ids=[str(i+1) for i in range(len(texts))],
+                    metadatas=[{"title": title} for title in titles]
                 )
         self.collection = collection
                 query_texts=[query],
                 n_results=k,
             )
+        return results['documents'][0], [result["title"] for result in results['metadatas'][0]]