arpita-23 committed
Commit be74b6e · verified · 1 Parent(s): 3d566b2

Update vector_bd_dir

Files changed (1)
  1. vector_bd_dir +0 -56
vector_bd_dir CHANGED
@@ -1,56 +0,0 @@
- from langchain_text_splitters import CharacterTextSplitter
- from langchain_huggingface import HuggingFaceEmbeddings
- from langchain_chroma import Chroma
- from langchain.docstore.document import Document
- import pandas as pd
- import os
- import glob
-
- # Define a function to perform vectorization for multiple CSV files
- def vectorize_documents():
-     embeddings = HuggingFaceEmbeddings()
-
-     # Directory containing multiple CSV files
-     csv_directory = "Data"  # Replace with your folder name
-     csv_files = glob.glob(os.path.join(csv_directory, "*.csv"))  # Find all CSV files in the folder
-
-     documents = []
-
-     # Load and concatenate all CSV files
-     for file_path in csv_files:
-         df = pd.read_csv(file_path)
-         for _, row in df.iterrows():
-             # Combine all columns in the row into a single string
-             row_content = " ".join(row.astype(str))
-             documents.append(Document(page_content=row_content))
-
-     # Splitting the text and creating chunks of these documents
-     text_splitter = CharacterTextSplitter(
-         chunk_size=2000,
-         chunk_overlap=500
-     )
-
-     text_chunks = text_splitter.split_documents(documents)
-
-     # Process text chunks in batches
-     batch_size = 5000  # Chroma's batch size limit is 5461, set a slightly smaller size for safety
-     for i in range(0, len(text_chunks), batch_size):
-         batch = text_chunks[i:i + batch_size]
-
-         # Store the batch in Chroma vector DB
-         vectordb = Chroma.from_documents(
-             documents=batch,
-             embedding=embeddings,
-             persist_directory="vector_db_dir"
-         )
-
-     print("Documents Vectorized and saved in VectorDB")
-
- # Expose embeddings if needed
- embeddings = HuggingFaceEmbeddings()
-
-
-
- # Main guard to prevent execution on import
- if __name__ == "__main__":
-     vectorize_documents()
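
For reference, a minimal sketch (not part of this commit) of how the removed batching step could alternatively be written: create the persistent Chroma store once and append each chunk batch with add_documents, instead of calling Chroma.from_documents on every batch. The directory name, batch size, and default HuggingFaceEmbeddings model here simply mirror the removed script and are assumptions about the intended setup.

    # Illustrative sketch, assuming text_chunks is a list of langchain Documents
    from langchain_chroma import Chroma
    from langchain_huggingface import HuggingFaceEmbeddings

    def store_in_batches(text_chunks, batch_size=5000):
        # Build the persistent store once, reusing the same embedding function
        vectordb = Chroma(
            embedding_function=HuggingFaceEmbeddings(),
            persist_directory="vector_db_dir",
        )
        # Add documents in batches to stay under Chroma's per-call limit
        for i in range(0, len(text_chunks), batch_size):
            vectordb.add_documents(text_chunks[i:i + batch_size])
        return vectordb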