arpita-23 committed
Commit be74b6e · verified · 1 Parent(s): 3d566b2

Update vector_bd_dir

Files changed (1)
  1. vector_bd_dir +0 -56
vector_bd_dir CHANGED
@@ -1,56 +0,0 @@
- from langchain_text_splitters import CharacterTextSplitter
- from langchain_huggingface import HuggingFaceEmbeddings
- from langchain_chroma import Chroma
- from langchain.docstore.document import Document
- import pandas as pd
- import os
- import glob
-
- # Define a function to perform vectorization for multiple CSV files
- def vectorize_documents():
-     embeddings = HuggingFaceEmbeddings()
-
-     # Directory containing multiple CSV files
-     csv_directory = "Data"  # Replace with your folder name
-     csv_files = glob.glob(os.path.join(csv_directory, "*.csv"))  # Find all CSV files in the folder
-
-     documents = []
-
-     # Load and concatenate all CSV files
-     for file_path in csv_files:
-         df = pd.read_csv(file_path)
-         for _, row in df.iterrows():
-             # Combine all columns in the row into a single string
-             row_content = " ".join(row.astype(str))
-             documents.append(Document(page_content=row_content))
-
-     # Splitting the text and creating chunks of these documents
-     text_splitter = CharacterTextSplitter(
-         chunk_size=2000,
-         chunk_overlap=500
-     )
-
-     text_chunks = text_splitter.split_documents(documents)
-
-     # Process text chunks in batches
-     batch_size = 5000  # Chroma's batch size limit is 5461, set a slightly smaller size for safety
-     for i in range(0, len(text_chunks), batch_size):
-         batch = text_chunks[i:i + batch_size]
-
-         # Store the batch in Chroma vector DB
-         vectordb = Chroma.from_documents(
-             documents=batch,
-             embedding=embeddings,
-             persist_directory="vector_db_dir"
-         )
-
-     print("Documents Vectorized and saved in VectorDB")
-
- # Expose embeddings if needed
- embeddings = HuggingFaceEmbeddings()
-
-
-
- # Main guard to prevent execution on import
- if __name__ == "__main__":
-     vectorize_documents()
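
For reference, a minimal sketch (not part of this commit) of how the removed batching step could alternatively be written: create the persistent Chroma store once and append each chunk batch with add_documents, instead of calling Chroma.from_documents on every batch. The directory name, batch size, and default HuggingFaceEmbeddings model here simply mirror the removed script and are assumptions about the intended setup.

    # Illustrative sketch, assuming text_chunks is a list of langchain Documents
    from langchain_chroma import Chroma
    from langchain_huggingface import HuggingFaceEmbeddings

    def store_in_batches(text_chunks, batch_size=5000):
        # Build the persistent store once, reusing the same embedding function
        vectordb = Chroma(
            embedding_function=HuggingFaceEmbeddings(),
            persist_directory="vector_db_dir",
        )
        # Add documents in batches to stay under Chroma's per-call limit
        for i in range(0, len(text_chunks), batch_size):
            vectordb.add_documents(text_chunks[i:i + batch_size])
        return vectordb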