Spaces:
Sleeping
Sleeping
File size: 2,874 Bytes
287a0bc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
import os
import argparse
from tqdm import tqdm
import chromadb
def _read_documents(documents_directory: str) -> tuple:
    """Read every non-empty line of every regular file in a directory.

    Args:
        documents_directory: Directory whose files are read line by line.

    Returns:
        A ``(documents, metadatas)`` pair of equal-length lists.
        ``documents`` holds the whitespace-stripped, non-empty lines;
        ``metadatas`` holds one ``{"filename", "line_number"}`` dict per
        line, where ``line_number`` is 1-based and counts blank lines too.
    """
    documents = []
    metadatas = []
    # Sort so that the order of documents (and therefore the ids derived
    # from it by the caller) does not depend on the file system's arbitrary
    # listing order.
    for filename in sorted(os.listdir(documents_directory)):
        path = os.path.join(documents_directory, filename)
        # Skip sub-directories and other non-file entries; opening them
        # would raise instead of yielding lines.
        if not os.path.isfile(path):
            continue
        # Decode explicitly as UTF-8 rather than relying on the platform's
        # locale-dependent default encoding.
        with open(path, "r", encoding="utf-8") as file:
            for line_number, line in enumerate(
                tqdm(file.readlines(), desc=f"Reading {filename}"), 1
            ):
                line = line.strip()
                # Skip empty lines
                if not line:
                    continue
                documents.append(line)
                metadatas.append(
                    {"filename": filename, "line_number": line_number}
                )
    return documents, metadatas


def main(
    documents_directory: str = "documents",
    collection_name: str = "documents_collection",
    persist_directory: str = ".",
    batch_size: int = 100,
) -> None:
    """Load the lines of all files in a directory into a Chroma collection.

    Each non-empty line becomes one document, stored together with the name
    of the file it came from and its line number.

    Args:
        documents_directory: Directory containing the text files to load.
        collection_name: Name of the Chroma collection to create or extend.
        persist_directory: Path handed to ``chromadb.PersistentClient``.
        batch_size: Number of documents sent per ``collection.add`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    documents, metadatas = _read_documents(documents_directory)

    # Instantiate a persistent chroma client in the persist_directory.
    # Learn more at docs.trychroma.com
    client = chromadb.PersistentClient(path=persist_directory)

    # If the collection already exists, we just return it. This allows us to
    # add more data to an existing collection.
    collection = client.get_or_create_collection(name=collection_name)

    # Create ids from the current count.
    # NOTE(review): this only yields unused ids if the collection's existing
    # ids are exactly "0".."count-1" (e.g. nothing was ever deleted) - confirm.
    count = collection.count()
    print(f"Collection already contains {count} documents")
    ids = [str(i) for i in range(count, count + len(documents))]

    # Load the documents in batches; unit_scale makes the progress bar count
    # documents rather than batches.
    for start in tqdm(
        range(0, len(documents), batch_size),
        desc="Adding documents",
        unit_scale=batch_size,
    ):
        end = start + batch_size
        collection.add(
            ids=ids[start:end],
            documents=documents[start:end],
            metadatas=metadatas[start:end],  # type: ignore
        )

    new_count = collection.count()
    print(f"Added {new_count - count} documents")
if __name__ == "__main__":
    # Script entry point: expose the data directory, collection name and
    # persist directory as command-line options, then hand them to main().
    cli = argparse.ArgumentParser(
        description="Load documents from a directory into a Chroma collection"
    )

    # (flag, default value, help text) for each string-valued option.
    option_specs = (
        (
            "--data_directory",
            "documents",
            "The directory where your text files are stored",
        ),
        (
            "--collection_name",
            "documents_collection",
            "The name of the Chroma collection",
        ),
        (
            "--persist_directory",
            "chroma_storage",
            "The directory where you want to store the Chroma collection",
        ),
    )
    for flag, default_value, help_text in option_specs:
        cli.add_argument(flag, type=str, default=default_value, help=help_text)

    options = cli.parse_args()

    main(
        documents_directory=options.data_directory,
        collection_name=options.collection_name,
        persist_directory=options.persist_directory,
    )
|