Spaces:

veerukhannan
/

advisor

Sleeping

App Files Files Community

advisor / add_embeddings.py

veerukhannan

Create add_embeddings.py

bb05d9c verified 6 months ago

raw

history blame

2.83 kB

	import chromadb
	from chromadb.utils import embedding_functions
	from tqdm import tqdm
	import os
	from typing import List, Dict

	class TextEmbedder:
	    """Chunk a text file and store the chunks in a ChromaDB collection.

	    The collection is created on the client returned by ``chromadb.Client()``,
	    embeds documents with the ``all-MiniLM-L6-v2`` sentence-transformer model
	    and is configured with ``hnsw:space`` set to ``cosine``.
	    """

	    # Number of chunks passed to a single ``collection.add`` call, so each
	    # call stores a batch instead of a single chunk.
	    _BATCH_SIZE = 64

	    def __init__(self, collection_name: str = "text_collection"):
	        """Set up the ChromaDB client, embedding function and collection.

	        Args:
	            collection_name: Name of the ChromaDB collection to use.
	        """
	        self.chroma_client = chromadb.Client()

	        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
	            model_name="all-MiniLM-L6-v2"
	        )

	        # get_or_create_collection (rather than create_collection) so that a
	        # second TextEmbedder with the same name reuses the collection
	        # instead of failing because it already exists.
	        self.collection = self.chroma_client.get_or_create_collection(
	            name=collection_name,
	            embedding_function=self.embedding_function,
	            metadata={"hnsw:space": "cosine"},
	        )

	    @staticmethod
	    def _chunk_text(text: str, chunk_size: int) -> List[str]:
	        """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters."""
	        return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

	    @staticmethod
	    def _build_metadata(chunk_number: int, index_lines: List[str], source: str) -> Dict[str, object]:
	        """Build the metadata stored alongside chunk ``chunk_number``."""
	        if chunk_number < len(index_lines):
	            index_text = index_lines[chunk_number].strip()
	        else:
	            # The index file has fewer lines than there are chunks.
	            index_text = f"Chunk {chunk_number + 1}"
	        return {"index": index_text, "chunk_number": chunk_number, "source": source}

	    def process_files(self, text_file: str, index_file: str, chunk_size: int = 512):
	        """Split ``text_file`` into chunks and add them to the collection.

	        Chunk ``i`` is stored under the id ``doc_<i>``. Its metadata holds an
	        ``index`` label (line ``i`` of ``index_file`` with surrounding
	        whitespace stripped, or ``Chunk <i+1>`` when the index file has fewer
	        lines), its ``chunk_number`` and, as ``source``, the base name of
	        ``text_file``.

	        Args:
	            text_file: Path of the UTF-8 text file to embed.
	            index_file: Path of the UTF-8 file providing one label per line.
	            chunk_size: Chunk length in characters; must be positive.

	        Returns:
	            True when every chunk was added. False when ``chunk_size`` is not
	            positive or any other error occurred; the error is printed.
	        """
	        try:
	            if chunk_size <= 0:
	                # A negative step would make range() yield nothing, which
	                # would report success without storing a single chunk.
	                raise ValueError(f"chunk_size must be positive, got {chunk_size}")

	            print("Reading main text file...")
	            with open(text_file, 'r', encoding='utf-8') as f:
	                text_content = f.read()

	            print("Reading index file...")
	            with open(index_file, 'r', encoding='utf-8') as f:
	                index_lines = f.readlines()

	            chunks = self._chunk_text(text_content, chunk_size)
	            print(f"Created {len(chunks)} chunks from text")

	            # Record the file that was actually read rather than a fixed name.
	            source = os.path.basename(text_file)

	            print("Adding documents to ChromaDB...")
	            # The progress bar still counts chunks, although they are sent
	            # to the collection one batch at a time.
	            with tqdm(total=len(chunks)) as progress:
	                for start in range(0, len(chunks), self._BATCH_SIZE):
	                    batch = chunks[start:start + self._BATCH_SIZE]
	                    numbers = range(start, start + len(batch))
	                    self.collection.add(
	                        documents=batch,
	                        ids=[f"doc_{n}" for n in numbers],
	                        metadatas=[
	                            self._build_metadata(n, index_lines, source)
	                            for n in numbers
	                        ],
	                    )
	                    progress.update(len(batch))

	            print("Successfully processed all documents!")
	            return True

	        except Exception as e:
	            # Deliberate catch-all: callers rely on the boolean result.
	            print(f"Error processing files: {str(e)}")
	            return False

	def main():
	    """Embed ``a2023-45.txt`` using ``index.txt`` and print the outcome."""
	    succeeded = TextEmbedder().process_files(
	        text_file='a2023-45.txt',
	        index_file='index.txt',
	    )

	    # Report the failure first so the success message is the fall-through.
	    if not succeeded:
	        print("Embedding process failed!")
	        return

	    print("Embedding process completed successfully!")

	# Run the embedding pipeline only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()