Spaces:
Sleeping
Sleeping
import chromadb | |
from chromadb.utils import embedding_functions | |
from tqdm import tqdm | |
import os | |
from typing import List, Dict | |
class TextEmbedder:
    """Chunk a text file and store the chunks in a ChromaDB collection.

    Documents are embedded with the ``all-MiniLM-L6-v2`` sentence-transformer
    model and stored in a collection configured for cosine distance.
    """

    # Number of chunks passed to a single ``collection.add`` call. Sending
    # chunks in batches avoids one embedding/insert round trip per chunk.
    _ADD_BATCH_SIZE = 64

    def __init__(self, collection_name: str = "text_collection"):
        """Create the ChromaDB client, embedding function and collection.

        Args:
            collection_name: Name of the collection that receives the chunks.
        """
        # Initialize ChromaDB client
        self.chroma_client = chromadb.Client()
        # Initialize embedding function
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )
        # get_or_create_collection reuses an existing collection of the same
        # name; create_collection would raise in that case, which made it
        # impossible to build a second embedder with the same name.
        self.collection = self.chroma_client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_function,
            metadata={"hnsw:space": "cosine"},
        )

    @staticmethod
    def _split_into_chunks(text: str, chunk_size: int) -> List[str]:
        """Split *text* into consecutive pieces of at most *chunk_size* characters.

        Raises:
            ValueError: If *chunk_size* is not positive. A negative step would
                otherwise yield zero chunks and look like a successful run.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
        return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

    @staticmethod
    def _index_label(index_lines: List[str], position: int) -> str:
        """Return the stripped index line for *position*, or a "Chunk N" fallback."""
        if position < len(index_lines):
            return index_lines[position].strip()
        return f"Chunk {position + 1}"

    def process_files(self, text_file: str, index_file: str, chunk_size: int = 512):
        """Read both files, chunk the text and add every chunk to the collection.

        The i-th chunk is labelled with the i-th line of *index_file*; chunks
        beyond the end of the index file are labelled ``"Chunk <i+1>"``.

        Args:
            text_file: Path of the UTF-8 text file to chunk and embed.
            index_file: Path of the UTF-8 file holding one label per line.
            chunk_size: Maximum number of characters per chunk; must be positive.

        Returns:
            True when all chunks were added. False when any error occurred;
            the error is printed instead of being raised.
        """
        try:
            # Read main text file
            print("Reading main text file...")
            with open(text_file, 'r', encoding='utf-8') as f:
                text_content = f.read()

            # Read index file
            print("Reading index file...")
            with open(index_file, 'r', encoding='utf-8') as f:
                index_lines = f.readlines()

            # Create chunks from text content
            chunks = self._split_into_chunks(text_content, chunk_size)
            print(f"Created {len(chunks)} chunks from text")

            # Record the file that was actually processed rather than a
            # hard-coded name, so the metadata stays correct for any input.
            source_name = os.path.basename(text_file)

            # Add documents to collection, one batch per call. An empty text
            # yields no batches, so ``add`` is never called with empty lists.
            print("Adding documents to ChromaDB...")
            batch_size = self._ADD_BATCH_SIZE
            for start in tqdm(range(0, len(chunks), batch_size)):
                batch = chunks[start:start + batch_size]
                chunk_numbers = range(start, start + len(batch))
                self.collection.add(
                    documents=batch,
                    ids=[f"doc_{number}" for number in chunk_numbers],
                    metadatas=[
                        {
                            "index": self._index_label(index_lines, number),
                            "chunk_number": number,
                            "source": source_name,
                        }
                        for number in chunk_numbers
                    ],
                )

            print("Successfully processed all documents!")
            return True
        except Exception as e:
            # Boundary handler: callers rely on the boolean result, so any
            # failure (I/O, validation, ChromaDB) is reported and mapped to False.
            print(f"Error processing files: {str(e)}")
            return False
def main():
    """Embed the default text/index files and print whether the run succeeded."""
    # Build the embedder and run it on the default inputs in one step.
    succeeded = TextEmbedder().process_files(
        text_file='a2023-45.txt',
        index_file='index.txt',
    )

    # Report the failure first, then fall through to the success message.
    if not succeeded:
        print("Embedding process failed!")
        return
    print("Embedding process completed successfully!")
# Run the embedding pipeline only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()