'''
This script loads the plain text of the One Big Beautiful Bill,
splits it into chunks, creates embeddings using OpenAI,
and stores them in a Chroma vectorstore.
'''
import os

from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

load_dotenv()  # Load environment variables from .env file

# Configuration
SOURCE_TEXT_PATH = os.path.join("books", "one_big_beautiful_bill.txt")
CHROMA_DB_PATH = "chroma_db_bill_text"
# Ensure your OPENAI_API_KEY is set as an environment variable
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Chunking parameters (can be adjusted)
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200


def process_and_store_text():
    '''Loads text, splits it, creates embeddings, and stores them in Chroma.'''
    if not OPENAI_API_KEY:
        print("Error: OPENAI_API_KEY environment variable not set.")
        print("Please set your OpenAI API key to proceed.")
        print("Example: export OPENAI_API_KEY='your_api_key_here' (Linux/macOS)")
        print("         $Env:OPENAI_API_KEY='your_api_key_here' (Windows PowerShell)")
        return

    if not os.path.exists(SOURCE_TEXT_PATH):
        print(f"Error: Source text file not found at {SOURCE_TEXT_PATH}")
        print("Please ensure 'fetch_doc.py' has been run successfully.")
        return

    try:
        # 1. Load the text document
        print(f"Loading text from {SOURCE_TEXT_PATH}...")
        loader = TextLoader(SOURCE_TEXT_PATH, encoding='utf-8')
        documents = loader.load()
        print(f"Loaded {len(documents)} document(s).")

        # 2. Split the text into chunks
        print(f"Splitting text into chunks (size: {CHUNK_SIZE}, overlap: {CHUNK_OVERLAP})...")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            length_function=len
        )
        chunks = text_splitter.split_documents(documents)
        print(f"Split into {len(chunks)} chunks.")

        if not chunks:
            print("No chunks were created. Please check the source file and chunking parameters.")
            return

        # 3. Create OpenAI embeddings
        print("Initializing OpenAI embeddings model...")
        embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
        print("OpenAI embeddings model initialized.")

        # 4. Create and persist the Chroma vectorstore
        print(f"Creating/loading Chroma vectorstore at {CHROMA_DB_PATH}...")
        # If the directory already exists, Chroma will try to load it.
        # To ensure a fresh store, delete the CHROMA_DB_PATH directory first.
        vectorstore = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory=CHROMA_DB_PATH
        )
        print(f"Vectorstore created/updated and persisted at {CHROMA_DB_PATH}.")

        print("Processing complete.")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")


if __name__ == "__main__":
    process_and_store_text()
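

# --- Illustrative sketch (not invoked by this script) ---
# A minimal example of how the persisted store created above could later be
# reloaded and queried. The function name, the query string you pass in, and
# the k value are hypothetical placeholders, not part of the ingestion flow.
def example_similarity_search(query, k=4):
    '''Reloads the persisted Chroma store and prints the top-k matching chunks.'''
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vectorstore = Chroma(
        persist_directory=CHROMA_DB_PATH,
        embedding_function=embeddings
    )
    results = vectorstore.similarity_search(query, k=k)
    for i, doc in enumerate(results, start=1):
        # Print a short preview of each matching chunk
        print(f"Result {i}:\n{doc.page_content[:300]}...\n")
# Example usage (hypothetical query):
#   example_similarity_search("What provisions does the bill contain?")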