Spaces:
Sleeping
Sleeping
from dotenv import load_dotenv | |
load_dotenv() | |
import os | |
import logging | |
from llama_index.core.settings import Settings | |
from llama_index.core.ingestion import IngestionPipeline | |
from llama_index.core.node_parser import SentenceSplitter | |
from llama_index.core.storage.docstore import SimpleDocumentStore | |
from llama_index.core.storage import StorageContext | |
from app.settings import init_settings | |
from app.engine.loaders import get_documents | |
from app.engine.vectordb import get_vector_store | |
# Configure the root logger so INFO-level progress messages are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

# Directory used for persisted storage; the STORAGE_DIR environment variable
# overrides the default of "storage".
STORAGE_DIR = os.environ.get("STORAGE_DIR", "storage")
def get_doc_store():
    """Return the document store to use for ingestion.

    When ``STORAGE_DIR`` exists on disk, the store is loaded from it via
    ``SimpleDocumentStore.from_persist_dir``. Otherwise a new, empty
    ``SimpleDocumentStore`` is returned, since there is nothing to load.
    """
    if not os.path.exists(STORAGE_DIR):
        # No storage directory yet: start from an in-memory store.
        return SimpleDocumentStore()
    return SimpleDocumentStore.from_persist_dir(STORAGE_DIR)
def run_pipeline(docstore, vector_store, documents):
    """Run the ingestion pipeline over ``documents`` and return its nodes.

    The pipeline applies two transformations in order: a ``SentenceSplitter``
    sized from ``Settings.chunk_size`` / ``Settings.chunk_overlap``, then
    ``Settings.embed_model``. It is attached to the given ``docstore`` and
    ``vector_store`` and uses the ``"upserts_and_delete"`` docstore strategy.

    Args:
        docstore: Document store handed to the pipeline.
        vector_store: Vector store handed to the pipeline.
        documents: Documents to ingest.

    Returns:
        Whatever ``IngestionPipeline.run`` returns for these documents.
    """
    splitter = SentenceSplitter(
        chunk_size=Settings.chunk_size,
        chunk_overlap=Settings.chunk_overlap,
    )
    ingestion = IngestionPipeline(
        transformations=[splitter, Settings.embed_model],
        docstore=docstore,
        docstore_strategy="upserts_and_delete",
        vector_store=vector_store,
    )
    # Run the pipeline with a progress display and hand back its result.
    return ingestion.run(show_progress=True, documents=documents)
def persist_storage(docstore, vector_store):
    """Persist ``docstore`` and ``vector_store`` under ``STORAGE_DIR``.

    Both stores are wrapped in a ``StorageContext`` built with
    ``StorageContext.from_defaults``, which is then persisted to
    ``STORAGE_DIR``.

    Args:
        docstore: Document store to persist.
        vector_store: Vector store to persist.
    """
    StorageContext.from_defaults(
        docstore=docstore,
        vector_store=vector_store,
    ).persist(STORAGE_DIR)
def generate_datasource():
    """Ingest the configured documents and persist the resulting storage.

    Steps, in order: initialise settings, fetch the documents, obtain the
    document store and vector store, run the ingestion pipeline, and persist
    both stores. Progress is logged at INFO level before and after.
    """
    init_settings()
    logger.info("Generate index for the provided data")

    # Gather the inputs: source documents plus the two stores.
    source_docs = get_documents()
    doc_store = get_doc_store()
    vec_store = get_vector_store()

    # Ingest; the returned nodes are not needed here.
    run_pipeline(doc_store, vec_store, source_docs)

    # Write the updated stores back to disk.
    persist_storage(doc_store, vec_store)
    logger.info("Finished generating the index")
# Generate the datasource only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    generate_datasource()