"""Shared configuration constants: paths, Chroma settings, loader map and embedding model."""
import os

# from dotenv import load_dotenv
from chromadb.config import Settings

# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel
from langchain.document_loaders import (
    CSVLoader,
    PDFMinerLoader,
    TextLoader,
    UnstructuredExcelLoader,
    Docx2txtLoader,
)

# load_dotenv()
# Absolute directory containing this file (symlinks resolved). The paths below
# are anchored here, so they do not depend on the current working directory.
ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))

# Folder holding the source documents.
# NOTE(review): presumably the folder the ingestion script reads from - confirm.
SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/SOURCE_DOCUMENTS"

# Folder where the vector database is persisted (passed to Chroma below).
PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/DB"

# Number of ingestion threads. os.cpu_count() may return None when the CPU
# count cannot be determined, hence the fallback to 8. Can be changed to a
# specific number.
INGEST_THREADS = os.cpu_count() or 8
# Chroma settings: use the "duckdb+parquet" implementation, persist to
# PERSIST_DIRECTORY, and turn anonymized telemetry off.
# NOTE(review): `chroma_db_impl` is a legacy option; newer chromadb releases
# are reported to reject it - confirm against the pinned chromadb version.
CHROMA_SETTINGS = Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory=PERSIST_DIRECTORY,
    anonymized_telemetry=False,
)
# https://python.langchain.com/en/latest/_modules/langchain/document_loaders/excel.html#UnstructuredExcelLoader
# Maps a file extension (with its leading dot) to the LangChain loader class
# used for files of that type.
DOCUMENT_MAP = {
    ".txt": TextLoader,
    ".md": TextLoader,
    ".py": TextLoader,
    ".pdf": PDFMinerLoader,
    ".csv": CSVLoader,
    ".xls": UnstructuredExcelLoader,
    ".xlsx": UnstructuredExcelLoader,
    ".docx": Docx2txtLoader,
    # NOTE(review): Docx2txtLoader targets the zip-based .docx format; legacy
    # binary .doc files will likely fail to load with it - confirm, and
    # consider a dedicated Word loader if .doc support is required.
    ".doc": Docx2txtLoader,
}
# Default Instructor embedding model.
EMBEDDING_MODEL_NAME = "hkunlp/instructor-large"

# You can also choose a smaller model; if you do, don't forget to change
# HuggingFaceInstructEmbeddings to HuggingFaceEmbeddings in both ingest.py
# and run_localGPT.py.
# EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"