"""Ingest KTH course pages into the vector store.

Reads the plain-text course files previously scraped into
``files/<language>/``, splits each one into token-bounded chunks, and
uploads the chunks to the vector store held on the application ``State``.
Must be run from the ``kth_qa`` project root.
"""
import logging
import os

from langchain.callbacks import get_openai_callback
from langchain.docstore.document import Document
from langchain.text_splitter import NLTKTextSplitter

from config import State

logger = logging.getLogger(__name__)

FILE_DIR = 'files'
KURS_URL = "https://www.kth.se/student/kurser/kurs/{course_code}?l={language}"
DEFAULT_LANGUAGE = "en"
CHUNK_SIZE = 1000  # target chunk size, in tiktoken tokens
CHUNK_OVERLAP = 100  # token overlap between consecutive chunks


def ingest(state: State) -> None:
    """Chunk every course file for DEFAULT_LANGUAGE and add it to the store.

    Args:
        state: Application state carrying the vector store on
            ``state.store`` (presumably Pinecone-backed, per the log
            messages — confirm against ``config.State``).

    Side effects:
        Adds documents to ``state.store`` and logs the total cost of the
        OpenAI API calls made while this function ran.
    """
    with get_openai_callback() as cb:
        # All file paths below are relative, so refuse to run from
        # anywhere but the project root.
        pwd = os.getcwd()
        if os.path.basename(pwd) != 'kth_qa':
            logger.error(
                "pwd is not kth_qa, but %s. Please run from kth_qa directory.",
                pwd,
            )
            return

        text_splitter = NLTKTextSplitter.from_tiktoken_encoder(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
        )

        file_folder_name = os.path.join(FILE_DIR, DEFAULT_LANGUAGE)
        all_langdocs = []
        for file in os.listdir(file_folder_name):
            path = os.path.join(file_folder_name, file)
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read()
            # Filenames look like "<course_code>?l=<language>.<ext>";
            # partition() tolerates names missing the "?l=" marker
            # (the whole stem then becomes the course code).
            filename = file.split('.')[0]
            course_code, _, _language = filename.partition('?l=')
            doc = Document(page_content=text, metadata={"course": course_code})
            logger.debug("loaded file %s", file)

            langdocs = text_splitter.split_documents([doc])
            logger.debug("split documents into %d chunks", len(langdocs))
            all_langdocs.extend(langdocs)
        logger.info("split all documents into %d chunks", len(all_langdocs))

        logger.info("Adding documents to pinecone...")
        state.store.add_documents(all_langdocs)
        logger.info("...done!")
        logger.info("Total cost of openai api calls: %s", cb.total_cost)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    logger.setLevel(logging.INFO)
    state = State()
    ingest(state)