| |
"""
Created on Tue Jul 25 10:36:41 2023

This script uses LangChain and Chroma to load, split, and store PID data.

@author: intern.giwon.kim
"""
| from langchain.embeddings.openai import OpenAIEmbeddings |
| from langchain.vectorstores import Chroma |
| from langchain.text_splitter import CharacterTextSplitter |
| from langchain.document_loaders import UnstructuredURLLoader |
| import os |
| from langchain.document_loaders import PyPDFLoader |
| from langchain.document_loaders import Docx2txtLoader |
| from langchain.document_loaders import TextLoader |
| import datetime |
| |
| os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") |
|
|
| |
| |
| |
| |
def preProcess():
    """Build the local Chroma vector store from the files in ./DataSource.

    Pipeline:
      1. Load every supported file (.pdf, .docx/.doc, .txt) from the
         'DataSource' directory into LangChain documents.
      2. Split the documents into ~1000-character chunks.
      3. Embed the chunks with OpenAI embeddings and persist them to the
         'ChromaDB/' directory.

    Progress (with timestamps) is printed to stdout. Returns None.
    Requires OPENAI_API_KEY to be set for the embedding step.
    """
    now = datetime.datetime.now()
    print("Loading Document - " + str(now.time()))

    source_dir = 'DataSource'
    documents = []
    doc_num = 0
    for file in os.listdir(source_dir):
        path = os.path.join(source_dir, file)
        if file.endswith('.pdf'):
            loader = PyPDFLoader(path)
        elif file.endswith(('.docx', '.doc')):
            loader = Docx2txtLoader(path)
        elif file.endswith('.txt'):
            # NOTE(review): latin-1 was chosen in the original code —
            # presumably the source .txt files are not UTF-8; confirm.
            loader = TextLoader(path, encoding='latin-1')
        else:
            # Skip unsupported file types. (Bug fix: the original code
            # incremented doc_num for EVERY directory entry, so the count
            # included files that were never loaded.)
            continue
        documents.extend(loader.load())
        doc_num += 1
    print(f"{doc_num} number of document loaded")

    now = datetime.datetime.now()
    print("Splitting Document - " + str(now.time()))
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    documents = text_splitter.split_documents(documents)

    now = datetime.datetime.now()
    print("Embedding Document - " + str(now.time()))
    embeddings = OpenAIEmbeddings()
    db = Chroma.from_documents(documents, embeddings, persist_directory="ChromaDB/")
    # Flush the collection to disk, then drop the reference so the client
    # releases its hold on the persist directory.
    db.persist()
    db = None
|
|
|
|