| |
"""
Created on Tue Jul 25 10:36:41 2023

This script uses LangChain and Chroma to load, split, and store PID data.

@author: intern.giwon.kim
"""
| from langchain.embeddings.openai import OpenAIEmbeddings |
| from langchain.vectorstores import Chroma |
| from langchain.text_splitter import CharacterTextSplitter |
| from langchain.document_loaders import UnstructuredURLLoader |
| import os |
| from langchain.document_loaders import PyPDFLoader |
| from langchain.document_loaders import Docx2txtLoader |
| from langchain.document_loaders import TextLoader |
| import datetime |
| |
| os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") |
|
|
| |
| |
| |
| |
def preProcess():
    """Build the local Chroma vector store from the files in ./DataSource.

    Pipeline:
      1. Load every supported file (.pdf, .docx/.doc, .txt) from the
         'DataSource' directory into LangChain documents.
      2. Split the documents into ~1000-character chunks.
      3. Embed the chunks with OpenAI embeddings and persist them to the
         'ChromaDB/' directory.

    Progress (with timestamps) is printed to stdout. Returns None.
    Requires OPENAI_API_KEY to be set for the embedding step.
    """
    now = datetime.datetime.now()
    print("Loading Document - " + str(now.time()))

    source_dir = 'DataSource'
    documents = []
    doc_num = 0
    for file in os.listdir(source_dir):
        path = os.path.join(source_dir, file)
        if file.endswith('.pdf'):
            loader = PyPDFLoader(path)
        elif file.endswith(('.docx', '.doc')):
            loader = Docx2txtLoader(path)
        elif file.endswith('.txt'):
            # NOTE(review): latin-1 was chosen in the original code —
            # presumably the source .txt files are not UTF-8; confirm.
            loader = TextLoader(path, encoding='latin-1')
        else:
            # Skip unsupported file types. (Bug fix: the original code
            # incremented doc_num for EVERY directory entry, so the count
            # included files that were never loaded.)
            continue
        documents.extend(loader.load())
        doc_num += 1
    print(f"{doc_num} number of document loaded")

    now = datetime.datetime.now()
    print("Splitting Document - " + str(now.time()))
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    documents = text_splitter.split_documents(documents)

    now = datetime.datetime.now()
    print("Embedding Document - " + str(now.time()))
    embeddings = OpenAIEmbeddings()
    db = Chroma.from_documents(documents, embeddings, persist_directory="ChromaDB/")
    # Flush the collection to disk, then drop the reference so the client
    # releases its hold on the persist directory.
    db.persist()
    db = None
|
|
|
|