hohieu's picture
init project
841b0ff
raw
history blame
2.01 kB
import os
from pyvi.ViTokenizer import tokenize
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import pymongo
from generate_embedding import generate_embedding
def _require_env(name: str) -> str:
    """Return the value of environment variable ``name`` or fail fast.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The non-empty value of the variable.

    Raises:
        RuntimeError: If the variable is unset or empty.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} must be set; "
            "credentials are not stored in this file."
        )
    return value


# SECURITY: the OpenAI API key and the MongoDB connection string (including
# its password) were previously written here in plain text. They must now be
# supplied through the environment, and the previously committed values
# should be treated as compromised and rotated.
#
# The original code guaranteed that OPENAI_API_KEY was present in os.environ
# after this point; keep that guarantee by failing fast when it is missing.
_require_env("OPENAI_API_KEY")

# Connect DB
# MONGODB_URI is expected to hold the full connection string, e.g.
# mongodb+srv://<user>:<password>@<host>/?retryWrites=true&w=majority&appName=RAG
client = pymongo.MongoClient(_require_env("MONGODB_URI"))
db = client.rag
collection = db.pdf
def insertData(chunk):
    """Insert the documents in ``chunk`` into the module-level collection.

    Args:
        chunk: The documents to store; passed unchanged to ``insert_many``.

    Returns:
        Whatever ``collection.insert_many`` returns.
    """
    insert_result = collection.insert_many(chunk)
    return insert_result
def deleteByUserId(user_id: str):
    """Delete every document whose ``user_id`` field equals ``user_id``.

    Args:
        user_id: Value matched against the ``user_id`` field of stored documents.

    Returns:
        Whatever ``collection.delete_many`` returns.
    """
    query = {"user_id": user_id}
    return collection.delete_many(query)
def readFromPDF(
    pdf_path: str = "data/cds.pdf",
    min_page: int = 10,
    chunk_size: int = 768,
    chunk_overlap: int = 200,
) -> list:
    """Load a PDF and split it into numbered text chunks.

    All parameters default to the values that were previously hard-coded, so
    calling ``readFromPDF()`` with no arguments behaves as before.

    Args:
        pdf_path: Path of the PDF file to load.
        min_page: Pages whose ``metadata['page']`` value is below this number
            are dropped before chunking.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        A list of dicts, one per chunk, each with the keys ``"page_content"``
        (the chunk text) and ``"index"`` (the chunk's position in the list).
    """
    # load PDF
    loader = PyPDFLoader(pdf_path)
    pages = loader.load_and_split()

    # Skip the leading pages.
    # NOTE(review): presumably metadata['page'] is zero-based, which would make
    # the default skip the first ten pages - confirm against the loader docs.
    pages = [page for page in pages if page.metadata["page"] >= min_page]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(pages)

    items = []
    for index, chunk in enumerate(chunks):
        # Progress output, kept so the script prints the same as before.
        print(index)
        items.append({"page_content": chunk.page_content, "index": index})
    return items
def indexData(user_id: str):
    """Rebuild the stored chunks for ``user_id`` from the PDF.

    Every chunk returned by ``readFromPDF()`` is tokenized with ``tokenize``,
    embedded with ``generate_embedding``, and stored together with its text,
    its index and ``user_id``. Existing documents for ``user_id`` are deleted
    just before the new ones are inserted.

    Args:
        user_id: Value written to the ``user_id`` field of every stored
            document, and used to select the documents that are replaced.

    Returns:
        None.
    """
    # Build every document (including its embedding) before touching the
    # database, so a failure while embedding leaves the stored data intact.
    contents = [
        {
            "page_content": item["page_content"],
            "page_content_embedding": generate_embedding(
                tokenize(item["page_content"])
            ),
            "user_id": user_id,
            "index": item["index"],
        }
        for item in readFromPDF()
    ]

    # Nothing to index: stop before the destructive delete. Previously the
    # existing documents were removed first and the insert of an empty list
    # then failed, leaving the user with no stored data at all.
    if not contents:
        return

    deleteByUserId(user_id)
    insertData(contents)
# Script entry point: index the PDF read by readFromPDF under the user id
# "cds.pdf".
# NOTE(review): this call is a top-level statement, so it also runs when the
# module is imported - consider an `if __name__ == "__main__":` guard if any
# other module imports this file.
indexData("cds.pdf")
# Disabled draft of a retrieval/generation chain, kept for reference:
# prompt = hub.pull("rlm/rag-prompt")
# llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
# def format_docs(docs):
#     return "\n\n".join(doc.page_content for doc in docs)
# rag_chain = (
#     {"context": retriever | format_docs, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )