Spaces:
Runtime error
Runtime error
import os | |
from pyvi.ViTokenizer import tokenize | |
from langchain_community.document_loaders import PyPDFLoader | |
from langchain_text_splitters import RecursiveCharacterTextSplitter | |
import pymongo | |
from generate_embedding import generate_embedding | |
# Secrets are supplied through the environment instead of being committed to
# this file. OPENAI_API_KEY is no longer assigned here; it must be exported by
# the deployment (presumably it is read by the OpenAI client used downstream —
# confirm against generate_embedding).
# NOTE(review): the OpenAI key and the MongoDB password that used to be
# hard-coded at this spot were published with the file and should be rotated.

# Connect DB
_mongodb_uri = os.environ.get("MONGODB_URI")
if not _mongodb_uri:
    raise RuntimeError(
        "MONGODB_URI is not set; export the MongoDB connection string "
        "before running this script."
    )
client = pymongo.MongoClient(_mongodb_uri)
db = client.rag
collection = db.pdf
def insertData(chunk):
    """Insert a batch of documents into the ``pdf`` collection.

    Args:
        chunk: Documents to store (``indexData`` passes a list of dicts).

    Returns:
        The result of ``collection.insert_many``, or ``None`` when ``chunk``
        is empty and nothing was written.
    """
    # pymongo's insert_many rejects an empty batch, so skip the call entirely.
    if not chunk:
        return None
    return collection.insert_many(chunk)
def deleteByUserId(user_id: str):
    """Delete every stored document whose ``user_id`` field equals *user_id*.

    Returns:
        Whatever ``collection.delete_many`` returns for the deletion.
    """
    criteria = {"user_id": user_id}
    outcome = collection.delete_many(criteria)
    return outcome
def readFromPDF(
    path: str = "data/cds.pdf",
    first_page: int = 10,
    chunk_size: int = 768,
    chunk_overlap: int = 200,
):
    """Load a PDF and split it into indexed text chunks.

    Args:
        path: Location of the PDF to read.
        first_page: Lowest value of the loader's ``page`` metadata to keep;
            pages with a smaller value are dropped.
        chunk_size: ``chunk_size`` handed to the text splitter.
        chunk_overlap: ``chunk_overlap`` handed to the text splitter.

    Returns:
        A list of ``{"page_content": ..., "index": ...}`` dicts, one per
        chunk, where ``index`` is the chunk's position in the split output.
    """
    loader = PyPDFLoader(path)
    # NOTE(review): load_and_split() appears to run a default splitter before
    # the explicit one below; loader.load() may be what was intended — confirm
    # before changing, since it can alter the resulting chunks.
    pages = [
        page
        for page in loader.load_and_split()
        if page.metadata["page"] >= first_page
    ]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(pages)
    return [
        {"page_content": chunk.page_content, "index": index}
        for index, chunk in enumerate(chunks)
    ]
def indexData(user_id: str):
    """Rebuild the stored, embedded chunks for *user_id* from the PDF.

    Reads the chunks via ``readFromPDF``, tokenizes each chunk's text with
    ``tokenize``, embeds the tokenized text with ``generate_embedding``, then
    replaces the documents previously stored for *user_id* with the new ones.

    Args:
        user_id: Value written to every document's ``user_id`` field and used
            to select the old documents to delete.

    Raises:
        ValueError: If the PDF produced no chunks. Existing documents are
            left untouched in that case.
    """
    contents = [
        {
            # The raw text is stored; the embedding is computed on its
            # tokenized form.
            "page_content": item["page_content"],
            "page_content_embedding": generate_embedding(
                tokenize(item["page_content"])
            ),
            "user_id": user_id,
            "index": item["index"],
        }
        for item in readFromPDF()
    ]
    if not contents:
        # Fail before deleting anything: wiping the old documents and then
        # having nothing to insert would leave the index empty.
        raise ValueError("no chunks were read from the PDF; index left unchanged")

    # Every embedding is ready at this point, so the old documents are only
    # removed immediately before their replacements are written.
    deleteByUserId(user_id)
    insertData(contents)


# Script-level call: the index is rebuilt whenever this module is executed.
# NOTE(review): the PDF file name is passed as the user_id — confirm intended.
indexData("cds.pdf")
# Unused draft of the RAG answer chain, kept for reference:
# prompt = hub.pull("rlm/rag-prompt")
# llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
# def format_docs(docs):
#     return "\n\n".join(doc.page_content for doc in docs)
# rag_chain = (
#     {"context": retriever | format_docs, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )