suneeln-duke's picture
push
85300c0
raw
history blame
No virus
2.47 kB
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
import os
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import PyPDF2
def read_pages(pdf_file):
    """Extract the text of every page in a PDF.

    Args:
        pdf_file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        list[str]: One extracted-text string per page, in document order.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Iterate the pages collection directly instead of indexing via
    # range(len(...)) — same order, same results, idiomatic Python.
    return [page.extract_text() for page in reader.pages]
def get_chunks(file_path):
    """Load a PDF and split its contents into overlapping text chunks.

    Args:
        file_path: Path to the PDF file to load.

    Returns:
        The list of document chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # 300-char chunks with 100-char overlap; start indices are recorded
    # so each chunk can be traced back to its position in the source.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    docs = PyPDFLoader(file_path).load()
    return splitter.split_documents(docs)
def get_vectordb(chunks, CHROMA_PATH):
    """Open an existing Chroma vector store, or build one from ``chunks``.

    Args:
        chunks: Document chunks to embed when the store does not yet exist.
        CHROMA_PATH: Store name; resolved under ``../../data/chroma/``.

    Returns:
        A ``Chroma`` vector store backed by OpenAI embeddings.
    """
    # Keep the resolved path in a separate local rather than rebinding
    # the parameter.
    persist_dir = f"../../data/chroma/{CHROMA_PATH}"
    if not os.path.exists(persist_dir):
        # First use: embed the chunks and persist the new store to disk.
        db = Chroma.from_documents(
            chunks, OpenAIEmbeddings(), persist_directory=persist_dir
        )
        db.persist()
        print(f"Saved {len(chunks)} chunks to {persist_dir}.")
        return db
    # Store already exists on disk — just reopen it.
    return Chroma(persist_directory=persist_dir, embedding_function=OpenAIEmbeddings())
def gen_summary(text, db):
    """Summarize a story chunk using retrieval-augmented generation.

    Retrieves the 5 most relevant chunks from ``db`` as context, asks the
    chat model for a 5-7 sentence narrative summary, and parses the
    model's JSON reply.

    Args:
        text: The story chunk to summarize.
        db: A Chroma vector store supporting
            ``similarity_search_with_relevance_scores``.

    Returns:
        dict: The parsed JSON response with ``summary`` and ``text`` keys.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    import json  # local import so this fix is self-contained

    PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
---
Answer the question based on the above context: {question}
"""
    # NOTE: the JSON template previously contained a malformed key
    # ("text: with no closing quote) and a typo ("narrartive"), which
    # instructed the model to emit broken JSON — both fixed here.
    query_text = f"""
Summarize the given chunk from a story. The summary should be of narrative nature and be around 5-7 sentences long.
```{text}```
Generate response in the following JSON format:
{{
    "summary": "Your summary here.",
    "text": "The original text here."
}}
"""
    # Pull the top-5 most relevant chunks to ground the summary.
    results = db.similarity_search_with_relevance_scores(query_text, k=5)
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    model = ChatOpenAI()
    response_text = model.predict(prompt)
    # SECURITY FIX: eval() would execute arbitrary code returned by the
    # model; json.loads only parses data.
    return json.loads(response_text)