suneeln-duke's picture
push
85300c0
raw
history blame
No virus
2.47 kB
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
import os
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import PyPDF2
def read_pages(pdf_file):
    """Extract the text of every page in a PDF.

    Args:
        pdf_file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        list[str]: One extracted-text string per page, in document order.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Iterate the pages collection directly instead of indexing via
    # range(len(...)) — same order, same results, idiomatic Python.
    return [page.extract_text() for page in reader.pages]
def get_chunks(file_path):
    """Load a PDF and split its contents into overlapping text chunks.

    Args:
        file_path: Path to the PDF file to load.

    Returns:
        The list of document chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # 300-char chunks with 100-char overlap; start indices are recorded
    # so each chunk can be traced back to its position in the source.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    docs = PyPDFLoader(file_path).load()
    return splitter.split_documents(docs)
def get_vectordb(chunks, CHROMA_PATH):
    """Open an existing Chroma vector store, or build one from ``chunks``.

    Args:
        chunks: Document chunks to embed when the store does not yet exist.
        CHROMA_PATH: Store name; resolved under ``../../data/chroma/``.

    Returns:
        A ``Chroma`` vector store backed by OpenAI embeddings.
    """
    # Keep the resolved path in a separate local rather than rebinding
    # the parameter.
    persist_dir = f"../../data/chroma/{CHROMA_PATH}"
    if not os.path.exists(persist_dir):
        # First use: embed the chunks and persist the new store to disk.
        db = Chroma.from_documents(
            chunks, OpenAIEmbeddings(), persist_directory=persist_dir
        )
        db.persist()
        print(f"Saved {len(chunks)} chunks to {persist_dir}.")
        return db
    # Store already exists on disk — just reopen it.
    return Chroma(persist_directory=persist_dir, embedding_function=OpenAIEmbeddings())
def gen_summary(text, db):
    """Summarize a story chunk using retrieval-augmented generation.

    Retrieves the 5 most relevant chunks from ``db`` as context, asks the
    chat model for a 5-7 sentence narrative summary, and parses the
    model's JSON reply.

    Args:
        text: The story chunk to summarize.
        db: A Chroma vector store supporting
            ``similarity_search_with_relevance_scores``.

    Returns:
        dict: The parsed JSON response with ``summary`` and ``text`` keys.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    import json  # local import so this fix is self-contained

    PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
---
Answer the question based on the above context: {question}
"""
    # NOTE: the JSON template previously contained a malformed key
    # ("text: with no closing quote) and a typo ("narrartive"), which
    # instructed the model to emit broken JSON — both fixed here.
    query_text = f"""
Summarize the given chunk from a story. The summary should be of narrative nature and be around 5-7 sentences long.
```{text}```
Generate response in the following JSON format:
{{
    "summary": "Your summary here.",
    "text": "The original text here."
}}
"""
    # Pull the top-5 most relevant chunks to ground the summary.
    results = db.similarity_search_with_relevance_scores(query_text, k=5)
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    model = ChatOpenAI()
    response_text = model.predict(prompt)
    # SECURITY FIX: eval() would execute arbitrary code returned by the
    # model; json.loads only parses data.
    return json.loads(response_text)