shoshana-levitt committed
Commit 9966ecd
Parent(s): 9e2be3a

add app2.py

- __pycache__/app.cpython-310.pyc (+0 -0)
- __pycache__/app2.cpython-310.pyc (+0 -0)
- app2.py (+163 -0)

__pycache__/app.cpython-310.pyc CHANGED
Binary files a/__pycache__/app.cpython-310.pyc and b/__pycache__/app.cpython-310.pyc differ

__pycache__/app2.cpython-310.pyc ADDED
Binary file (4.16 kB).

app2.py ADDED
@@ -0,0 +1,163 @@
# Chainlit RAG app: upload a PDF, then chat with it.
# Run locally with: chainlit run app2.py
from fastapi import FastAPI
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI, OpenAIEmbeddings  # supersedes the deprecated langchain_community equivalents
from langchain_core.prompts import ChatPromptTemplate
import chainlit as cl
import tempfile
from dotenv import load_dotenv

# Read OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()

app = FastAPI()  # not used by the Chainlit handlers below
import tiktoken

def tiktoken_len(text):
    tokens = tiktoken.encoding_for_model("gpt-3.5-turbo").encode(text)
    return len(tokens)

# Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,    # 500 tokens per chunk, experiment with this value
    chunk_overlap=50,  # 50 tokens overlap between chunks, experiment with this value
    length_function=tiktoken_len,
)
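
# A quick, illustrative check of the splitter (hypothetical input; exact
# chunk boundaries depend on the text and the tokenizer version):
#
#   chunks = text_splitter.split_text(some_long_string)
#   print(len(chunks), max(tiktoken_len(c) for c in chunks))  # max ~<= 500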

# Default embeddings and chat model. NOTE: init() below builds its own
# OpenAIEmbeddings() and ChatOpenAI(temperature=0) instances, so these
# two objects are currently unused.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")

# Prompt for a custom RAG chain. NOTE: the chain built in init() uses
# RetrievalQAWithSourcesChain's default prompt, so rag_prompt is defined
# but not yet wired in.
RAG_PROMPT = """
SYSTEM:
You are a professional personal assistant.

CONTEXT:
{context}

QUERY:
{question}
"""
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
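
# The three imports above would support an LCEL version of the chain.
# A minimal sketch of how rag_prompt could be used (not active in this
# app; `retriever` stands for the per-session Chroma retriever that
# init() creates below, and the retrieved documents could optionally be
# piped through a formatting function first):
#
#   rag_chain = (
#       {"context": retriever, "question": RunnablePassthrough()}
#       | rag_prompt
#       | openai_chat_model
#       | StrOutputParser()
#   )
#   answer = rag_chain.invoke("What is this document about?")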

@cl.on_chat_start
async def init():
    files = None

    # Wait for the user to upload a file (accept takes MIME types).
    while files is None:
        files = await cl.AskFileMessage(
            content="Please upload a file to start chatting!",
            accept=["application/pdf"],
        ).send()

    file = files[0]

    msg = cl.Message(content=f"Processing `{file.name}`...")
    await msg.send()

    # Write the upload to disk so PyPDFLoader can read it. (file.content
    # holds the raw bytes on Chainlit < 1.0.)
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        temp.write(file.content)
        temp_path = temp.name
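
    # On Chainlit >= 1.0, AskFileResponse no longer carries the bytes;
    # assuming your version exposes file.path, the temp-file step could
    # be replaced with:
    #
    #   temp_path = file.path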

    # Load the PDF with PyPDFLoader into a list of documents, one per
    # page, each with the page number in its metadata.
    loader = PyPDFLoader(temp_path)
    docs = loader.load_and_split()

    # Split the page documents into chunks (currently unused; the
    # text-level split below is what feeds the vector store).
    split_chunks = text_splitter.split_documents(docs)

    # Combine the page content into a single string, then re-split it
    # with the token-aware splitter. Note this drops the per-page
    # metadata; the synthetic sources below stand in for it.
    text = ' '.join([page.page_content for page in docs])
    texts = text_splitter.split_text(text)

    # Create metadata for each chunk, e.g. [{"source": "0-word"},
    # {"source": "1-word"}, ...]; "source" is the key that
    # RetrievalQAWithSourcesChain reports back as a citation.
    metadatas = [{"source": f"{i}-word"} for i in range(len(texts))]

    # Embed the chunks into a Chroma vector store. Chroma.from_texts is
    # blocking, so wrap it in cl.make_async to keep the event loop free.
    embeddings = OpenAIEmbeddings()
    docsearch = await cl.make_async(Chroma.from_texts)(
        texts, embeddings, metadatas=metadatas
    )

    # Create a "stuff" chain that uses the Chroma retriever.
    chain = RetrievalQAWithSourcesChain.from_chain_type(
        ChatOpenAI(temperature=0),
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
    )
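
    # Shape of the chain's output, as consumed by process_response below
    # (a sketch; the values are illustrative):
    #
    #   out = await chain.acall({"question": "..."})
    #   out["answer"]   # the generated answer text
    #   out["sources"]  # comma-separated source ids, e.g. "0-word, 3-word"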

    # Save the metadata and texts in the user session so the message
    # handler can map cited sources back to chunk text.
    cl.user_session.set("metadatas", metadatas)
    cl.user_session.set("texts", texts)

    # Let the user know that the system is ready
    msg.content = f"`{file.name}` processed. You can now ask questions!"
    await msg.update()

    cl.user_session.set("chain", chain)

@cl.on_message
async def process_response(message):
    chain = cl.user_session.get("chain")

    if chain is None:
        await cl.Message(content="The system is not initialized. Please upload a PDF file first.").send()
        return

    # Use the chain to process the user's question
    response = await chain.acall({"question": message.content})
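
    # Illustrative walk-through of the source handling below: if the
    # chain returns sources "0-word, 3-word.", each name is stripped of
    # whitespace and periods, looked up in all_sources, and the matching
    # chunks texts[0] and texts[3] are attached as cl.Text elements.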

    answer = response["answer"]
    sources = response["sources"].strip()
    source_elements = []

    # Get the metadata and texts from the user session
    metadatas = cl.user_session.get("metadatas")
    all_sources = [m["source"] for m in metadatas]
    texts = cl.user_session.get("texts")

    if sources:
        found_sources = []

        # Add the sources to the message
        for source in sources.split(","):
            source_name = source.strip().replace(".", "")
            # Get the index of the source; skip names not in the store
            try:
                index = all_sources.index(source_name)
            except ValueError:
                continue
            text = texts[index]
            found_sources.append(source_name)
            # Create the text element referenced in the message
            source_elements.append(cl.Text(content=text, name=source_name))

        if found_sources:
            answer += f"\nSources: {', '.join(found_sources)}"
        else:
            answer += "\nNo sources found"

    await cl.Message(content=answer, elements=source_elements).send()