shoshana-levitt committed on
Commit
9966ecd
1 Parent(s): 9e2be3a

add app2.py

__pycache__/app.cpython-310.pyc CHANGED
Binary files a/__pycache__/app.cpython-310.pyc and b/__pycache__/app.cpython-310.pyc differ
 
__pycache__/app2.cpython-310.pyc ADDED
Binary file (4.16 kB)
 
app2.py ADDED
@@ -0,0 +1,163 @@
+ from fastapi import FastAPI
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_community.embeddings import OpenAIEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import Chroma
+ from langchain.chains import RetrievalQAWithSourcesChain
+ from langchain_community.chat_models import ChatOpenAI
+ from langchain.prompts.chat import (
+     ChatPromptTemplate,
+     SystemMessagePromptTemplate,
+     HumanMessagePromptTemplate,
+ )
+ import os
+ import chainlit as cl
+ import tempfile
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ app = FastAPI()
+
+ import tiktoken
+ def tiktoken_len(text):
+     # Measure length in gpt-3.5-turbo tokens rather than characters
+     tokens = tiktoken.encoding_for_model("gpt-3.5-turbo").encode(text)
+     return len(tokens)
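+ # For example (token counts come from the cl100k_base encoding that
+ # tiktoken maps to gpt-3.5-turbo; exact values can vary by version):
+ #   tiktoken_len("hello world")  # -> 2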
+
+ # Split the document into chunks
+ text_splitter = RecursiveCharacterTextSplitter(
+     chunk_size=500,    # 500 tokens per chunk; experiment with this value
+     chunk_overlap=50,  # 50 tokens of overlap between chunks; experiment with this value
+     length_function=tiktoken_len,
+ )
+
+ # Load the embeddings model (this import shadows the deprecated
+ # langchain_community OpenAIEmbeddings imported above)
+ from langchain_openai.embeddings import OpenAIEmbeddings
+
+ embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
+
+ from langchain_openai import ChatOpenAI
+ openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")
+
+ from langchain_core.prompts import ChatPromptTemplate
+
+ RAG_PROMPT = """
+ SYSTEM:
+ You are a professional personal assistant.
+
+ CONTEXT:
+ {context}
+
+ QUERY:
+ {question}
+ """
+ rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
+
+ from operator import itemgetter
+ from langchain.schema.output_parser import StrOutputParser
+ from langchain.schema.runnable import RunnablePassthrough
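+ # Note: rag_prompt, openai_chat_model, and the three imports above are not
+ # used again in this file. A minimal sketch of the LCEL chain they would
+ # typically form (hypothetical; assumes a `retriever` is in scope):
+ #
+ #     rag_chain = (
+ #         {"context": itemgetter("question") | retriever,
+ #          "question": itemgetter("question")}
+ #         | rag_prompt
+ #         | openai_chat_model
+ #         | StrOutputParser()
+ #     )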
+
+ @cl.on_chat_start
+ async def init():
+     files = None
+
+     # Wait for the user to upload a file
+     while files is None:
+         files = await cl.AskFileMessage(
+             content="Please upload a file to start chatting!",
+             accept=["application/pdf"],  # accept filters by MIME type
+         ).send()
+
+     file = files[0]
+
+     msg = cl.Message(content=f"Processing `{file.name}`...")
+     await msg.send()
+
+     # Write the upload to disk; delete=False keeps the file around so
+     # PyPDFLoader can reopen it by path
+     with tempfile.NamedTemporaryFile(delete=False) as temp:
+         temp.write(file.content)
+         temp_path = temp.name
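+     # Note: `file.content` assumes an older Chainlit release; newer versions
+     # expose the upload as a file path (`file.path`) instead, which would
+     # make this temp-file step unnecessary.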
+
+     # Load the PDF with PyPDFLoader into a list of documents, one per page,
+     # each holding the page content plus page-number metadata
+     loader = PyPDFLoader(temp_path)
+     docs = loader.load_and_split()
+
+     # Split the documents into chunks (not used below; the plain-text path
+     # that follows is what feeds the vector store)
+     split_chunks = text_splitter.split_documents(docs)
+
+     # Combine the page content into a single string...
+     text = " ".join(page.page_content for page in docs)
+
+     # ...and split that text into chunks
+     texts = text_splitter.split_text(text)
+
+     # Create metadata for each chunk
+     metadatas = [{"source": f"{i}-word"} for i in range(len(texts))]
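+     # The synthetic "{i}-word" labels double as source IDs: the chain
+     # reports them back in its "sources" field, and process_response()
+     # below maps them to the matching chunk text.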
+
+     # Create a Chroma vector store; cl.make_async keeps the blocking
+     # embedding/indexing work off the event loop
+     embeddings = OpenAIEmbeddings()
+     docsearch = await cl.make_async(Chroma.from_texts)(
+         texts, embeddings, metadatas=metadatas
+     )
+
+     # Create a QA chain that retrieves from the Chroma vector store
+     chain = RetrievalQAWithSourcesChain.from_chain_type(
+         ChatOpenAI(temperature=0),
+         chain_type="stuff",
+         retriever=docsearch.as_retriever(),
+     )
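+     # The chain is called with {"question": ...} and returns a dict whose
+     # "answer" and "sources" keys are consumed in process_response() below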
+
+     # Save the metadata and texts in the user session
+     cl.user_session.set("metadatas", metadatas)
+     cl.user_session.set("texts", texts)
+
+     # Let the user know that the system is ready
+     msg.content = f"`{file.name}` processed. You can now ask questions!"
+     await msg.update()
+
+     cl.user_session.set("chain", chain)
+
+ @cl.on_message
+ async def process_response(message):
+     chain = cl.user_session.get("chain")
+
+     if chain is None:
+         await cl.Message(content="The system is not initialized. Please upload a PDF file first.").send()
+         return
+
+     # Use the chain to answer the user's question
+     response = await chain.acall({"question": message.content})
+
+     answer = response["answer"]
+     sources = response["sources"].strip()
+     source_elements = []
+
+     # Get the metadata and texts from the user session
+     metadatas = cl.user_session.get("metadatas")
+     all_sources = [m["source"] for m in metadatas]
+     texts = cl.user_session.get("texts")
+
+     if sources:
+         found_sources = []
+
+         # Attach each cited source chunk to the message
+         for source in sources.split(","):
+             # Normalize the name (the model may append a trailing period)
+             source_name = source.strip().replace(".", "")
+             # Look up the chunk index for this source label
+             try:
+                 index = all_sources.index(source_name)
+             except ValueError:
+                 continue
+             text = texts[index]
+             found_sources.append(source_name)
+             # Create the text element referenced in the message
+             source_elements.append(cl.Text(content=text, name=source_name))
+
+         if found_sources:
+             answer += f"\nSources: {', '.join(found_sources)}"
+         else:
+             answer += "\nNo sources found"
+
+     await cl.Message(content=answer, elements=source_elements).send()
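
To try this locally (assuming Chainlit is installed and OPENAI_API_KEY is set in .env), run `chainlit run app2.py -w` and upload a PDF when prompted. Note that the FastAPI `app` object is created but never given routes in this file.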