CSAle committed on
Commit
f25b2b3
0 Parent(s):

Adding Initial App

Browse files
Files changed (7) hide show
  1. .chainlit/.langchain.db +0 -0
  2. .chainlit/config.toml +29 -0
  3. .gitignore +4 -0
  4. Dockerfile +7 -0
  5. app.py +128 -0
  6. chainlit.md +11 -0
  7. requirements.txt +6 -0
.chainlit/.langchain.db ADDED
Binary file (12.3 kB). View file
 
.chainlit/config.toml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ # Name of the app and chatbot.
3
+ name = "Arxiv Chatbot"
4
+ # Description of the app and chatbot. This is used for HTML tags.
5
+ # description = ""
6
+
7
+ # If true (default), the app will be available to anonymous users (once deployed).
8
+ # If false, users will need to authenticate and be part of the project to use the app.
9
+ public = true
10
+
11
+ # The project ID (found on https://cloud.chainlit.io).
12
+ # If provided, all the message data will be stored in the cloud.
13
+ # The project ID is required when public is set to false.
14
+ #id = ""
15
+
16
+ # Whether to enable telemetry (default: true). No personal data is collected.
17
+ enable_telemetry = false
18
+
19
+ # List of environment variables to be provided by each user to use the app.
20
+ user_env = ["OPENAI_API_KEY"]
21
+
22
+ # Hide the chain of thought details from the user in the UI.
23
+ hide_cot = false
24
+
25
+ # Link to your github repo. This will add a github button in the UI's header.
26
+ # github = ""
27
+
28
+ # Limit the number of requests per user.
29
+ #request_limit = "10 per day"
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ .env
2
+ .vscode
3
+ .chroma
4
+ __pycache__
Dockerfile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# Base image: the tag must be qualified with the official "python" repository,
# otherwise Docker looks for an image literally named "3.8.17-alpine3.18".
FROM python:3.8.17-alpine3.18
# copy the requirements.txt file first to avoid cache invalidations
COPY requirements.txt /app/requirements.txt
WORKDIR /app
RUN pip install -r requirements.txt
COPY . /app
# The Chainlit CLI starts an app through its "run" subcommand.
CMD ["chainlit", "run", "app.py"]
app.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.embeddings.openai import OpenAIEmbeddings
2
+ from langchain.document_loaders import PyMuPDFLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.vectorstores import Chroma
5
+ from langchain.chains import RetrievalQAWithSourcesChain
6
+ from langchain.chat_models import ChatOpenAI
7
+ from langchain.prompts.chat import (
8
+ ChatPromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ HumanMessagePromptTemplate,
11
+ )
12
+ import os
13
+ import arxiv
14
+ import chainlit as cl
15
+ from chainlit import user_session
16
+
17
+ user_env = user_session.get("env")
18
+
19
+ system_template = """Use the following pieces of context to answer the users question.
20
+ If you don't know the answer, just say that you don't know, don't try to make up an answer.
21
+ ALWAYS return a "SOURCES" part in your answer.
22
+ The "SOURCES" part should be a reference to the source of the document from which you got your answer.
23
+
24
+ Example of your response should be:
25
+
26
+ ```
27
+ The answer is foo
28
+
29
+ SOURCES:
30
+ Title: xyz
31
+ Page Number: 1
32
+ URL: https://arxiv.org/abs/X.Y.Z
33
+ ```
34
+
35
+ Begin!
36
+ ----------------
37
+ {summaries}"""
38
+ messages = [
39
+ SystemMessagePromptTemplate.from_template(system_template),
40
+ HumanMessagePromptTemplate.from_template("{question}"),
41
+ ]
42
+ prompt = ChatPromptTemplate.from_messages(messages)
43
+ chain_type_kwargs = {"prompt": prompt}
44
+
45
+
46
@cl.langchain_factory
def init():
    """Build the question-answering chain for a user-chosen arXiv topic.

    Asks the user for a topic, searches arXiv for up to 30 relevant papers,
    loads every PDF, indexes the loaded documents in a Chroma vector store,
    and wires that store into a ``RetrievalQAWithSourcesChain`` driven by
    GPT-4 with the module-level SOURCES prompt.

    Returns:
        The configured ``RetrievalQAWithSourcesChain``.
    """
    # Look the per-user environment up inside the factory rather than relying
    # on a value captured at import time; fall back to an empty mapping so a
    # missing entry yields ``None`` (letting the OpenAI clients use their own
    # default key lookup) instead of raising AttributeError.
    session_env = user_session.get("env") or {}
    openai_api_key = session_env.get("OPENAI_API_KEY")

    # Wait for the user to ask an Arxiv question; the prompt is re-sent each
    # time the 15 second timeout elapses without an answer.
    arxiv_query = None
    while arxiv_query is None:
        arxiv_query = cl.AskUserMessage(
            content="Please enter a topic to begin!", timeout=15
        ).send()

    topic = arxiv_query["content"]

    # Obtain the top 30 results from Arxiv for the query
    search = arxiv.Search(
        query=topic,
        max_results=30,
        sort_by=arxiv.SortCriterion.Relevance,
    )

    # Download each of the pdfs and tag every loaded document with the
    # paper-level metadata that the response post-processor reads back.
    pdf_data = []
    for result in search.results():
        loaded_pdf = PyMuPDFLoader(result.pdf_url).load()

        for document in loaded_pdf:
            document.metadata["source"] = result.entry_id
            document.metadata["file_path"] = result.pdf_url
            document.metadata["title"] = result.title
            pdf_data.append(document)

    # Create a Chroma vector store, embedding with the same user-supplied key
    # that the chat model uses.
    embeddings = OpenAIEmbeddings(
        disallowed_special=(),
        openai_api_key=openai_api_key,
    )
    docsearch = Chroma.from_documents(pdf_data, embeddings)

    # Create a chain that uses the Chroma vector store. chain_type_kwargs
    # carries the custom prompt so the model is actually instructed to emit
    # the SOURCES section.
    chain = RetrievalQAWithSourcesChain.from_chain_type(
        ChatOpenAI(
            model_name="gpt-4",
            temperature=0,
            openai_api_key=openai_api_key,
        ),
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        return_source_documents=True,
        chain_type_kwargs=chain_type_kwargs,
    )

    # Let the user know that the system is ready
    cl.Message(
        content=f"We found a few papers about `{topic}` you can now ask questions!"
    ).send()

    return chain
98
+
99
+
100
@cl.langchain_postprocess
def process_response(res):
    """Send the chain's answer together with one inline element per cited paper.

    Args:
        res: Chain output; reads ``res["answer"]`` and
            ``res["source_documents"]``. Each source document's metadata must
            provide ``"title"``, ``"page"`` and ``"file_path"``.
    """
    answer = res["answer"]

    # Group the retrieved documents by paper title, collecting the page value
    # of every document that belongs to the same paper.
    source_elements_dict = {}
    for source in res["source_documents"]:
        title = source.metadata["title"]

        if title not in source_elements_dict:
            source_elements_dict[title] = {
                "page_number": [],
                "url": source.metadata["file_path"],
            }

        source_elements_dict[title]["page_number"].append(source.metadata["page"])

    source_elements = []
    for title, source in source_elements_dict.items():
        # Sort once per paper, at formatting time, instead of after every
        # append, then create a string for the page numbers.
        page_numbers = ", ".join(str(x) for x in sorted(source["page_number"]))
        text_for_source = f"Page Number(s): {page_numbers}\nURL: {source['url']}"
        source_elements.append(
            cl.Text(name=title, text=text_for_source, display="inline")
        )

    cl.Message(content=answer, elements=source_elements).send()
chainlit.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ⚠️ Warning ⚠️
2
+
3
+ You will need a GPT-4 API key to use this app due to large context size!
4
+
5
+ # Welcome to AskArxiv powered by Chainlit!
6
+
7
+ In this app, you'll be able to enter a topic - and then ask ~30 papers from Arxiv about that topic!
8
+
9
+ ### Link To Demo
10
+
11
+ [Hugging Face Space]()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
arxiv==1.4.7
langchain==0.0.193
chainlit
openai
chromadb
tiktoken
# Needed by langchain's PyMuPDFLoader, which app.py uses to load the PDFs.
pymupdf