whitead committed on
Commit
e19b69e
1 Parent(s): ffb89fc

Updated to enable github repos

Browse files
Files changed (2) hide show
  1. app.py +121 -34
  2. requirements.txt +3 -2
app.py CHANGED
@@ -1,5 +1,12 @@
1
  import gradio as gr
2
  import paperqa
 
 
 
 
 
 
 
3
 
4
 
5
  docs = None
@@ -8,7 +15,7 @@ docs = None
8
  def request_pathname(files):
9
  if files is None:
10
  return [[]]
11
- return [[file.name, file.name.split('/')[-1]] for file in files]
12
 
13
 
14
  def validate_dataset(dataset, openapi):
@@ -25,8 +32,13 @@ def validate_dataset(dataset, openapi):
25
  return "⚠️Waiting for documents and key..."
26
 
27
 
28
- def do_ask(question, button, openapi, dataset, slider, progress=gr.Progress()):
29
- global docs
 
 
 
 
 
30
  docs_ready = dataset.iloc[-1, 0] != ""
31
  if button == "✨Ready✨" and type(openapi) is str and len(openapi) > 0 and docs_ready:
32
  if docs is None: # don't want to rebuild index if it's already built
@@ -35,30 +47,81 @@ def do_ask(question, button, openapi, dataset, slider, progress=gr.Progress()):
35
  docs = paperqa.Docs()
36
  # dataset is pandas dataframe
37
  for _, row in dataset.iterrows():
38
- key = None
39
- if ',' not in row['citation string']:
40
- key = row['citation string']
41
- docs.add(row['filepath'], row['citation string'],
42
- key=key, disable_check=True)
 
43
  else:
44
- return "", "", ""
45
- progress(0, "Building Index...")
46
  docs._build_faiss_index()
47
- progress(0.25, "Querying...")
48
- result = docs.query(question, length_prompt=f'use {slider:d} words')
49
- progress(1.0, "Done!")
 
 
 
 
 
50
  # format the passages
51
- passages = ""
52
  for i, (key, passage) in enumerate(result.passages.items()):
53
  passages += f'{i+1}. {key}\n\n >{passage} \n\n'
54
- return result.formatted_answer, result.context, passages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
 
57
  with gr.Blocks() as demo:
 
 
 
 
58
  gr.Markdown(f"""
59
  # Document Question and Answer (v{paperqa.__version__})
60
 
61
- This tool will enable asking questions of your uploaded text or PDF documents.
 
 
 
62
  It uses OpenAI's GPT models and thus you must enter your API key below. This
63
  tool is under active development and currently uses many tokens - up to 10,000
64
  for a single query. That is $0.10-0.20 per query, so please be careful!
@@ -66,38 +129,60 @@ with gr.Blocks() as demo:
66
  * [PaperQA](https://github.com/whitead/paper-qa) is the code used to build this tool.
67
  * [langchain](https://github.com/hwchase17/langchain) is the main library this tool utilizes.
68
 
69
- ## Instructions
70
-
71
  1. Enter API Key ([What is that?](https://platform.openai.com/account/api-keys))
72
- 2. Upload your documents and modify citation strings if you want (to look prettier)
73
  """)
74
  openai_api_key = gr.Textbox(
75
  label="OpenAI API Key", placeholder="sk-...", type="password")
76
- uploaded_files = gr.File(
77
- label="Your Documents Upload (PDF or txt)", file_count="multiple", )
78
- dataset = gr.Dataframe(
79
- headers=["filepath", "citation string"],
80
- datatype=["str", "str"],
81
- col_count=(2, "fixed"),
82
- interactive=True,
83
- label="Documents and Citations"
84
- )
 
 
 
 
 
 
 
 
 
85
  buildb = gr.Textbox("⚠️Waiting for documents and key...",
86
- label="Status", interactive=False, show_label=True)
 
 
 
 
 
 
87
  openai_api_key.change(validate_dataset, inputs=[
88
  dataset, openai_api_key], outputs=[buildb])
89
  dataset.change(validate_dataset, inputs=[
90
  dataset, openai_api_key], outputs=[buildb])
91
  uploaded_files.change(request_pathname, inputs=[
92
- uploaded_files], outputs=[dataset])
 
 
93
  query = gr.Textbox(
94
  placeholder="Enter your question here...", label="Question")
95
- slider = gr.Slider(25, 200, value=100, step=5,
96
- label='Suggested # of words in answer')
 
 
 
 
 
 
 
97
  ask = gr.Button("Ask Question")
98
  gr.Markdown("## Answer")
99
  answer = gr.Markdown(label="Answer")
100
- with gr.Accordion("Context", open=False):
101
  gr.Markdown(
102
  "### Context\n\nThe following context was used to generate the answer:")
103
  context = gr.Markdown(label="Context")
@@ -107,7 +192,9 @@ with gr.Blocks() as demo:
107
  "### Raw Text\n\nThe following raw text was used to generate the answer:")
108
  passages = gr.Markdown(label="Passages")
109
  ask.click(fn=do_ask, inputs=[query, buildb,
110
- openai_api_key, dataset, slider], outputs=[answer, context, passages])
 
 
111
 
112
  demo.queue(concurrency_count=20)
113
  demo.launch(show_error=True)
 
1
  import gradio as gr
2
  import paperqa
3
+ import pickle
4
+ from pathlib import Path
5
+ import requests
6
+ import zipfile
7
+ import io
8
+ import tempfile
9
+ import os
10
 
11
 
12
  docs = None
 
def request_pathname(files):
    """Convert uploaded Gradio file objects into dataset rows plus doc stats.

    Returns a tuple (rows, stats) matching the two outputs wired to
    ``uploaded_files.change`` (the dataset dataframe and the stats dataframe).
    Each row is a ``[filepath, citation string, key]`` triple; key is None so
    paperqa derives one. stats is ``[[num_files, num_chunks]]`` with chunks
    unknown (0) until the index is built.
    """
    if files is None:
        # Must still yield BOTH outputs (dataset, stats).  The original
        # returned a bare [[]] here, which breaks the two-output handler.
        return [[]], [[0, 0]]
    # file.name is the temp path Gradio stores the upload under; the last
    # path component doubles as a human-readable citation string.
    return [[file.name, file.name.split('/')[-1], None] for file in files], [[len(files), 0]]
19
 
20
 
21
  def validate_dataset(dataset, openapi):
 
32
  return "⚠️Waiting for documents and key..."
33
 
34
 
35
def make_stats(docs):
    """Summarize a paperqa Docs index as ``[[num_docs, total_chunks]]``.

    Shaped for the 2-column "Doc Stats" dataframe; each entry of
    ``docs.doc_previews`` is assumed to carry its chunk count at index 0.
    """
    previews = docs.doc_previews
    chunk_total = sum(preview[0] for preview in previews)
    return [[len(previews), chunk_total]]
37
+
38
+
39
+ # , progress=gr.Progress()):
40
+ def do_ask(question, button, openapi, dataset, length, do_marg, k, max_sources, docs):
41
+ passages = ""
42
  docs_ready = dataset.iloc[-1, 0] != ""
43
  if button == "✨Ready✨" and type(openapi) is str and len(openapi) > 0 and docs_ready:
44
  if docs is None: # don't want to rebuild index if it's already built
 
47
  docs = paperqa.Docs()
48
  # dataset is pandas dataframe
49
  for _, row in dataset.iterrows():
50
+ try:
51
+ docs.add(row['filepath'], row['citation string'],
52
+ key=row['key'], disable_check=True)
53
+ yield "", "", "", docs, make_stats(docs)
54
+ except Exception as e:
55
+ pass
56
  else:
57
+ yield "", "", "", docs, [[0, 0]]
58
+ #progress(0, "Building Index...")
59
  docs._build_faiss_index()
60
+ #progress(0.25, "Querying...")
61
+ for i, result in enumerate(docs.query_gen(question,
62
+ length_prompt=f'use {length:d} words',
63
+ marginal_relevance=do_marg,
64
+ k=k, max_sources=max_sources)):
65
+ #progress(0.25 + 0.1 * i, "Generating Context" + str(i))
66
+ yield result.formatted_answer, result.context, passages, docs, make_stats(docs)
67
+ #progress(1.0, "Done!")
68
  # format the passages
 
69
  for i, (key, passage) in enumerate(result.passages.items()):
70
  passages += f'{i+1}. {key}\n\n >{passage} \n\n'
71
+ yield result.formatted_answer, result.context, passages, docs, make_stats(docs)
72
+
73
+
74
def download_repo(gh_repo, pbar=gr.Progress()):
    """Download a GitHub repo zipball and stage its text files for indexing.

    Generator: yields ``(files, stats)`` tuples matching the (dataset, stats)
    Gradio outputs, where files is a growing list of
    ``[tmp_path, citation_markdown, key]`` rows.

    Raises ValueError when the zipball request does not return HTTP 200.
    """
    # Download the zipped snapshot of the repo's default branch.
    # timeout added: requests.get with no timeout can hang the worker forever.
    r = requests.get(f'https://api.github.com/repos/{gh_repo}/zipball',
                     timeout=30)
    files = []
    if r.status_code == 200:
        pbar(1, 'Downloaded')

        # iterate through files in zip
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            # hoisted: namelist() was being recomputed on every iteration
            names = z.namelist()
            total = len(names)
            for i, f in enumerate(names):
                # skip directories
                if f.endswith('/'):
                    continue
                # try to read as plaintext (skip binary files)
                try:
                    text = z.read(f).decode('utf-8')
                except UnicodeDecodeError:
                    continue
                # skip files bigger than 1MB or smaller than 10 bytes
                if len(text) > 1e6 or len(text) < 10:
                    continue
                # have to save to a temporary file so we have a path for
                # Docs.add.  NOTE(review): delete=False means these temp
                # files are never removed; fine for a short-lived demo but
                # needs a cleanup pass in a long-running deployment.
                with tempfile.NamedTemporaryFile(delete=False) as tmp:
                    tmp.write(text.encode('utf-8'))
                    tmp.flush()
                    path = tmp.name
                # strip off the zipball's leading "<owner>-<repo>-<sha>/" dir
                rel_path = '/'.join(f.split('/')[1:])
                key = os.path.basename(f)
                # NOTE(review): link assumes the default branch is 'main';
                # it will 404 for repos whose default branch is e.g. 'master'.
                citation = f'[{rel_path}](https://github.com/{gh_repo}/tree/main/{rel_path})'
                files.append([path, citation, key])
                yield files, [[len(files), 0]]
                pbar(int((i + 1) / total * 99),
                     f'Added {f}')
        pbar(100, 'Done')
    else:
        raise ValueError('Unknown Github Repo')
111
 
112
 
113
  with gr.Blocks() as demo:
114
+
115
+ docs = gr.State(None)
116
+ openai_api_key = gr.State('')
117
+
118
  gr.Markdown(f"""
119
  # Document Question and Answer (v{paperqa.__version__})
120
 
121
+ *By Andrew White ([@andrewwhite01](https://twitter.com/andrewwhite01))*
122
+
123
+ This tool will enable asking questions of your uploaded text, PDF documents,
124
+ or scrape github repos.
125
  It uses OpenAI's GPT models and thus you must enter your API key below. This
126
  tool is under active development and currently uses many tokens - up to 10,000
127
  for a single query. That is $0.10-0.20 per query, so please be careful!
 
129
  * [PaperQA](https://github.com/whitead/paper-qa) is the code used to build this tool.
130
  * [langchain](https://github.com/hwchase17/langchain) is the main library this tool utilizes.
131
 
 
 
132
  1. Enter API Key ([What is that?](https://platform.openai.com/account/api-keys))
133
+ 2. Upload your documents and modify citation strings if you want (to look prettier in answer)
134
  """)
135
  openai_api_key = gr.Textbox(
136
  label="OpenAI API Key", placeholder="sk-...", type="password")
137
+ with gr.Tab('File Upload'):
138
+ uploaded_files = gr.File(
139
+ label="Your Documents Upload (PDF or txt)", file_count="multiple", )
140
+ with gr.Tab('Github Repo'):
141
+ gh_repo = gr.Textbox(
142
+ label="Github Repo", placeholder="whitead/paper-qa")
143
+ download = gr.Button("Download Repo")
144
+
145
+ with gr.Accordion("See Docs:", open=False):
146
+ dataset = gr.Dataframe(
147
+ headers=["filepath", "citation string", "key"],
148
+ datatype=["str", "str", "str"],
149
+ col_count=(3, "fixed"),
150
+ interactive=True,
151
+ label="Documents and Citations",
152
+ overflow_row_behaviour='paginate',
153
+ max_rows=5
154
+ )
155
  buildb = gr.Textbox("⚠️Waiting for documents and key...",
156
+ label="Status", interactive=False, show_label=True,
157
+ max_lines=1)
158
+ stats = gr.Dataframe(headers=['Docs', 'Chunks'],
159
+ datatype=['number', 'number'],
160
+ col_count=(2, "fixed"),
161
+ interactive=False,
162
+ label="Doc Stats")
163
  openai_api_key.change(validate_dataset, inputs=[
164
  dataset, openai_api_key], outputs=[buildb])
165
  dataset.change(validate_dataset, inputs=[
166
  dataset, openai_api_key], outputs=[buildb])
167
  uploaded_files.change(request_pathname, inputs=[
168
+ uploaded_files], outputs=[dataset, stats])
169
+ download.click(fn=download_repo, inputs=[
170
+ gh_repo], outputs=[dataset, stats])
171
  query = gr.Textbox(
172
  placeholder="Enter your question here...", label="Question")
173
+ with gr.Row():
174
+ length = gr.Slider(25, 200, value=100, step=5,
175
+ label='Words in answer')
176
+ marg = gr.Checkbox(True, label='Max marginal relevance')
177
+ k = gr.Slider(1, 20, value=10, step=1,
178
+ label='Chunks to examine')
179
+ sources = gr.Slider(1, 10, value=5, step=1,
180
+ label='Contexts to include')
181
+
182
  ask = gr.Button("Ask Question")
183
  gr.Markdown("## Answer")
184
  answer = gr.Markdown(label="Answer")
185
+ with gr.Accordion("Context", open=True):
186
  gr.Markdown(
187
  "### Context\n\nThe following context was used to generate the answer:")
188
  context = gr.Markdown(label="Context")
 
192
  "### Raw Text\n\nThe following raw text was used to generate the answer:")
193
  passages = gr.Markdown(label="Passages")
194
  ask.click(fn=do_ask, inputs=[query, buildb,
195
+ openai_api_key, dataset,
196
+ length, marg, k, sources,
197
+ docs], outputs=[answer, context, passages, docs, stats])
198
 
199
  demo.queue(concurrency_count=20)
200
  demo.launch(show_error=True)
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
- paper-qa>=0.0.8
2
- gradio
 
 
1
+ paper-qa>=0.0.17
2
+ gradio
3
+ requests