Tahnik fedor-ch committed on
Commit
4985c2b
0 Parent(s):

Duplicate from fedor-ch/langchain-ynp-test

Browse files

Co-authored-by: Chemashkinf <fedor-ch@users.noreply.huggingface.co>

Files changed (4) hide show
  1. .gitattributes +34 -0
  2. README.md +13 -0
  3. app.py +287 -0
  4. requirements.txt +8 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: "Chat with PDF •\_OpenAI"
3
+ emoji: 📄🤖
4
+ colorFrom: purple
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 3.27.0
8
+ python_version: 3.10.9
9
+ app_file: app.py
10
+ pinned: false
11
+ duplicated_from: fedor-ch/langchain-ynp-test
12
+ ---
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import time
4
+
5
+ from langchain.document_loaders import OnlinePDFLoader
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain.llms import OpenAI
8
+ from langchain.embeddings import OpenAIEmbeddings
9
+ from langchain.vectorstores import Chroma
10
+ from langchain.chains import ConversationalRetrievalChain
11
+ from langchain import PromptTemplate
12
+ from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
13
+ import requests
14
+ from PIL import Image
15
+ import torch
16
+
17
+
18
+
19
+ # _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
20
+ # Chat History:
21
+ # {chat_history}
22
+ # Follow Up Input: {question}
23
+ # Standalone question:"""
24
+
25
+ # CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
26
+
27
+ # template = """
28
+ # You are given the following extracted parts of a long document and a question. Provide a short structured answer.
29
+ # If you don't know the answer, look on the web. Don't try to make up an answer.
30
+ # Question: {question}
31
+ # =========
32
+ # {context}
33
+ # =========
34
+ # Answer in Markdown:"""
35
+
36
# Example chart images saved next to the app at start-up, keyed by the local
# file name they are written to. Nothing in the visible code reads them back,
# so a failed download must not stop the app from starting.
_EXAMPLE_CHARTS = {
    'chart_example.png': 'https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/val/png/20294671002019.png',
    'chart_example_2.png': 'https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/test/png/multi_col_1081.png',
    'chart_example_3.png': 'https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/test/png/18143564004789.png',
    'chart_example_4.png': 'https://sharkcoder.com/files/article/matplotlib-bar-plot.png',
}

for _chart_file, _chart_url in _EXAMPLE_CHARTS.items():
    try:
        torch.hub.download_url_to_file(_chart_url, _chart_file)
    except OSError as exc:
        # Best effort: network/HTTP failures surface as OSError subclasses.
        print(f"Could not download example chart {_chart_file}: {exc}")


# Chart question-answering model; `chart_qa` reads `model`, `processor` and
# `device` as module-level globals.
model_name = "google/matcha-chartqa"
model = Pix2StructForConditionalGeneration.from_pretrained(model_name)
processor = Pix2StructProcessor.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
47
+
48
def filter_output(output: str) -> str:
    """Return *output* with every literal ``<0x0A>`` marker removed."""
    return output.replace("<0x0A>", "")
50
+
51
def chart_qa(image, question):
    """Answer *question* about the chart in *image*.

    Uses the module-level ``processor``, ``model`` and ``device``: the inputs
    are encoded, up to 512 new tokens are generated, and the first decoded
    prediction is passed through ``filter_output``.

    Args:
        image: The chart image, or ``None`` when nothing was uploaded.
        question: The question text.

    Returns:
        The decoded answer, or a short hint when no image was provided.
    """
    # Without this guard a missing image goes straight into the processor
    # and the click handler fails with an exception.
    if image is None:
        return "Please upload a chart image first."

    inputs = processor(images=image, text=question, return_tensors="pt").to(device)
    predictions = model.generate(**inputs, max_new_tokens=512)
    decoded = processor.decode(predictions[0], skip_special_tokens=True)
    return filter_output(decoded)
55
+
56
def loading_pdf() -> str:
    """Return the placeholder status text shown while a PDF is being loaded."""
    return "Loading..."
58
+
59
+
60
def pdf_changes(pdf_doc, open_ai_key):
    """Index an uploaded PDF and build the conversational retrieval chain.

    The document is loaded, split into 1000-character chunks without overlap,
    embedded into a Chroma store, and the resulting chain is published as the
    module-level ``qa`` that ``infer`` calls.

    Args:
        pdf_doc: Uploaded file object; only its ``.name`` path is read.
        open_ai_key: OpenAI API key; exported as ``OPENAI_API_KEY``.

    Returns:
        A status string: ``"Ready"`` on success, otherwise a hint about what
        is missing.
    """
    global qa

    # A blank textbox is an empty string rather than None, so test for
    # emptiness instead of identity with None.
    if not open_ai_key or not open_ai_key.strip():
        return "You forgot OpenAI API key"
    # Without an upload there is no `.name` to read.
    if pdf_doc is None:
        return "Upload a PDF first"

    os.environ['OPENAI_API_KEY'] = open_ai_key.strip()

    documents = OnlinePDFLoader(pdf_doc.name).load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)

    db = Chroma.from_documents(texts, OpenAIEmbeddings())
    qa = ConversationalRetrievalChain.from_llm(
        llm=OpenAI(temperature=0.5),
        retriever=db.as_retriever(),
        return_source_documents=True,
    )
    return "Ready"
78
+
79
def add_text(history, text):
    """Append a pending user turn to the chat history.

    Args:
        history: Existing list of ``(user, bot)`` pairs; ``None`` is treated
            as an empty history. The input list is not mutated.
        text: The user's message.

    Returns:
        A ``(new_history, "")`` tuple; the empty string clears the input box.
    """
    updated = list(history or [])
    updated.append((text, None))
    return updated, ""
82
+
83
def bot(history):
    """Stream the answer to the latest user message one character at a time.

    Args:
        history: Chat history whose last entry holds the pending user message
            in position 0.

    Yields:
        The history after each appended character, so the chat window shows a
        typing effect.
    """
    user_message = history[-1][0]
    response = infer(user_message, history)

    # Rebuild the last row as a list: `add_text` creates it as a tuple, which
    # cannot be assigned to in place.
    history[-1] = [user_message, ""]

    if not response:
        # Still emit one update so the pending turn is shown as answered.
        yield history
        return

    for character in response:
        history[-1][1] += character
        time.sleep(0.05)
        yield history
91
+
92
+
93
def infer(question, history):
    """Ask the module-level ``qa`` chain a question with prior chat context.

    Args:
        question: The user's current question.
        history: Full chat history; every entry except the last (the pending
            turn) is forwarded as ``chat_history``.

    Returns:
        The chain's ``"answer"`` value, or a hint when no chain exists yet.
    """
    # `qa` is only created by `pdf_changes`; looking it up through globals()
    # avoids a NameError when a question arrives before any PDF was loaded.
    chain = globals().get("qa")
    if chain is None:
        return "Please load a PDF first."

    chat_history = [(human, ai) for human, ai in history[:-1]]
    result = chain({"question": question, "chat_history": chat_history})
    return result["answer"]
105
+
106
# Style rule for the element whose elem_id is "col-container".
css = """
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""

# HTML header rendered at the top of the page.
title = """
<div style="text-align: center;">
<h1>YnP LangChain Test </h1>
<p style="text-align: center;">Please specify OpenAI Key before use</p>
</div>
"""
116
+
117
+
118
+ # with gr.Blocks(css=css) as demo:
119
+ # with gr.Column(elem_id="col-container"):
120
+ # gr.HTML(title)
121
+
122
+ # with gr.Column():
123
+ # openai_key = gr.Textbox(label="You OpenAI API key", type="password")
124
+ # pdf_doc = gr.File(label="Load a pdf", file_types=['.pdf'], type="file")
125
+ # with gr.Row():
126
+ # langchain_status = gr.Textbox(label="Status", placeholder="", interactive=False)
127
+ # load_pdf = gr.Button("Load pdf to langchain")
128
+
129
+ # chatbot = gr.Chatbot([], elem_id="chatbot").style(height=350)
130
+ # question = gr.Textbox(label="Question", placeholder="Type your question and hit Enter ")
131
+ # submit_btn = gr.Button("Send Message")
132
+
133
+ # load_pdf.click(loading_pdf, None, langchain_status, queue=False)
134
+ # load_pdf.click(pdf_changes, inputs=[pdf_doc, openai_key], outputs=[langchain_status], queue=False)
135
+ # question.submit(add_text, [chatbot, question], [chatbot, question]).then(
136
+ # bot, chatbot, chatbot
137
+ # )
138
+ # submit_btn.click(add_text, [chatbot, question], [chatbot, question]).then(
139
+ # bot, chatbot, chatbot)
140
+
141
+ # demo.launch()
142
+
143
+
144
+ """functions"""
145
+
146
def load_file() -> str:
    """Return the placeholder status text shown while a table file is loading."""
    return "Loading..."
148
+
149
def load_xlsx(name):
    """Read the Excel workbook at path *name* with ``pandas.read_excel``.

    Args:
        name: Path of the workbook to read.

    Returns:
        Whatever ``pandas.read_excel`` returns for that path.
    """
    import pandas as pd

    # The path is passed straight through; wrapping it in a raw f-string
    # only re-formatted the same value.
    return pd.read_excel(name)
155
+
156
def table_loader(table_file, open_ai_key):
    """Load a spreadsheet and build the pandas dataframe agent.

    On success the agent is published as the module-level ``agent`` that
    ``run`` calls.

    Args:
        table_file: Uploaded file object; only its ``.name`` path is read.
        open_ai_key: OpenAI API key; exported as ``OPENAI_API_KEY``.

    Returns:
        A status string: ``"Ready!"`` on success, otherwise a hint about what
        is missing or unsupported.
    """
    import os

    global agent

    # A blank textbox is an empty string rather than None, so test for
    # emptiness instead of identity with None.
    if not open_ai_key or not open_ai_key.strip():
        return "Enter API"
    # Without an upload there is no `.name` to read.
    if table_file is None:
        return "Upload a table file first!"

    file_name = table_file.name
    # Compare case-insensitively so "REPORT.XLSX" is accepted as well.
    lowered = file_name.lower()
    if lowered.endswith(('.xlsx', '.xls')):
        read_table = load_xlsx
    elif lowered.endswith('.csv'):
        from pandas import read_csv as read_table
    else:
        return "Wrong file format! Upload excel file or csv!"

    # Imported only after validation so the cheap failure paths stay cheap.
    from langchain.llms import OpenAI
    from langchain.agents import create_pandas_dataframe_agent

    os.environ['OPENAI_API_KEY'] = open_ai_key.strip()
    agent = create_pandas_dataframe_agent(OpenAI(temperature=0), read_table(file_name))
    return "Ready!"
178
+
179
def run(query):
    """Run *query* through the module-level ``agent`` and report its cost.

    Args:
        query: Question about the loaded table.

    Returns:
        The agent's response followed by the total cost reported by the
        OpenAI callback, or a hint when no agent exists yet.
    """
    # `agent` is only created by `table_loader`; looking it up through
    # globals() avoids a NameError when a question arrives first.
    active_agent = globals().get("agent")
    if active_agent is None:
        return "Load a csv or xlsx file first"

    from langchain.callbacks import get_openai_callback

    with get_openai_callback() as cb:
        response = active_agent.run(query)
        costs = f"Total Cost (USD): ${cb.total_cost}"
    return f'{response} \n {costs}'
187
+
188
def respond(message, chat_history):
    """Answer *message* via ``run`` and append the exchange to the history.

    Args:
        message: The user's question.
        chat_history: List of ``(user, bot)`` pairs; ``None`` is treated as an
            empty history.

    Returns:
        A ``("", chat_history)`` tuple; the empty string clears the input box.
    """
    if chat_history is None:
        chat_history = []

    # Nothing to ask: leave the conversation untouched.
    if not message or not message.strip():
        return "", chat_history

    # The answer is appended as soon as it is available; the previous fixed
    # half-second sleep only delayed the reply.
    chat_history.append((message, run(message)))
    return "", chat_history
195
+
196
+
197
# Three-tab UI: PDFs, Spreadsheets and Charts, sharing one OpenAI key field.
# `css=css` activates the module-level rule for the "col-container" element,
# which was defined but never handed to Blocks.
# NOTE(review): the original indentation is not recoverable from the visible
# text; the nesting below (tabs directly under Blocks) is a reconstruction --
# confirm against the intended layout.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)
        key = gr.Textbox(
            show_label=False,
            placeholder="Your OpenAI key",
            type='password',
        ).style(container=False)

    # PDF processing tab
    with gr.Tab("PDFs"):

        with gr.Row():

            with gr.Column(scale=0.5):
                langchain_status = gr.Textbox(label="Status", placeholder="", interactive=False)
                load_pdf = gr.Button("Load pdf to langchain")

            with gr.Column(scale=0.5):
                pdf_doc = gr.File(label="Load a pdf", file_types=['.pdf'], type="file")

        with gr.Row():

            with gr.Column(scale=1):
                chatbot = gr.Chatbot([], elem_id="chatbot").style(height=350)

        with gr.Row():

            with gr.Column(scale=0.85):
                question = gr.Textbox(
                    show_label=False,
                    placeholder="Enter text and press enter, or upload an image",
                ).style(container=False)

            with gr.Column(scale=0.15, min_width=0):
                # Own name so the spreadsheet tab's button cannot overwrite it.
                clr_btn_pdf = gr.Button("Clear!")

        load_pdf.click(loading_pdf, None, langchain_status, queue=False)
        load_pdf.click(pdf_changes, inputs=[pdf_doc, key], outputs=[langchain_status], queue=True)
        question.submit(add_text, [chatbot, question], [chatbot, question]).then(
            bot, chatbot, chatbot
        )
        # This button previously had no handler at all.
        clr_btn_pdf.click(lambda: None, None, chatbot, queue=False)

    # XLSX and CSV processing tab
    with gr.Tab("Spreadsheets"):
        with gr.Row():

            with gr.Column(scale=0.5):
                status_sh = gr.Textbox(label="Status", placeholder="", interactive=False)
                load_table = gr.Button("Load csv|xlsx to langchain")

            with gr.Column(scale=0.5):
                # One list entry per extension; the former single string
                # '.csv, xlsx, xls' was not a usable extension filter.
                raw_table = gr.File(
                    label="Load a table file (xls or csv)",
                    file_types=['.csv', '.xlsx', '.xls'],
                    type="file",
                )

        with gr.Row():

            with gr.Column(scale=1):
                chatbot_sh = gr.Chatbot([], elem_id="chatbot").style(height=350)

        with gr.Row():

            with gr.Column(scale=0.85):
                question_sh = gr.Textbox(
                    show_label=False,
                    placeholder="Enter text and press enter, or upload an image",
                ).style(container=False)

            with gr.Column(scale=0.15, min_width=0):
                clr_btn = gr.Button("Clear!")

        load_table.click(load_file, None, status_sh, queue=False)
        load_table.click(table_loader, inputs=[raw_table, key], outputs=[status_sh], queue=False)

        question_sh.submit(respond, [question_sh, chatbot_sh], [question_sh, chatbot_sh])
        clr_btn.click(lambda: None, None, chatbot_sh, queue=False)

    # Chart question-answering tab
    with gr.Tab("Charts"):
        image = gr.Image(type="pil", label="Chart")
        # Separate name so the PDF tab's `question` textbox is not rebound.
        chart_question = gr.Textbox(label="Question")
        load_chart = gr.Button("Load chart and question!")
        answer = gr.Textbox(label="Model Output")

        load_chart.click(chart_qa, [image, chart_question], answer)


demo.queue(concurrency_count=3)
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ openai
2
+ tiktoken
3
+ chromadb
4
+ langchain
5
+ unstructured
6
+ unstructured[local-inference]
7
+ pandas
8
+ tabulate