Omnibus committed on
Commit
9c8a515
1 Parent(s): 7d04725

Create app.py

Files changed (1)
  1. app.py +543 -0
app.py ADDED
@@ -0,0 +1,543 @@
import gradio as gr
#import urllib.request
import requests
import bs4
import lxml
import os
#import subprocess
from huggingface_hub import InferenceClient, HfApi
import random
import json
import datetime
from pypdf import PdfReader
import uuid
#from query import tasks
from agent import (
    PREFIX,
    SAVE_MEMORY,
    COMPRESS_DATA_PROMPT,
    COMPRESS_DATA_PROMPT_SMALL,
    LOG_PROMPT,
    LOG_RESPONSE,
)

client = InferenceClient(
    "mistralai/Mixtral-8x7B-Instruct-v0.1"
)

reponame = "Omnibus/tmp"
save_data = f'https://huggingface.co/datasets/{reponame}/raw/main/'
token_self = os.environ['HF_TOKEN']
api = HfApi(token=token_self)

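# Assumption: HF_TOKEN holds a write-enabled token for the Omnibus/tmp dataset
# repo, and the local `agent` module supplies the prompt templates imported above.
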
def find_all(url):
    # Fetch a web page and return (success, payload): the raw page text plus
    # metadata for every anchor tag found
    print(url)
    print(f"trying URL:: {url}")
    try:
        if not url:
            return False, "Enter Valid URL"
        out = []
        source = requests.get(url)
        if source.status_code == 200:
            soup = bs4.BeautifulSoup(source.content, 'lxml')
            rawp = f'RAW TEXT RETURNED: {soup.text}'
            out.append(rawp)
            out.append("HTML fragments: ")
            # Candidate tags; only anchors are extracted below
            q = ("a", "p", "span", "content", "article")
            for p in soup.find_all("a"):
                out.append([{"LINK TITLE": p.get('title'), "URL": p.get('href'), "STRING": p.string}])
            out = str(out)
            rl = len(out)
            print(f'rl:: {rl}')
            # Rough size estimate: count common separator characters
            c = 0
            for i in out:
                if i in (" ", ",", "\n", "/", ".", "<"):
                    c += 1
            print(f'c:: {c}')
            return True, out
        else:
            return False, "Enter Valid URL"
    except Exception as e:
        print(e)
        return False, f'Error: {e}'

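# Usage sketch (hypothetical URL), showing the (success, payload) contract:
#   ok, payload = find_all("https://example.com")
#   if ok:
#       print(payload[:200])  # stringified raw text plus anchor metadata
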
def read_txt(txt_path):
    # Read a plain-text file and return its contents
    text = ""
    with open(txt_path, "r") as f:
        text = f.read()
    print(text)
    return text

def read_pdf(pdf_path):
    # Extract the text of every page of a local PDF
    text = ""
    reader = PdfReader(f'{pdf_path}')
    number_of_pages = len(reader.pages)
    for i in range(number_of_pages):
        page = reader.pages[i]
        text = f'{text}\n{page.extract_text()}'
    print(text)
    return text

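# e.g. read_pdf("report.pdf") returns the text of every page, newline-joined
# ("report.pdf" is a hypothetical local file).
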
error_box = []

def read_pdf_online(url):
    # Download a PDF over HTTP and extract its text; failing URLs are
    # collected in error_box
    uid = uuid.uuid4()
    print(f"reading {url}")
    response = requests.get(url, stream=True)
    print(response.status_code)
    text = ""
    try:
        if response.status_code == 200:
            with open("test.pdf", "wb") as f:
                f.write(response.content)
            reader = PdfReader("test.pdf")
            number_of_pages = len(reader.pages)
            print(number_of_pages)
            for i in range(number_of_pages):
                page = reader.pages[i]
                text = f'{text}\n{page.extract_text()}'
            print(f"PDF_TEXT:: {text}")
            return text
        else:
            text = response.status_code
            error_box.append(url)
            print(text)
            return text
    except Exception as e:
        print(e)
        return e

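# Caveat: downloads always land in the fixed path "test.pdf", so concurrent
# requests could overwrite each other; the generated uid is not yet used to
# make that temp file unique.
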
VERBOSE = True
MAX_HISTORY = 100
MAX_DATA = 20000

def format_prompt(message, history):
    # Build a Mixtral-style instruction prompt from prior chat turns
    # (currently unused by the app)
    prompt = "<s>"
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt} [/INST]"
        prompt += f" {bot_response}</s> "
    prompt += f"[INST] {message} [/INST]"
    return prompt

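# Illustrative output (hypothetical turns):
#   format_prompt("next?", [("hi", "hello")])
#   -> "<s>[INST] hi [/INST] hello</s> [INST] next? [/INST]"
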
def run_gpt(
    prompt_template,
    stop_tokens,
    max_tokens,
    seed,
    **prompt_kwargs,
):
    # Stream a completion from the Mixtral endpoint and return the full text
    print(seed)
    timestamp = datetime.datetime.now()

    generate_kwargs = dict(
        temperature=0.9,
        max_new_tokens=max_tokens,
        top_p=0.95,
        repetition_penalty=1.0,
        do_sample=True,
        seed=seed,
    )

    content = PREFIX.format(
        timestamp=timestamp,
        purpose="Compile the provided data and complete the user's task"
    ) + prompt_template.format(**prompt_kwargs)
    if VERBOSE:
        print(LOG_PROMPT.format(content))

    stream = client.text_generation(content, **generate_kwargs, stream=True, details=True, return_full_text=False)
    resp = ""
    for response in stream:
        resp += response.token.text

    if VERBOSE:
        print(LOG_RESPONSE.format(resp))
    return resp

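# Minimal call sketch (field names assume the template accepts them):
#   resp = run_gpt(COMPRESS_DATA_PROMPT_SMALL,
#                  stop_tokens=["observation:"], max_tokens=1024, seed=42,
#                  direction="tl;dr", knowledge="", history="...data...")
# Note: stop_tokens is accepted but not currently forwarded to text_generation.
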
def compress_data(c, instruct, history):
    # Split the input into MAX_DATA-sized chunks and summarize each chunk
    # independently, returning a list of chunk summaries
    seed = random.randint(1, 1000000000)

    print(c)
    divr = int(c) / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(int(c) / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    out = []
    s = 0
    e = chunk
    print(f'e:: {e}')
    new_history = ""
    for z in range(divi):
        print(f's:e :: {s}:{e}')
        hist = history[s:e]
        resp = run_gpt(
            COMPRESS_DATA_PROMPT_SMALL,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge="",
            history=hist,
        )
        out.append(resp)
        print(resp)
        e = e + chunk
        s = s + chunk
    return out

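# Worked example of the chunking math: c=50000 with MAX_DATA=20000 gives
# divr=2.5, divi=3 passes, and chunk=int(50000/2.5)=20000 characters per slice.
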
def compress_data_og(c, instruct, history):
    # Chunked summarization with rolling context: each chunk's summary is fed
    # into the next call as `knowledge`, and only the final summary is returned
    seed = random.randint(1, 1000000000)

    print(c)
    divr = int(c) / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(int(c) / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    s = 0
    e = chunk
    print(f'e:: {e}')
    new_history = ""
    for z in range(divi):
        print(f's:e :: {s}:{e}')
        hist = history[s:e]
        resp = run_gpt(
            COMPRESS_DATA_PROMPT,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge=new_history,
            history=hist,
        )
        new_history = resp
        print(resp)
        e = e + chunk
        s = s + chunk
    print("final" + resp)
    return resp

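# Design note: compress_data returns one summary per chunk as a list, while
# compress_data_og threads each chunk's summary into the next call via
# `knowledge` and returns only the final rolling summary.
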
def summarize(inp, history, report_check, sum_mem_check, data=None, files=None, url=None, pdf_url=None, pdf_batch=None):
    # Main pipeline: collect text from whichever source was supplied (pasted
    # text, files, raw HTML URL, PDF URL, or a batch of PDF URLs), then
    # summarize it, optionally compile a report, or save it to memory
    json_box = []
    json_out = json_box  # default JSON output when no summary is produced
    rawp = "Provide a valid data source"  # default reply
    if inp == "":
        inp = "Process this data"
    history.clear()
    history = [(inp, "Working on it...")]
    yield "", history, error_box, json_box

    if pdf_batch.startswith("http"):
        # Count commas to learn how many URLs the batch contains
        c = 0
        data = ""
        for i in str(pdf_batch):
            if i == ",":
                c += 1
        print(f'c:: {c}')
        try:
            for i in range(c + 1):
                batch_url = pdf_batch.split(",", c)[i]
                bb = read_pdf_online(batch_url)
                data = f'{data}\nFile Name URL ({batch_url}):\n{bb}'
        except Exception as e:
            print(e)
    if pdf_url.startswith("http"):
        print("PDF_URL")
        out = read_pdf_online(pdf_url)
        data = out
    if url.startswith("http"):
        val, out = find_all(url)
        if not val:
            data = "Error"
            rawp = str(out)
        else:
            data = out
    if files:
        for i, file in enumerate(files):
            try:
                print(file)
                if file.endswith(".pdf"):
                    zz = read_pdf(file)
                    print(zz)
                    data = f'{data}\nFile Name ({file}):\n{zz}'
                elif file.endswith(".txt"):
                    zz = read_txt(file)
                    print(zz)
                    data = f'{data}\nFile Name ({file}):\n{zz}'
            except Exception as e:
                data = f'{data}\nError opening File Name ({file})'
                print(e)
    if data is not None and data != "Error" and data != "":
        print(inp)
        out = str(data)
        rl = len(out)
        print(f'rl:: {rl}')
        c = 1
        for i in str(out):
            if i == " " or i == "," or i == "\n":
                c += 1
        print(f'c:: {c}')
        if sum_mem_check == "Memory":
            save_memory(inp, out)
            rawp = "Complete"
        if sum_mem_check == "Summarize":
            json_out = compress_data(c, inp, out)
            out = str(json_out)
            if report_check:
                rl = len(out)
                print(f'rl:: {rl}')
                c = 1
                for i in str(out):
                    if i == " " or i == "," or i == "\n":
                        c += 1
                print(f'c2:: {c}')
                rawp = compress_data_og(c, inp, out)
            else:
                rawp = out
    else:
        rawp = "Provide a valid data source"
    history.clear()
    history.append((inp, rawp))
    yield "", history, error_box, json_out
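
# summarize() is a generator, so Gradio streams each yielded
# (prompt, chatbot, error_box, json) tuple to the UI as it is produced.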

# Note: this redefinition overrides the SAVE_MEMORY template imported from agent
SAVE_MEMORY = """
You are attempting to complete the task
task: {task}
Data:
{history}
Instructions:
Compile and categorize the data above into a JSON dictionary string
Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy-to-search JSON format
Your final response should be only the final formatted JSON string enclosed in brackets, and nothing else.
Required keys:
"keywords":["short", "list", "of", "keywords", "relevant", "to", "this", "entry"]
"title":"title of entry"
"description":"description of entry"
"content":"full content of data about entry"
"url":"https://url.source"
"""

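# Illustrative shape of the JSON the model is asked to emit:
# [{"keywords": ["example"], "title": "...", "description": "...",
#   "content": "...", "url": "https://url.source"}]
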
def save_memory(purpose, history):
    # Index the gathered data into keyword-tagged JSON files and push them to
    # the dataset repo, updating mem-test2/main.json as the keyword index
    uid = uuid.uuid4()
    history = str(history)
    c = 0
    inp = str(history)
    rl = len(inp)
    print(f'rl:: {rl}')
    for i in str(inp):
        if i == " " or i == "," or i == "\n" or i == "/" or i == "." or i == "<":
            c += 1
    print(f'c:: {c}')

    seed = random.randint(1, 1000000000)

    print(c)
    divr = int(c) / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(int(c) / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    s = 0
    e = chunk
    print(f'e:: {e}')
    new_history = ""
    task = f'Index this Data\n'
    for z in range(divi):
        print(f's:e :: {s}:{e}')
        hist = inp[s:e]
        resp = run_gpt(
            SAVE_MEMORY,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=4096,
            seed=seed,
            purpose=purpose,
            task=task,
            history=hist,
        ).strip('\n')
        print(resp)
        e = e + chunk
        s = s + chunk
    print("final1" + resp)
    try:
        # Trim everything before the first '[{' and anything after a stray '</s>'
        resp = '[{' + resp.split('[{')[1].split('</s>')[0]
        print("final2\n" + resp)
    except Exception as e:
        print(e)
    timestamp = str(datetime.datetime.now())
    timename = timestamp.replace(" ", "--").replace(":", "-").replace(".", "-")
    json_object = resp
    with open(f"tmp-{uid}.json", "w") as outfile:
        outfile.write(json_object)
    api.upload_file(
        path_or_fileobj=f"tmp-{uid}.json",
        path_in_repo=f"/mem-test2/{timename}.json",
        repo_id=reponame,
        token=token_self,
        repo_type="dataset",
    )
    lines = resp.strip().strip("\n").split("\n")
    r = requests.get(f'{save_data}mem-test2/main.json')
    print(f'status code main:: {r.status_code}')
    if r.status_code == 200:
        lod = json.loads(r.text)
        print(f'lod:: {lod}')
    else:
        lod = []
    for i, line in enumerate(lines):
        key_box = []
        print(f'LINE:: {line}')
        if ":" in line:
            print(f'line:: {line}')
            if "keywords" in line[:16]:
                print(f'trying:: {line}')
                keyw = line.split(":")[1]
                print(keyw)
                print(keyw.split("[")[1].split("]")[0])
                keyw = keyw.split("[")[1].split("]")[0]
                for ea in keyw.split(","):
                    s1 = ""
                    ea = ea.strip().strip("\n")
                    # Keep only alphanumerics and spaces from each keyword
                    for ev in ea:
                        if ev.isalnum() or ev == " ":
                            s1 += ev
                    print(s1)
                    key_box.append(s1)
                lod.append({"file_name": timename, "keywords": key_box})
    json_object = json.dumps(lod, indent=4)
    with open(f"tmp2-{uid}.json", "w") as outfile2:
        outfile2.write(json_object)
    api.upload_file(
        path_or_fileobj=f"tmp2-{uid}.json",
        path_in_repo=f"/mem-test2/main.json",
        repo_id=reponame,
        token=token_self,
        repo_type="dataset",
    )

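# Net effect: the dataset repo gains one timestamped JSON file per save plus
# mem-test2/main.json, a keyword index mapping file_name -> keywords.
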
#################################
def clear_fn():
    return "", [(None, None)]

with gr.Blocks() as app:
    gr.HTML("""<center><h1>Mixtral 8x7B TLDR Summarizer + Web</h1><h3>Summarize Data of unlimited length</h3></center>""")
    chatbot = gr.Chatbot(label="Mixtral 8x7B Chatbot", show_copy_button=True)
    with gr.Row():
        with gr.Column(scale=3):
            prompt = gr.Textbox(label="Instructions (optional)")
        with gr.Column(scale=1):
            report_check = gr.Checkbox(label="Return Report", value=True)
            # Choices must match the string checks in summarize()
            sum_mem_check = gr.Radio(label="Output", choices=["Summarize", "Memory"])
            button = gr.Button()
    with gr.Row():
        stop_button = gr.Button("Stop")
        clear_btn = gr.Button("Clear")
    with gr.Row():
        with gr.Tab("Text"):
            data = gr.Textbox(label="Input Data (paste text)", lines=6)
        with gr.Tab("File"):
            file = gr.Files(label="Input File(s) (.pdf .txt)")
        with gr.Tab("Raw HTML"):
            url = gr.Textbox(label="URL")
        with gr.Tab("PDF URL"):
            pdf_url = gr.Textbox(label="PDF URL")
        with gr.Tab("PDF Batch"):
            pdf_batch = gr.Textbox(label="PDF URL Batch (comma separated)")
    json_out = gr.JSON()
    e_box = gr.Textbox()
    clear_btn.click(clear_fn, None, [prompt, chatbot])
    go = button.click(summarize, [prompt, chatbot, report_check, sum_mem_check, data, file, url, pdf_url, pdf_batch], [prompt, chatbot, e_box, json_out])
    stop_button.click(None, None, None, cancels=[go])
app.queue(default_concurrency_limit=20).launch(show_api=False)
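
# Assumed dependencies (not pinned in this commit): gradio, requests,
# beautifulsoup4, lxml, huggingface_hub, pypdf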