# file-indexing / app.py
# Omnibus's picture
# Create app.py
# 9c8a515 verified
# raw
# history blame
# No virus
# 16.3 kB
import gradio as gr
#import urllib.request
import requests
import bs4
import lxml
import os
#import subprocess
from huggingface_hub import InferenceClient,HfApi
import random
import json
import datetime
from pypdf import PdfReader
import uuid
#from query import tasks
from agent import (
PREFIX,
SAVE_MEMORY,
COMPRESS_DATA_PROMPT,
COMPRESS_DATA_PROMPT_SMALL,
LOG_PROMPT,
LOG_RESPONSE,
)
# Inference client bound to the Mixtral instruct model; run_gpt() streams
# completions through it.
client = InferenceClient(
"mistralai/Mixtral-8x7B-Instruct-v0.1"
)
# Hugging Face dataset repository that save_memory() uploads to.
reponame="Omnibus/tmp"
# Base URL used to read raw files back from that dataset repository.
save_data=f'https://huggingface.co/datasets/{reponame}/raw/main/'
# Raises KeyError at import time when HF_TOKEN is not set in the environment.
token_self = os.environ['HF_TOKEN']
# Hub API handle authenticated with the token above.
api=HfApi(token=token_self)
def find_all(purpose, task, history, url, result):
    """Download a web page and return its text plus every anchor on it.

    Args:
        purpose: Unused; kept so the positional signature stays stable.
        task: Unused; kept for the same reason.
        history: Unused; kept for the same reason.
        url: Address of the page to fetch.
        result: Unused; kept for the same reason.

    Returns:
        A ``(ok, payload)`` tuple on every path. On success ``ok`` is True
        and ``payload`` is the string form of a list holding the page text,
        a marker string, and one single-item list per ``<a>`` tag. On failure
        ``ok`` is False and ``payload`` is a short error message.
    """
    print(url)
    print(f"trying URL:: {url}")
    # A missing URL previously fell through to a 4-tuple return, which the
    # caller (expecting two values) could not unpack.
    if not url:
        return False, "Enter Valid URL"
    try:
        source = requests.get(url)
        if source.status_code != 200:
            return False, "Enter Valid URL"
        soup = bs4.BeautifulSoup(source.content, 'lxml')
        out = [f'RAW TEXT RETURNED: {soup.text}', "HTML fragments: "]
        for p in soup.find_all("a"):
            out.append([{"LINK TITLE": p.get('title'), "URL": p.get('href'), "STRING": p.string}])
        out = str(out)
        print(f'rl:: {len(out)}')
        # Rough size indicator for the log only: number of separator characters.
        c = sum(ch in ' ,\n/.<' for ch in out)
        print(f'c:: {c}')
        return True, out
    except Exception as e:
        # Best-effort fetch: report the problem to the caller instead of raising.
        print(e)
        return False, f'Error: {e}'
def read_txt(txt_path):
    """Read a text file, echo its content to stdout, and return it.

    The file is decoded as UTF-8 rather than with the platform default
    encoding, so the same upload reads identically on every host.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The full content of the file as a string.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()
    print(text)
    return text
def read_pdf(pdf_path):
    """Return the text of every page of a local PDF, echoing it to stdout.

    Args:
        pdf_path: Path of the PDF file; it is converted to ``str`` before
            being handed to ``PdfReader``.

    Returns:
        The extracted page texts, each one preceded by a newline.
    """
    reader = PdfReader(f'{pdf_path}')
    # Each page contributes "\n<page text>", in page order.
    text = "".join(f'\n{page.extract_text()}' for page in reader.pages)
    print (text)
    return text
# Module-level list of URLs that read_pdf_online() failed to download (non-200
# response). summarize() yields it to the UI; nothing in the visible source
# ever clears it.
error_box=[]
def read_pdf_online(url):
    """Download a PDF and return the concatenated text of its pages.

    Args:
        url: Address of the PDF document.

    Returns:
        * ``str`` - the page texts, each preceded by a newline, on success.
        * ``int`` - the HTTP status code when the response is not 200; the
          URL is also appended to the module-level ``error_box``.
        * ``Exception`` - the caught exception object when the download or
          the parsing fails. Callers stringify the result, so the original
          return types are kept.
    """
    import io  # local import so this block does not depend on file-level edits

    print(f"reading {url}")
    try:
        # The request lives inside the try so connection errors are reported
        # the same way as parsing errors instead of escaping to the caller.
        response = requests.get(url)
        print(response.status_code)
        if response.status_code != 200:
            error_box.append(url)
            return response.status_code
        # Parse straight from memory. Writing every download to one shared
        # "test.pdf" let concurrent requests overwrite each other's file.
        reader = PdfReader(io.BytesIO(response.content))
        print(len(reader.pages))
        text = "".join(f'\n{page.extract_text()}' for page in reader.pages)
        print(f"PDF_TEXT:: {text}")
        return text
    except Exception as e:
        print(e)
        return e
# When True, run_gpt() prints the formatted prompt and the model response.
VERBOSE = True
# Only referenced from commented-out code in the visible source.
MAX_HISTORY = 100
# Size budget used when input text is split into chunks for the model.
MAX_DATA = 20000
def format_prompt(message, history):
    """Build an ``[INST]``-tagged prompt from earlier turns and a new message.

    Args:
        message: The new user message, placed last.
        history: Iterable of ``(user_prompt, bot_response)`` pairs.

    Returns:
        ``"<s>"`` followed by one ``[INST] ... [/INST] ...</s> `` segment per
        past turn and a final ``[INST] message [/INST]`` segment.
    """
    past_turns = [
        f"[INST] {user_prompt} [/INST] {bot_response}</s> "
        for user_prompt, bot_response in history
    ]
    return "<s>" + "".join(past_turns) + f"[INST] {message} [/INST]"
def run_gpt(
    prompt_template,
    stop_tokens,
    max_tokens,
    seed,
    **prompt_kwargs,
):
    """Format a prompt, stream a completion from the model, and return it.

    Args:
        prompt_template: Template string filled with ``prompt_kwargs`` via
            ``str.format``.
        stop_tokens: Accepted but not forwarded to the generation call.
        max_tokens: Passed to the model as ``max_new_tokens``.
        seed: Sampling seed; also printed.
        **prompt_kwargs: Values for the fields of ``prompt_template``.

    Returns:
        The generated text, assembled from the streamed token texts.
    """
    print(seed)
    now = datetime.datetime.now()
    header = PREFIX.format(
        timestamp=now,
        purpose="Compile the provided data and complete the users task",
    )
    content = header + prompt_template.format(**prompt_kwargs)
    if VERBOSE:
        print(LOG_PROMPT.format(content))
    sampling = {
        "temperature": 0.9,
        "max_new_tokens": max_tokens,
        "top_p": 0.95,
        "repetition_penalty": 1.0,
        "do_sample": True,
        "seed": seed,
    }
    stream = client.text_generation(content, **sampling, stream=True, details=True, return_full_text=False)
    # Each streamed item carries one token; concatenate their text in order.
    resp = "".join(piece.token.text for piece in stream)
    if VERBOSE:
        print(LOG_RESPONSE.format(resp))
    return resp
def compress_data(c, instruct, history):
    """Summarize ``history`` chunk by chunk and return the partial summaries.

    Args:
        c: Separator count computed by the caller. It is only logged now;
            see the note on chunking below.
        instruct: Instruction text passed to the prompt as ``direction``.
        history: The text to compress.

    Returns:
        A list with one model response per chunk, in input order. The list
        is empty when ``history`` is empty.
    """
    seed = random.randint(1, 1000000000)
    print(c)
    print(f'chunk:: {MAX_DATA}')
    out = []
    # Walk the text by character offset so every character lands in exactly
    # one chunk. The previous arithmetic derived the number of chunks from
    # the separator count ``c`` while slicing by characters, so any text past
    # ceil(c / MAX_DATA) * MAX_DATA characters was never sent to the model,
    # and ``c == 0`` raised ZeroDivisionError.
    for s in range(0, len(history), MAX_DATA):
        e = s + MAX_DATA
        print(f's:e :: {s}:{e}')
        resp = run_gpt(
            COMPRESS_DATA_PROMPT_SMALL,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge="",
            history=history[s:e],
        )
        out.append(resp)
        print(resp)
    return out
def compress_data_og(c, instruct, history):
    """Fold ``history`` into one rolling summary and return it.

    Each chunk is sent to the model together with the response produced for
    the previous chunk (as ``knowledge``), so the last response is the
    cumulative result.

    Args:
        c: Separator count computed by the caller. It is only logged now;
            see the note on chunking below.
        instruct: Instruction text passed to the prompt as ``direction``.
        history: The text to compress.

    Returns:
        The model response for the final chunk, or ``""`` when ``history``
        is empty.
    """
    seed = random.randint(1, 1000000000)
    print(c)
    print(f'chunk:: {MAX_DATA}')
    # Initialised so the function still returns cleanly when no chunk runs.
    resp = ""
    new_history = ""
    # Walk the text by character offset so every character is covered. The
    # previous arithmetic derived the number of chunks from the separator
    # count ``c`` while slicing by characters, which dropped the tail of
    # long inputs and raised ZeroDivisionError for ``c == 0``.
    for s in range(0, len(history), MAX_DATA):
        e = s + MAX_DATA
        print(f's:e :: {s}:{e}')
        resp = run_gpt(
            COMPRESS_DATA_PROMPT,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge=new_history,
            history=history[s:e],
        )
        # Carry the running summary forward into the next chunk's prompt.
        new_history = resp
        print(resp)
    print("final" + resp)
    return resp
def summarize(inp,history,report_check,sum_mem_check,data=None,files=None,url=None,pdf_url=None,pdf_batch=None):
    """Gradio handler: gather the input data, then summarize or index it.

    Sources are applied in this order: a PDF URL batch and a single PDF URL
    each replace ``data``; a web page URL replaces it as well; uploaded files
    are appended to it.

    Args:
        inp: Instruction text; falls back to "Process this data" when empty.
        history: Incoming chat history. It is not reused; a fresh history is
            yielded instead.
        report_check: When true, the chunk summaries are merged into a single
            report.
        sum_mem_check: Output mode. "Memory" indexes the data through
            save_memory(); "Summary" (or the legacy value "Summarize")
            summarizes it.
        data: Pasted text.
        files: Paths of uploaded ``.pdf`` / ``.txt`` files.
        url: Web page address.
        pdf_url: Address of a single PDF.
        pdf_batch: Comma-separated PDF addresses.

    Yields:
        ``(prompt_text, chat_history, error_box, json_out)`` twice: first a
        "Working on it..." placeholder, then the final result.
    """
    # Bound up front: the final yield needs it on every path, not only after
    # a summary run.
    json_out = []
    if not inp:
        inp = "Process this data"
    history = [(inp, "Working on it...")]
    yield "", history, error_box, json_out

    # The optional inputs default to None; normalise them so the string
    # checks below cannot fail with AttributeError.
    if data is None:
        data = ""
    url = url or ""
    pdf_url = pdf_url or ""
    pdf_batch = pdf_batch or ""
    fetch_error = None

    if pdf_batch.startswith("http"):
        data = ""
        for batch_url in pdf_batch.split(","):
            batch_url = batch_url.strip()
            if not batch_url:
                continue
            # Guard each URL separately so one bad link does not abort the
            # remaining downloads.
            try:
                bb = read_pdf_online(batch_url)
                data = f'{data}\nFile Name URL ({batch_url}):\n{bb}'
            except Exception as e:
                print(e)
                data = f'{data}\nError reading URL ({batch_url})'
    if pdf_url.startswith("http"):
        print("PDF_URL")
        data = read_pdf_online(pdf_url)
    if url.startswith("http"):
        # find_all() takes five positional parameters; only the URL matters.
        val, out = find_all(inp, "", history, url, "")
        if val:
            data = out
        else:
            data = "Error"
            fetch_error = str(out)
    if files:
        for file in files:
            try:
                print(file)
                if file.endswith(".pdf"):
                    zz = read_pdf(file)
                elif file.endswith(".txt"):
                    zz = read_txt(file)
                else:
                    continue
                print(zz)
                data = f'{data}\nFile Name ({file}):\n{zz}'
            except Exception as e:
                data = f'{data}\nError opening File Name ({file})'
                print(e)

    if data != "Error" and data != "":
        print(inp)
        out = str(data)
        print(f'rl:: {len(out)}')
        # Separator count handed to the compress helpers.
        c = 1 + sum(ch in ' ,\n' for ch in out)
        print(f'c:: {c}')
        if sum_mem_check == "Memory":
            save_memory(inp, out)
            rawp = "Complete"
        elif sum_mem_check in ("Summary", "Summarize"):
            # The radio offers "Summary"; "Summarize" is the value this
            # handler used to test for, so both are accepted.
            json_out = compress_data(c, inp, out)
            out = str(json_out)
            if report_check:
                print(f'rl:: {len(out)}')
                c = 1 + sum(ch in ' ,\n' for ch in out)
                print(f'c2:: {c}')
                rawp = compress_data_og(c, inp, out)
            else:
                rawp = out
        else:
            rawp = "Select an output type: Summary or Memory"
    else:
        # Prefer the concrete fetch error over the generic hint.
        rawp = fetch_error or "Provide a valid data source"
    history = [(inp, rawp)]
    yield "", history, error_box, json_out
# Prompt template that save_memory() hands to run_gpt(); it is filled through
# str.format with the {task} and {history} fields.
# NOTE: this assignment rebinds the SAVE_MEMORY name imported from `agent` at
# the top of the file, so the imported value is not the one used below.
SAVE_MEMORY = """
You are attempting to complete the task
task: {task}
Data:
{history}
Instructions:
Compile and categorize the data above into a JSON dictionary string
Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format
Your final response should be only the final formatted JSON string enclosed in brackets, and nothing else.
Required keys:
"keywords":["short", "list", "of", "keywords", "relevant", "to", "this", "entry"]
"title":"title of entry"
"description":"description of entry"
"content":"full content of data about entry"
"url":"https://url.source"
"""
def _extract_keywords(line):
    """Return the cleaned keywords of one ``"keywords": [...]`` line, else None."""
    if ":" not in line or "keywords" not in line[:16]:
        return None
    # Take everything after the first colon so keywords that themselves
    # contain a colon are not cut short.
    after = line.partition(":")[2]
    if "[" not in after:
        return None
    inner = after.split("[", 1)[1].split("]", 1)[0]
    # Keep letters, digits and spaces only (drops quotes and punctuation).
    return [
        "".join(ch for ch in kw.strip() if ch.isalnum() or ch == " ")
        for kw in inner.split(",")
    ]


def save_memory(purpose, history):
    """Index ``history`` with the model and upload the result to the Hub.

    For every chunk of the input the model is asked for a JSON index entry.
    The response is uploaded as ``mem-test2/<timestamp>.json``, and the
    keywords found in it are appended to ``mem-test2/main.json`` together
    with that file name.

    Args:
        purpose: Forwarded to run_gpt() as ``purpose``.
        history: Data to index; converted to ``str`` first.

    Returns:
        None. Nothing is uploaded when ``history`` is empty.
    """
    inp = str(history)
    print(f'rl:: {len(inp)}')
    seed = random.randint(1, 1000000000)
    task = 'Index this Data\n'
    # Walk the text by character offset so every character is covered. The
    # previous arithmetic derived the number of chunks from a separator count
    # while slicing by characters, which dropped the tail of long inputs and
    # divided by zero for inputs without separators.
    for s in range(0, len(inp), MAX_DATA):
        end = s + MAX_DATA
        print(f's:e :: {s}:{end}')
        resp = run_gpt(
            SAVE_MEMORY,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=4096,
            seed=seed,
            purpose=purpose,
            task=task,
            history=inp[s:end],
        ).strip('\n')
        print("final1" + resp)
        # Keep only the JSON payload: from the first "[{" up to an
        # end-of-sequence marker, when the model emitted one.
        if '[{' in resp:
            resp = '[{' + resp.split('[{')[1].split('</s>')[0]
            print("final2\n" + resp)

        timestamp = str(datetime.datetime.now())
        timename = timestamp.replace(" ", "--").replace(":", "-").replace(".", "-")
        # Upload from memory; the earlier temp files were never cleaned up.
        api.upload_file(
            path_or_fileobj=resp.encode("utf-8"),
            path_in_repo=f"/mem-test2/{timename}.json",
            repo_id=reponame,
            token=token_self,
            repo_type="dataset",
        )

        # Read the current keyword index, or start a new one if it is missing.
        r = requests.get(f'{save_data}mem-test2/main.json')
        print(f'status code main:: {r.status_code}')
        if r.status_code == 200:
            lod = json.loads(r.text)
            print(f'lod:: {lod}')
        else:
            lod = []

        for line in resp.strip().split("\n"):
            print(f'LINE:: {line}')
            key_box = _extract_keywords(line)
            if key_box is not None:
                print(key_box)
                lod.append({"file_name": timename, "keywords": key_box})

        api.upload_file(
            path_or_fileobj=json.dumps(lod, indent=4).encode("utf-8"),
            path_in_repo="/mem-test2/main.json",
            repo_id=reponame,
            token=token_self,
            repo_type="dataset",
        )
#################################
def clear_fn():
    """Return the reset values for the prompt box and the chatbot.

    Returns:
        A tuple of an empty prompt string and a chat history holding a
        single ``(None, None)`` pair.
    """
    empty_prompt = ""
    blank_chat = [(None, None)]
    return empty_prompt, blank_chat
# Gradio front end: instructions and output options on top, one tab per input
# source below, wired to summarize(). The source had lost its indentation, so
# the container nesting here is a reconstruction.
with gr.Blocks() as app:
    gr.HTML("""<center><h1>Mixtral 8x7B TLDR Summarizer + Web</h1><h3>Summarize Data of unlimited length</h3>""")
    chatbot = gr.Chatbot(label="Mixtral 8x7B Chatbot",show_copy_button=True)
    with gr.Row():
        with gr.Column(scale=3):
            prompt=gr.Textbox(label = "Instructions (optional)")
        with gr.Column(scale=1):
            report_check=gr.Checkbox(label="Return Report", value=True)
            # (label, value) pairs: the UI keeps showing "Summary" while the
            # handler receives "Summarize", the value summarize() tests for.
            # A default is set so the handler never receives None.
            sum_mem_check=gr.Radio(
                label="Output",
                choices=[("Summary", "Summarize"), ("Memory", "Memory")],
                value="Summarize",
            )
            button=gr.Button()
    with gr.Row():
        stop_button=gr.Button("Stop")
        clear_btn = gr.Button("Clear")
    with gr.Row():
        with gr.Tab("Text"):
            data=gr.Textbox(label="Input Data (paste text)", lines=6)
        with gr.Tab("File"):
            file=gr.Files(label="Input File(s) (.pdf .txt)")
        with gr.Tab("Raw HTML"):
            url = gr.Textbox(label="URL")
        with gr.Tab("PDF URL"):
            pdf_url = gr.Textbox(label="PDF URL")
        with gr.Tab("PDF Batch"):
            pdf_batch = gr.Textbox(label="PDF URL Batch (comma separated)")
    json_out=gr.JSON()
    e_box=gr.Textbox()
    clear_btn.click(clear_fn,None,[prompt,chatbot])
    # Output order matches what summarize() yields: prompt text, chat
    # history, failed URLs, JSON result.
    go=button.click(summarize,[prompt,chatbot,report_check,sum_mem_check,data,file,url,pdf_url,pdf_batch],[prompt,chatbot,e_box,json_out])
    # "Stop" cancels the running summarize() generator.
    stop_button.click(None,None,None,cancels=[go])
app.queue(default_concurrency_limit=20).launch(show_api=False)