file-indexing

Sleeping

App Files Files Community

file-indexing / app.py

Omnibus

Update app.py

c093ae5 verified 8 months ago

raw

history blame

16.7 kB

	import gradio as gr
	#import urllib.request
	import requests
	import bs4
	import lxml
	import os
	#import subprocess
	from huggingface_hub import InferenceClient,HfApi
	import random
	import json
	import datetime
	from pypdf import PdfReader
	import uuid
	#from query import tasks
	from agent import (
	PREFIX,
	COMPRESS_DATA_PROMPT,
	COMPRESS_DATA_PROMPT_SMALL,
	LOG_PROMPT,
	LOG_RESPONSE,
	)
	# Shared Hugging Face handles used by the functions below.
	# Text-generation client bound to the Mixtral instruct model; used by run_gpt().
	client = InferenceClient(
	"mistralai/Mixtral-8x7B-Instruct-v0.1"
	)
	# Dataset repository that save_memory() uploads into.
	reponame="Omnibus/tmp"
	# Base URL for reading raw files back from that dataset's main branch.
	save_data=f'https://huggingface.co/datasets/{reponame}/raw/main/'
	# Read at import time; a missing HF_TOKEN raises KeyError before the app starts.
	token_self = os.environ['HF_TOKEN']
	api=HfApi(token=token_self)

	def find_all(url, timeout=30):
	    """Download a web page and return its text together with its links.

	    Args:
	        url: Address of the page. ``None`` and ``""`` are rejected.
	        timeout: Seconds to wait for the server before giving up, so a
	            stalled host cannot block the request forever.

	    Returns:
	        A ``(ok, payload)`` tuple. When the page is fetched with status 200,
	        ``ok`` is True and ``payload`` is the ``str()`` of a list holding the
	        page text, a marker string, and one single-item list per ``<a>`` tag
	        (its title, href and string). Otherwise ``ok`` is False and
	        ``payload`` is a short message describing the problem.
	    """
	    print (url)
	    print (f"trying URL:: {url}")
	    if not url:
	        print('passing')
	        return False, "Enter Valid URL"
	    try:
	        source = requests.get(url, timeout=timeout)
	        print(source.status_code)
	        if source.status_code != 200:
	            return False, f'Status:: {source.status_code}'
	        print('trying')
	        soup = bs4.BeautifulSoup(source.content,'lxml')

	        rawp = f'RAW TEXT RETURNED: {soup.text}'
	        print (rawp)
	        out = [rawp, "HTML fragments: "]
	        out.extend(
	            [{"LINK TITLE":a.get('title'),"URL":a.get('href'),"STRING":a.string}]
	            for a in soup.find_all("a")
	        )
	        out = str(out)
	        print(f'rl:: {len(out)}')
	        # Rough size estimate (separator characters); only logged.
	        c = sum(1 for ch in out if ch in " ,\n/.<")
	        print (f'c:: {c}')
	        return True, out
	    except Exception as e:
	        # Network and parsing failures are reported to the caller, not raised.
	        print (e)
	        return False, f'Error: {e}'


	def read_txt(txt_path):
	    """Return the whole content of a text file, echoing it to stdout.

	    Args:
	        txt_path: Path of the file to read.

	    Returns:
	        The file content as a single string.

	    Raises:
	        OSError: If the file cannot be opened.
	        UnicodeDecodeError: If the file is not valid UTF-8.
	    """
	    # Decode explicitly as UTF-8 so the result does not depend on the
	    # host locale; the ``with`` block closes the file on exit.
	    with open(txt_path, "r", encoding="utf-8") as f:
	        text = f.read()
	    print (text)
	    return text

	def read_pdf(pdf_path):
	    """Extract the text of every page of a local PDF.

	    Args:
	        pdf_path: Path of the PDF; it is formatted into a string before
	            being handed to ``PdfReader``.

	    Returns:
	        The page texts concatenated in order, each preceded by a newline.
	        The result is also printed.
	    """
	    document = PdfReader(f'{pdf_path}')
	    page_texts = [f'\n{page.extract_text()}' for page in document.pages]
	    text = "".join(page_texts)
	    print (text)
	    return text

	# URLs whose download did not return HTTP 200; summarize() yields this list
	# to the UI.
	error_box=[]


	def read_pdf_online(url, timeout=60):
	    """Download a PDF and return the concatenated text of its pages.

	    The document is parsed from memory rather than from a shared
	    ``test.pdf`` on disk, so simultaneous requests cannot overwrite each
	    other's download.

	    Args:
	        url: Address of the PDF.
	        timeout: Seconds to wait for the server before giving up.

	    Returns:
	        The extracted text (each page preceded by a newline) when the server
	        answers 200. For any other status the integer status code is
	        returned and ``url`` is appended to ``error_box``. If anything
	        raises, including the download itself, the exception object is
	        returned instead of being raised.
	    """
	    import io

	    print(f"reading {url}")
	    try:
	        response = requests.get(url, timeout=timeout)
	        print(response.status_code)
	        if response.status_code != 200:
	            error_box.append(url)
	            print(response.status_code)
	            return response.status_code
	        reader = PdfReader(io.BytesIO(response.content))
	        print(len(reader.pages))
	        text = "".join(f'\n{page.extract_text()}' for page in reader.pages)
	        print(f"PDF_TEXT:: {text}")
	        return text
	    except Exception as e:
	        print (e)
	        return e


	# When True, run_gpt() prints the formatted prompt and the model response.
	VERBOSE = True
	# Only referenced by a commented-out check in find_all() in the visible code.
	MAX_HISTORY = 100
	# Divisor the compress/save functions use to decide how many pieces to split input into.
	MAX_DATA = 20000

	def format_prompt(message, history):
	    """Render a chat history plus a new message in ``[INST]`` markup.

	    Args:
	        message: The new user message, placed last.
	        history: Iterable of ``(user_prompt, bot_response)`` pairs.

	    Returns:
	        A string starting with ``<s>``, followed by one
	        ``[INST] user [/INST] bot</s> `` segment per pair and a final
	        ``[INST] message [/INST]`` segment.
	    """
	    turns = [
	        f"[INST] {asked} [/INST] {answered}</s> "
	        for asked, answered in history
	    ]
	    return "".join(["<s>", *turns, f"[INST] {message} [/INST]"])



	def run_gpt(
	    prompt_template,
	    stop_tokens,
	    max_tokens,
	    seed,
	    **prompt_kwargs,
	):
	    """Format a prompt, stream a completion from ``client`` and return it.

	    Args:
	        prompt_template: ``str.format`` template filled from
	            ``prompt_kwargs`` and appended to the formatted ``PREFIX``.
	        stop_tokens: Accepted for interface compatibility; it is not
	            forwarded to the generation call.
	        max_tokens: Passed to the model as ``max_new_tokens``.
	        seed: Sampling seed; also printed.
	        **prompt_kwargs: Values substituted into ``prompt_template``.

	    Returns:
	        The generated text, assembled from the streamed tokens.
	    """
	    print(seed)
	    now = datetime.datetime.now()

	    header = PREFIX.format(
	        timestamp=now,
	        purpose="Compile the provided data and complete the users task"
	    )
	    content = header + prompt_template.format(**prompt_kwargs)
	    if VERBOSE:
	        print(LOG_PROMPT.format(content))

	    sampling = {
	        "temperature": 0.9,
	        "max_new_tokens": max_tokens,
	        "top_p": 0.95,
	        "repetition_penalty": 1.0,
	        "do_sample": True,
	        "seed": seed,
	    }
	    token_stream = client.text_generation(
	        content,
	        **sampling,
	        stream=True,
	        details=True,
	        return_full_text=False,
	    )
	    resp = "".join(piece.token.text for piece in token_stream)

	    if VERBOSE:
	        print(LOG_RESPONSE.format(resp))
	    return resp


	def compress_data(c, instruct, history):
	    """Summarize ``history`` piece by piece, one model call per piece.

	    The text is cut into consecutive slices of ``MAX_DATA`` characters and
	    every slice is sent to the model with ``COMPRESS_DATA_PROMPT_SMALL``.
	    The number of slices is derived from the length of ``history`` so that
	    the end of long inputs is always processed.

	    Args:
	        c: Separator count computed by the caller; only logged here.
	        instruct: User instruction, passed to the prompt as ``direction``.
	        history: The text to summarize.

	    Returns:
	        A list with one model response per slice, in order. At least one
	        call is made even when ``history`` is empty.
	    """
	    seed=random.randint(1,1000000000)

	    print (c)
	    chunk = MAX_DATA
	    print(f'chunk:: {chunk}')
	    # max(..., 1) keeps the single call that short or empty input always got.
	    starts = range(0, max(len(history), 1), chunk)
	    print (f'divi:: {len(starts)}')
	    out = []
	    for s in starts:
	        e = s + chunk
	        print(f's:e :: {s}:{e}')
	        resp = run_gpt(
	            COMPRESS_DATA_PROMPT_SMALL,
	            stop_tokens=["observation:", "task:", "action:", "thought:"],
	            max_tokens=8192,
	            seed=seed,
	            direction=instruct,
	            knowledge="",
	            history=history[s:e],
	        )
	        out.append(resp)
	        print (resp)
	    return out


	def compress_data_og(c, instruct, history):
	    """Fold ``history`` into a single report using a rolling summary.

	    The text is cut into consecutive slices of ``MAX_DATA`` characters.
	    Each slice is sent to the model with ``COMPRESS_DATA_PROMPT``, together
	    with the previous response as ``knowledge``, so the last response covers
	    the whole input. The number of slices is derived from the length of
	    ``history`` so that the end of long inputs is always processed.

	    Args:
	        c: Separator count computed by the caller; only logged here.
	        instruct: User instruction, passed to the prompt as ``direction``.
	        history: The text to condense.

	    Returns:
	        The model response to the final slice.
	    """
	    seed=random.randint(1,1000000000)

	    print (c)
	    chunk = MAX_DATA
	    print(f'chunk:: {chunk}')
	    # max(..., 1) guarantees one call, so ``resp`` is always bound below.
	    starts = range(0, max(len(history), 1), chunk)
	    print (f'divi:: {len(starts)}')
	    resp = ""
	    for s in starts:
	        e = s + chunk
	        print(f's:e :: {s}:{e}')
	        resp = run_gpt(
	            COMPRESS_DATA_PROMPT,
	            stop_tokens=["observation:", "task:", "action:", "thought:"],
	            max_tokens=8192,
	            seed=seed,
	            direction=instruct,
	            # The previous response carries the summary so far forward.
	            knowledge=resp,
	            history=history[s:e],
	        )
	        print (resp)
	    print ("final" + resp)
	    return resp



	def _separator_count(text):
	    """Return 1 plus the number of spaces, commas and newlines in ``text``."""
	    return 1 + sum(1 for ch in text if ch in " ,\n")


	def summarize(inp,history,report_check,sum_mem_check,data=None,files=None,url=None,pdf_url=None,pdf_batch=None):
	    """Gradio handler: collect the input data, then summarize or index it.

	    This is a generator. It first yields a "Working on it..." placeholder,
	    then the final result. Each yield is a tuple of
	    ``(prompt text, chat history, error_box, JSON payload)``.

	    Data sources are applied in this order, later ones replacing or
	    extending earlier ones: pasted ``data``, ``pdf_batch`` (replaces),
	    ``pdf_url`` (replaces), ``url`` (replaces), ``files`` (appends).

	    Args:
	        inp: User instruction; defaults to "Process this data" when empty.
	        history: Chat history list; cleared in place.
	        report_check: When summarizing, whether to condense the per-piece
	            summaries into one report with ``compress_data_og``.
	        sum_mem_check: ``"Memory"`` stores the data with ``save_memory``.
	            Any other value, including ``"Summary"``, ``"Summarize"`` and
	            an unselected radio (``None``), summarizes the data.
	        data: Pasted text.
	        files: Paths of uploaded ``.pdf`` / ``.txt`` files; other
	            extensions are ignored.
	        url: Web page to scrape with ``find_all``.
	        pdf_url: Single online PDF.
	        pdf_batch: Comma-separated online PDF URLs.
	    """
	    # Bound up front so the final yield never hits an unassigned name.
	    json_out = []
	    if inp == "":
	        inp = "Process this data"
	    history.clear()
	    history = [(inp,"Working on it...")]
	    yield "",history,error_box,json_out

	    # The signature defaults are None; normalise so the checks below work.
	    data = "" if data is None else data
	    url = url or ""
	    pdf_url = pdf_url or ""
	    pdf_batch = pdf_batch or ""

	    if pdf_batch.startswith("http"):
	        data=""
	        for batch_url in pdf_batch.split(","):
	            batch_url = batch_url.strip()
	            if not batch_url:
	                continue
	            # Handle each URL on its own so one failure does not drop the rest.
	            try:
	                bb = read_pdf_online(batch_url)
	            except Exception as e:
	                print(e)
	                continue
	            data=f'{data}\nFile Name URL ({batch_url}):\n{bb}'
	    if pdf_url.startswith("http"):
	        print("PDF_URL")
	        data = read_pdf_online(pdf_url)
	    if url.startswith("http"):
	        val, out = find_all(url)
	        data = out if val else "Error"
	    if files:
	        for file in files:
	            try:
	                print (file)
	                if file.endswith(".pdf"):
	                    zz=read_pdf(file)
	                elif file.endswith(".txt"):
	                    zz=read_txt(file)
	                else:
	                    continue
	                print (zz)
	                data=f'{data}\nFile Name ({file}):\n{zz}'
	            except Exception as e:
	                data=f'{data}\nError opening File Name ({file})'
	                print (e)

	    if data != "Error" and data != "":
	        print(inp)
	        out = str(data)
	        print(f'rl:: {len(out)}')
	        c = _separator_count(out)
	        print (f'c:: {c}')
	        if sum_mem_check=="Memory":
	            json_out = save_memory(inp,out)
	            rawp = "Complete"
	        else:
	            json_out = compress_data(c,inp,out)
	            out = str(json_out)
	            if report_check:
	                print(f'rl:: {len(out)}')
	                c = _separator_count(out)
	                print (f'c2:: {c}')
	                rawp = compress_data_og(c,inp,out)
	            else:
	                rawp = out
	    else:
	        rawp = "Provide a valid data source"
	    history.clear()
	    history.append((inp,rawp))
	    yield "", history,error_box,json_out
	# Prompt template used by save_memory(). run_gpt() fills the {task} and
	# {history} placeholders with str.format, so the text must not contain any
	# other curly braces.
	SAVE_MEMORY = """
	You are attempting to complete the task
	task: {task}
	Data:
	{history}
	Instructions:
	Compile and categorize the data above into a JSON dictionary string
	Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format
	Your final response should be only the final formatted JSON string enclosed in brackets, and nothing else.
	Required keys:
	"keywords":["short", "list", "of", "important", "keywords", "found", "in", "this", "entry"]
	"title":"title of entry"
	"description":"A sentence summarizing the topic of this entry"
	"content":"A brief paragraph summarizing the important datapoints found in this entry"
	"url":"https://url.source"
	"""

	def _keywords_from_line(line):
	    """Return the cleaned keywords of a ``"keywords": [...]`` line, else None."""
	    if ":" not in line or "keywords" not in line:
	        return None
	    listed = line.split(":")[1]
	    if "[" not in listed:
	        # Malformed model output: skip the line instead of aborting the run.
	        return None
	    listed = listed.split("[")[1].split("]")[0]
	    # Keep only letters, digits and spaces of each comma-separated keyword.
	    return [
	        "".join(ch for ch in word.strip() if ch.isalnum() or ch == " ")
	        for word in listed.split(",")
	    ]


	def save_memory(purpose, history):
	    """Index ``history`` with the model and store the result in the dataset.

	    The text is cut into consecutive slices of ``MAX_DATA`` characters. For
	    every slice the model is asked (``SAVE_MEMORY`` prompt) for a JSON
	    description, which is uploaded as its own file under ``mem-test2/``.
	    The keywords found in the response are then appended to
	    ``mem-test2/main.json``, which is downloaded, extended and uploaded
	    again for each slice. Uploads are made from memory, so no temporary
	    files are left on disk. The number of slices is derived from the text
	    length so that the end of long inputs is always processed.

	    Args:
	        purpose: User instruction, forwarded to ``run_gpt``.
	        history: Data to index; converted with ``str()``.

	    Returns:
	        A list with the (trimmed) model response for each slice.
	    """
	    inp = str(history)
	    print(f'rl:: {len(inp)}')
	    # Rough size estimate (separator characters); only logged.
	    c = 1 + sum(1 for ch in inp if ch in " ,\n/\\.<")
	    print (f'c:: {c}')

	    seed=random.randint(1,1000000000)

	    chunk = MAX_DATA
	    print(f'chunk:: {chunk}')
	    # max(..., 1) keeps the single call that short or empty input always got.
	    starts = range(0, max(len(inp), 1), chunk)
	    print (f'divi:: {len(starts)}')
	    out_box = []
	    task = 'Index this Data\n'
	    for s in starts:
	        ee = s + chunk
	        print(f's:e :: {s}:{ee}')

	        resp = run_gpt(
	            SAVE_MEMORY,
	            stop_tokens=["observation:", "task:", "action:", "thought:"],
	            max_tokens=4096,
	            seed=seed,
	            purpose=purpose,
	            task=task,
	            history=inp[s:ee],
	        ).strip('\n')

	        # Keep only the JSON list, dropping any preamble and end-of-sequence tag.
	        if '[{' in resp:
	            resp = '[{' + resp.split('[{')[1].split('</s>')[0]

	        timestamp=str(datetime.datetime.now())
	        timename=timestamp.replace(" ","--").replace(":","-").replace(".","-")
	        entry_name = f"{timename}---{s}-{ee}"
	        api.upload_file(
	            path_or_fileobj=resp.encode("utf-8"),
	            path_in_repo=f"/mem-test2/{entry_name}.json",
	            repo_id=reponame,
	            token=token_self,
	            repo_type="dataset",
	        )

	        r = requests.get(f'{save_data}mem-test2/main.json', timeout=30)
	        print(f'status code main:: {r.status_code}')
	        if r.status_code==200:
	            lod = json.loads(r.text)
	            print (f'lod:: {lod}')
	        else:
	            # Any non-200 answer starts a fresh index.
	            lod = []

	        for line in resp.strip().split("\n"):
	            print(f'LINE:: {line}')
	            key_box = _keywords_from_line(line)
	            if key_box is not None:
	                lod.append({"file_name":entry_name,"keywords":key_box,"index":f"{s}:{ee}"})

	        api.upload_file(
	            path_or_fileobj=json.dumps(lod, indent=4).encode("utf-8"),
	            path_in_repo="/mem-test2/main.json",
	            repo_id=reponame,
	            token=token_self,
	            repo_type="dataset",
	        )
	        out_box.append(resp)
	    return out_box





	#################################
	def clear_fn():
	    """Return reset values for the prompt box and the chatbot.

	    Returns:
	        A tuple of an empty string and a chat history holding a single
	        ``(None, None)`` pair.
	    """
	    empty_prompt = ""
	    blank_chat = [(None, None)]
	    return empty_prompt, blank_chat

	# Build the Gradio UI and wire its widgets to the handlers defined above.
	# NOTE(review): the indentation of this copy is flattened; the nesting of the
	# layout blocks below has to be restored before the file can run.
	with gr.Blocks() as app:
	gr.HTML("""<center><h1>Mixtral 8x7B TLDR Summarizer + Web</h1><h3>Summarize Data of unlimited length</h3>""")
	chatbot = gr.Chatbot(label="Mixtral 8x7B Chatbot",show_copy_button=True)
	with gr.Row():
	with gr.Column(scale=3):
	prompt=gr.Textbox(label = "Instructions (optional)")
	with gr.Column(scale=1):
	report_check=gr.Checkbox(label="Return Report", value=True)
	# NOTE(review): the selected choice string is passed verbatim to summarize();
	# keep these labels in sync with the values it compares against. No default
	# value is set, so the handler receives None until the user picks one.
	sum_mem_check=gr.Radio(label="Output",choices=["Summary","Memory"])
	button=gr.Button()

	#models_dd=gr.Dropdown(choices=[m for m in return_list],interactive=True)
	with gr.Row():
	stop_button=gr.Button("Stop")
	clear_btn = gr.Button("Clear")
	# One tab per input source; each widget below is an input of summarize().
	with gr.Row():
	with gr.Tab("Text"):
	data=gr.Textbox(label="Input Data (paste text)", lines=6)
	with gr.Tab("File"):
	file=gr.Files(label="Input File(s) (.pdf .txt)")
	with gr.Tab("Raw HTML"):
	url = gr.Textbox(label="URL")
	with gr.Tab("PDF URL"):
	pdf_url = gr.Textbox(label="PDF URL")
	with gr.Tab("PDF Batch"):
	pdf_batch = gr.Textbox(label="PDF URL Batch (comma separated)")
	# Output widgets: JSON payload and the list of URLs that failed to download.
	json_out=gr.JSON()
	e_box=gr.Textbox()
	#text=gr.JSON()
	#inp_query.change(search_models,inp_query,models_dd)
	# Clear resets the prompt and the chat through clear_fn().
	clear_btn.click(clear_fn,None,[prompt,chatbot])
	# Run summarize(); its four yielded values map to prompt, chatbot, e_box and json_out.
	go=button.click(summarize,[prompt,chatbot,report_check,sum_mem_check,data,file,url,pdf_url,pdf_batch],[prompt,chatbot,e_box,json_out])
	# Stop cancels the summarize() event started by the button above.
	stop_button.click(None,None,None,cancels=[go])
	# Start the app with queuing (default concurrency limit 20) and the API page hidden.
	app.queue(default_concurrency_limit=20).launch(show_api=False)