import gradio as gr
#import urllib.request
import requests
import bs4
import lxml
import os
#import subprocess
from huggingface_hub import InferenceClient, HfApi
import random
import json
import datetime
from pypdf import PdfReader
import uuid
#from query import tasks
from agent import (
    PREFIX,
    COMPRESS_DATA_PROMPT,
    COMPRESS_DATA_PROMPT_SMALL,
    LOG_PROMPT,
    LOG_RESPONSE,
)

client = InferenceClient(
    "mistralai/Mixtral-8x7B-Instruct-v0.1"
)

reponame = "Omnibus/tmp"
save_data = f'https://huggingface.co/datasets/{reponame}/raw/main/'
token_self = os.environ['HF_TOKEN']
api = HfApi(token=token_self)


def find_all(url):
    """Fetch a URL and return its raw page text plus the anchor tags found in it."""
    return_list = []
    print(url)
    print(f"trying URL:: {url}")
    try:
        if url != "" and url is not None:
            out = []
            source = requests.get(url)
            print(source.status_code)
            if source.status_code == 200:
                print('trying')
                soup = bs4.BeautifulSoup(source.content, 'lxml')
                rawp = f'RAW TEXT RETURNED: {soup.text}'
                print(rawp)
                cnt = 0
                cnt += len(rawp)
                out.append(rawp)
                out.append("HTML fragments: ")
                q = ("a", "p", "span", "content", "article")
                for p in soup.find_all("a"):
                    out.append([{"LINK TITLE": p.get('title'), "URL": p.get('href'), "STRING": p.string}])
                c = 0
                out = str(out)
                rl = len(out)
                print(f'rl:: {rl}')
                # rough size estimate: count separator characters in the scraped text
                for i in str(out):
                    if i == " " or i == "," or i == "\n" or i == "/" or i == "." or i == "<":
                        c += 1
                print(f'c:: {c}')
                #if c > MAX_HISTORY:
                #    print("compressing...")
                #    rawp = compress_data(c, purpose, task, out, result)
                #result += rawp
                rawp = out
                return True, rawp
            else:
                return False, f'Status:: {source.status_code}'
        else:
            print('passing')
            return False, "Enter Valid URL"
    except Exception as e:
        print(e)
        return False, f'Error: {e}'


def read_txt(txt_path):
    text = ""
    with open(txt_path, "r") as f:
        text = f.read()
    print(text)
    return text


def read_pdf(pdf_path):
    text = ""
    reader = PdfReader(f'{pdf_path}')
    number_of_pages = len(reader.pages)
    for i in range(number_of_pages):
        page = reader.pages[i]
        text = f'{text}\n{page.extract_text()}'
    print(text)
    return text


error_box = []

def read_pdf_online(url):
    uid = uuid.uuid4()
    print(f"reading {url}")
    response = requests.get(url, stream=True)
    print(response.status_code)
    text = ""
    try:
        if response.status_code == 200:
            with open("test.pdf", "wb") as f:
                f.write(response.content)
            reader = PdfReader("test.pdf")
            number_of_pages = len(reader.pages)
            print(number_of_pages)
            for i in range(number_of_pages):
                page = reader.pages[i]
                text = f'{text}\n{page.extract_text()}'
            print(f"PDF_TEXT:: {text}")
            return text
        else:
            text = f'Error: {response.status_code}'
            error_box.append(url)
            print(text)
            return text
    except Exception as e:
        print(e)
        return f'Error: {e}'


VERBOSE = True
MAX_HISTORY = 100
MAX_DATA = 20000
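

# The scrapers above return plain text, and the code below repeatedly estimates
# how large that text is by counting separator characters rather than tokenizing
# it. The helper below is a minimal, self-contained sketch of that heuristic for
# illustration only; the name `estimate_size` is hypothetical and nothing in this
# app calls it.
def estimate_size(text, separators=(" ", ",", "\n", "/", ".", "<")):
    """Rough proxy for input size: count separator characters in `text`.

    >>> estimate_size("one two, three")
    3
    """
    return sum(1 for ch in str(text) if ch in separators)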


def format_prompt(message, history):
    prompt = ""
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt} [/INST]"
        prompt += f" {bot_response} "
    prompt += f"[INST] {message} [/INST]"
    return prompt


def run_gpt(
    prompt_template,
    stop_tokens,
    max_tokens,
    seed,
    **prompt_kwargs,
):
    print(seed)
    timestamp = datetime.datetime.now()

    generate_kwargs = dict(
        temperature=0.9,
        max_new_tokens=max_tokens,
        top_p=0.95,
        repetition_penalty=1.0,
        do_sample=True,
        seed=seed,
    )

    content = PREFIX.format(
        timestamp=timestamp,
        purpose="Compile the provided data and complete the users task"
    ) + prompt_template.format(**prompt_kwargs)
    if VERBOSE:
        print(LOG_PROMPT.format(content))

    #formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
    #formatted_prompt = format_prompt(f'{content}', history)

    stream = client.text_generation(content, **generate_kwargs, stream=True, details=True, return_full_text=False)
    resp = ""
    for response in stream:
        resp += response.token.text
        #yield resp
    if VERBOSE:
        print(LOG_RESPONSE.format(resp))
    return resp


def compress_data(c, instruct, history):
    seed = random.randint(1, 1000000000)
    print(c)
    #tot = len(purpose)
    #print(tot)
    divr = int(c) / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(int(c) / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    out = []
    #out = ""
    s = 0
    e = chunk
    print(f'e:: {e}')
    new_history = ""
    #task = f'Compile this data to fulfill the task: {task}, and complete the purpose: {purpose}\n'
    for z in range(divi):
        print(f's:e :: {s}:{e}')
        hist = history[s:e]
        resp = run_gpt(
            COMPRESS_DATA_PROMPT_SMALL,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge="",
            history=hist,
        )
        out.append(resp)
        #new_history = resp
        print(resp)
        #out += resp
        e = e + chunk
        s = s + chunk
    return out


def compress_data_og(c, instruct, history):
    seed = random.randint(1, 1000000000)
    print(c)
    #tot = len(purpose)
    #print(tot)
    divr = int(c) / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(int(c) / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    out = []
    #out = ""
    s = 0
    e = chunk
    print(f'e:: {e}')
    new_history = ""
    #task = f'Compile this data to fulfill the task: {task}, and complete the purpose: {purpose}\n'
    for z in range(divi):
        print(f's:e :: {s}:{e}')
        hist = history[s:e]
        resp = run_gpt(
            COMPRESS_DATA_PROMPT,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge=new_history,
            history=hist,
        )
        new_history = resp
        print(resp)
        out.append(resp)
        e = e + chunk
        s = s + chunk
    '''
    resp = run_gpt(
        COMPRESS_DATA_PROMPT,
        stop_tokens=["observation:", "task:", "action:", "thought:"],
        max_tokens=8192,
        seed=seed,
        direction=instruct,
        knowledge=new_history,
        history="All data has been received.",
    )
    '''
    print("final" + resp)
    #history = "observation: {}\n".format(resp)
    return resp
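

# compress_data, compress_data_og, and save_memory all slice their input into
# windows sized by the same divr/divi/chunk arithmetic against MAX_DATA. The
# function below is an illustrative sketch of that slicing (the name
# `chunk_bounds` is hypothetical and is not used by the app); it returns the
# (start, end) pairs the loops above walk through.
def chunk_bounds(c, max_data=MAX_DATA):
    """Return the [start:end) slice bounds produced by the chunking loops.

    For c=45000 and max_data=20000: divr=2.25, divi=3, chunk=20000, so the
    bounds are [(0, 20000), (20000, 40000), (40000, 60000)]. The final end may
    exceed c; Python slicing simply truncates in that case.
    """
    divr = int(c) / max_data
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(int(c) / divr)
    bounds = []
    s, e = 0, chunk
    for _ in range(divi):
        bounds.append((s, e))
        s, e = s + chunk, e + chunk
    return bounds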


def summarize(inp, history, report_check, sum_check, mem_check, data=None, files=None, url=None, pdf_url=None, pdf_batch=None):
    json_box = []
    json_out = None
    if inp == "":
        inp = "Process this data"
    history.clear()
    history = [(inp, "Working on it...")]
    yield "", history, error_box, json_box

    if pdf_batch and pdf_batch.startswith("http"):
        data = ""
        try:
            for batch_url in pdf_batch.split(","):
                bb = read_pdf_online(batch_url)
                data = f'{data}\nFile Name URL ({batch_url}):\n{bb}'
        except Exception as e:
            print(e)
            #data = f'{data}\nError reading URL ({batch_url})'

    if pdf_url and pdf_url.startswith("http"):
        print("PDF_URL")
        out = read_pdf_online(pdf_url)
        data = out

    if url and url.startswith("http"):
        val, out = find_all(url)
        if not val:
            data = "Error"
            rawp = str(out)
        else:
            data = out

    if files:
        for i, file in enumerate(files):
            try:
                print(file)
                if file.endswith(".pdf"):
                    zz = read_pdf(file)
                    print(zz)
                    data = f'{data}\nFile Name ({file}):\n{zz}'
                elif file.endswith(".txt"):
                    zz = read_txt(file)
                    print(zz)
                    data = f'{data}\nFile Name ({file}):\n{zz}'
            except Exception as e:
                data = f'{data}\nError opening File Name ({file})'
                print(e)

    if data != "Error" and data != "":
        print(inp)
        out = str(data)
        # default output if neither summarization nor memory is selected
        rawp = out
        rl = len(out)
        print(f'rl:: {rl}')
        c = 1
        for i in str(out):
            if i == " " or i == "," or i == "\n":
                c += 1
        print(f'c:: {c}')
        if mem_check:
            json_out = save_memory(inp, out)
            rawp = "Complete"
        if sum_check:
            json_out = compress_data(c, inp, out)
            out = str(json_out)
            if report_check:
                rl = len(out)
                print(f'rl:: {rl}')
                c = 1
                for i in str(out):
                    if i == " " or i == "," or i == "\n":
                        c += 1
                print(f'c2:: {c}')
                rawp = compress_data_og(c, inp, out)
            else:
                rawp = out
    else:
        rawp = "Provide a valid data source"

    history.clear()
    history.append((inp, rawp))
    yield "", history, error_box, json_out


SAVE_MEMORY = """
You are attempting to complete the task
task: {task}
Data:
{history}
Instructions:
Compile and categorize the data above into a JSON dictionary string
Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format
Required keys:
"keywords":["short", "list", "of", "important", "keywords", "found", "in", "this", "entry"],
"title":"title of entry",
"description":"A sentence summarizing the topic of this entry",
"content":"A brief paragraph summarizing the important datapoints found in this entry",
"url":"https://url.source"
"""


def format_json(inp):
    new_json = []
    start_json = {}
    print("FORMATTING:::")
    for i, line in enumerate(inp):
        line = line.strip()
        if "{" in line:
            print(line)
            start_json = {}
            #print(f'test:: {line}')
        if "keywords" in line and ":" in line:
            start_json['keywords'] = line.split(":", 1)[1].strip(",")
            print(line)
        if "title" in line and ":" in line:
            start_json['title'] = line.split(":", 1)[1].strip(",")
            print(line)
        if "description" in line and ":" in line:
            start_json['description'] = line.split(":", 1)[1].strip(",")
            print(line)
        if "content" in line and ":" in line:
            start_json['content'] = line.split(":", 1)[1].strip(",")
            print(line)
        if "url" in line and ":" in line:
            start_json['url'] = line.split(":", 1)[1].strip(",")
            print(line)
        if "}" in line:
            new_json.append(start_json)
    print(new_json)
    '''
    Example of the line format this parser expects:
    "keywords": ["texas", "news", "breaking", "houston", "dallas", "shooting"],
    "title": "Breaking News from CBS11 - CBS Texas",
    "description": "The latest news and headlines from CBS Texas.",
    "content": "CBS Texas provides the latest news and headlines. The source url is https://www.cbsnews.com/texas/local-news/",
    "url": "http
    '''
    return new_json
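

# format_json above recovers dictionaries from the model's line-oriented output.
# When the model happens to return well-formed JSON, the same result can be had
# directly with json.loads; the sketch below shows that path and the entry shape
# SAVE_MEMORY asks for, with obviously placeholder values. The helper name
# `parse_memory_entries` is hypothetical and the app does not call it.
def parse_memory_entries(raw):
    """Parse a SAVE_MEMORY response into a list of dicts when it is valid JSON.

    Expected entry shape:
    {"keywords": ["example"], "title": "Example title",
     "description": "One sentence.", "content": "Short paragraph.",
     "url": "https://example.com"}
    """
    try:
        parsed = json.loads(raw)
        return parsed if isinstance(parsed, list) else [parsed]
    except json.JSONDecodeError:
        # fall back to the line-based parser above
        return format_json(raw.strip().split("\n"))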


def save_memory(purpose, history):
    uid = uuid.uuid4()
    history = str(history)
    c = 1
    inp = str(history)
    rl = len(inp)
    print(f'rl:: {rl}')
    for i in str(inp):
        if i == " " or i == "," or i == "\n" or i == "/" or i == "\\" or i == "." or i == "<":
            c += 1
    print(f'c:: {c}')
    seed = random.randint(1, 1000000000)
    print(c)
    #tot = len(purpose)
    #print(tot)
    divr = int(c) / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(int(c) / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    out_box = []
    #out = ""
    s = 0
    ee = chunk
    print(f'e:: {ee}')
    new_history = ""
    task = 'Index this Data\n'
    for z in range(divi):
        print(f's:e :: {s}:{ee}')
        hist = inp[s:ee]
        resp = run_gpt(
            SAVE_MEMORY,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=4096,
            seed=seed,
            purpose=purpose,
            task=task,
            history=hist,
        ).strip('\n')
        #new_history = resp
        #print (resp)
        #out += resp
        #print ("final1" + resp)
        try:
            # keep only the JSON array portion of the response, dropping anything after the end-of-sequence token
            resp = '[{' + resp.split('[{')[1].split('</s>')[0]
            #print ("final2\n" + resp)
            #print(f"keywords:: {resp['keywords']}")
        except Exception as e:
            resp = resp
            print(e)
        timestamp = str(datetime.datetime.now())
        timename = timestamp.replace(" ", "--").replace(":", "-").replace(".", "-")
        json_object = resp
        #json_object = json.dumps(out_box)
        #json_object = json.dumps(out_box, indent=4)
        with open(f"tmp-{uid}.json", "w") as outfile:
            outfile.write(json_object)
        api.upload_file(
            path_or_fileobj=f"tmp-{uid}.json",
            path_in_repo=f"/mem-test2/{timename}---{s}-{ee}.json",
            repo_id=reponame,
            #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
            token=token_self,
            repo_type="dataset",
        )
        lines = resp.strip().strip("\n").split("\n")
        format_json(lines)
        r = requests.get(f'{save_data}mem-test2/main.json')
        print(f'status code main:: {r.status_code}')
        try:
            print(f"KEYWORDS:: {json_object['keywords']}")
        except Exception as e:
            print(f"KEYWORDS:: {e}")
        if r.status_code == 200:
            lod = json.loads(r.text)
            #lod = eval(lod)
            print(f'lod:: {lod}')
        else:
            lod = []
        # accumulate keywords and description across lines until both are found for an entry
        key_box = []
        desc = ""
        for i, line in enumerate(lines):
            #print(f'LINE:: {line}')
            if ":" in line:
                print(f'line:: {line}')
            if "keywords" in line and ":" in line:
                print(f'trying:: {line}')
                keyw = line.split(":", 1)[1]
                print(keyw)
                print(keyw.split("[")[1].split("]")[0])
                keyw = keyw.split("[")[1].split("]")[0]
                for ea in keyw.split(","):
                    s1 = ""
                    ea = ea.strip().strip("\n")
                    for ev in ea:
                        if ev.isalnum():
                            s1 += ev
                        if ev == " ":
                            s1 += ev
                    #ea = s1
                    print(s1)
                    key_box.append(s1)
            if "description" in line and ":" in line:
                #print(f'trying:: {line}')
                desc = line.split(":", 1)[1]
            if key_box and desc:
                lod.append({"file_name": f"{timename}---{s}-{ee}", "keywords": key_box, "description": str(desc), "index": f"{s}:{ee}"})
                key_box = []
                desc = ""
        if lod:
            json_object = json.dumps(lod, indent=4)
            with open(f"tmp2-{uid}.json", "w") as outfile2:
                outfile2.write(json_object)
            api.upload_file(
                path_or_fileobj=f"tmp2-{uid}.json",
                path_in_repo="/mem-test2/main.json",
                repo_id=reponame,
                #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
                token=token_self,
                repo_type="dataset",
            )
        ee = ee + chunk
        s = s + chunk
        out_box.append(resp)
    return out_box


#################################

def clear_fn():
    return "", [(None, None)]
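

# save_memory appends {"file_name", "keywords", "description", "index"} records
# to mem-test2/main.json in the dataset repo. The helper below is a minimal
# sketch of how that index could be searched later; the name
# `search_memory_index` is hypothetical and the UI below does not wire it in.
def search_memory_index(keyword):
    """Return index records whose keyword list contains `keyword` (case-insensitive)."""
    r = requests.get(f'{save_data}mem-test2/main.json')
    if r.status_code != 200:
        return []
    records = json.loads(r.text)
    keyword = keyword.lower()
    return [rec for rec in records
            if any(keyword in kw.lower() for kw in rec.get("keywords", []))]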

with gr.Blocks() as app:
    gr.HTML("""<center>
    <h1>Mixtral 8x7B TLDR Summarizer + Web</h1>
    <h3>Summarize Data of unlimited length</h3>
    </center>""")
    chatbot = gr.Chatbot(label="Mixtral 8x7B Chatbot", show_copy_button=True)
    with gr.Row():
        with gr.Column(scale=3):
            prompt = gr.Textbox(label="Instructions (optional)")
        with gr.Column(scale=1):
            report_check = gr.Checkbox(label="Return Report", value=True)
            sum_check = gr.Checkbox(label="Summarize", value=True)
            mem_check = gr.Checkbox(label="Memory", value=True)
            #sum_mem_check = gr.Radio(label="Output", choices=["Summary", "Memory"])
            button = gr.Button()
    #models_dd = gr.Dropdown(choices=[m for m in return_list], interactive=True)
    with gr.Row():
        stop_button = gr.Button("Stop")
        clear_btn = gr.Button("Clear")
    with gr.Row():
        with gr.Tab("Text"):
            data = gr.Textbox(label="Input Data (paste text)", lines=6)
        with gr.Tab("File"):
            file = gr.Files(label="Input File(s) (.pdf .txt)")
        with gr.Tab("Raw HTML"):
            url = gr.Textbox(label="URL")
        with gr.Tab("PDF URL"):
            pdf_url = gr.Textbox(label="PDF URL")
        with gr.Tab("PDF Batch"):
            pdf_batch = gr.Textbox(label="PDF URL Batch (comma separated)")
    json_out = gr.JSON()
    e_box = gr.Textbox()
    #text = gr.JSON()
    #inp_query.change(search_models, inp_query, models_dd)
    clear_btn.click(clear_fn, None, [prompt, chatbot])
    go = button.click(summarize, [prompt, chatbot, report_check, sum_check, mem_check, data, file, url, pdf_url, pdf_batch], [prompt, chatbot, e_box, json_out])
    stop_button.click(None, None, None, cancels=[go])

app.queue(default_concurrency_limit=20).launch(show_api=False)