file-indexing

Sleeping

File size: 23,297 Bytes

import gradio as gr
#import urllib.request
import requests
import bs4
import lxml
import os
#import subprocess
from huggingface_hub import InferenceClient,HfApi
import random
import json
import datetime
from pypdf import PdfReader
import uuid
#from query import tasks
from agent import (
    PREFIX,
    COMPRESS_DATA_PROMPT,
    COMPRESS_DATA_PROMPT_SMALL,
    LOG_PROMPT,
    LOG_RESPONSE,
)
# Shared Hugging Face inference client; run_gpt and get_mem stream their
# generations through it.
client = InferenceClient(
    "mistralai/Mixtral-8x7B-Instruct-v0.1"
)
# Dataset repository used as remote storage, and the raw-file URL prefix used
# to read the stored JSON files back over HTTP.
reponame="Omnibus/tmp"
save_data=f'https://huggingface.co/datasets/{reponame}/raw/main/'
# Read at import time: a missing HF_TOKEN raises KeyError before the app starts.
token_self = os.environ['HF_TOKEN']
api=HfApi(token=token_self)

def find_all(url):
    """Fetch ``url`` and return its text together with every anchor on the page.

    Args:
        url: Address to download. ``None`` and ``""`` are rejected.

    Returns:
        tuple: ``(True, text)`` on HTTP 200, where ``text`` is the string form
        of a list holding the raw page text, the marker ``"HTML fragments: "``
        and one single-element list per ``<a>`` tag (title, href, string).
        ``(False, message)`` when the URL is missing, the status is not 200,
        or any exception is raised while fetching/parsing.
    """
    print (url)
    print (f"trying URL:: {url}")
    if url is None or url == "":
        print('passing')
        return False, "Enter Valid URL"
    try:
        # A timeout keeps an unresponsive host from blocking the worker
        # forever; a timeout surfaces through the generic error path below.
        source = requests.get(url, timeout=30)
        print(source.status_code)
        if source.status_code != 200:
            return False, f'Status:: {source.status_code}'
        print('trying')
        soup = bs4.BeautifulSoup(source.content,'lxml')

        rawp = f'RAW TEXT RETURNED: {soup.text}'
        print (rawp)
        out = [rawp, "HTML fragments: "]
        for link in soup.find_all("a"):
            out.append([{"LINK TITLE":link.get('title'),"URL":link.get('href'),"STRING":link.string}])
        out = str(out)
        print(f'rl:: {len(out)}')
        # Diagnostic only: number of separator characters in the payload.
        separators = sum(1 for ch in out if ch in " ,\n/.<")
        print (f'c:: {separators}')
        return True, out
    except Exception as e:
        print (e)
        return False, f'Error: {e}'


def read_txt(txt_path):
    """Return the full contents of the text file at ``txt_path``.

    The file is opened in text mode with the platform default encoding and
    its contents are echoed to stdout before being returned.
    """
    with open(txt_path, "r") as handle:
        contents = handle.read()
    print (contents)
    return contents

def read_pdf(pdf_path):
    """Extract the text of every page of the PDF at ``pdf_path``.

    Each page's extracted text is preceded by a newline, so the pages are
    newline-separated and a non-empty document starts with a newline. The
    combined text is echoed to stdout and returned.
    """
    document = PdfReader(f'{pdf_path}')
    text = "".join(f'\n{page.extract_text()}' for page in document.pages)
    print (text)
    return text

error_box=[]
def read_pdf_online(url):
    """Download the PDF at ``url`` and return its extracted text.

    Args:
        url: HTTP(S) address of the PDF.

    Returns:
        str: Page texts, each preceded by a newline, on HTTP 200.
        int: The HTTP status code for any other status.
        Exception: The caught exception when downloading or parsing fails
            (the exception object is returned, not raised).

    Side effects:
        Appends ``url`` to the module-level ``error_box`` list whenever the
        PDF could not be read.
    """
    import io  # local import: only this function needs it

    print(f"reading {url}")
    try:
        # The request lives inside the try so connection errors take the same
        # "return the error" path as parse errors; the timeout stops a stalled
        # server from hanging the worker.
        response = requests.get(url, stream=True, timeout=60)
        print(response.status_code)
        if response.status_code != 200:
            error_box.append(url)
            return response.status_code
        # Parse from memory rather than a shared "test.pdf" on disk: with
        # several requests in flight, each would overwrite the other's file.
        reader = PdfReader(io.BytesIO(response.content))
        print(len(reader.pages))
        text = "".join(f'\n{page.extract_text()}' for page in reader.pages)
        print(f"PDF_TEXT:: {text}")
        return text
    except Exception as e:
        print (e)
        error_box.append(url)
        return e


# When True, run_gpt prints the full prompt and the model response.
VERBOSE = True
# Only referenced from a commented-out check in find_all in the visible code.
MAX_HISTORY = 100
# Divisor used by compress_data, compress_data_og and save_memory to decide
# how many slices the input is split into.
MAX_DATA = 20000

def format_prompt(message, history):
  """Render ``history`` and a new ``message`` as an ``[INST]`` prompt string.

  ``history`` is an iterable of ``(user_prompt, bot_response)`` pairs. The
  result starts with ``<s>``, closes every past turn with ``</s> `` and ends
  with the new message wrapped in ``[INST] ... [/INST]``.
  """
  past_turns = [
    f"[INST] {user_prompt} [/INST] {bot_response}</s> "
    for user_prompt, bot_response in history
  ]
  return "<s>" + "".join(past_turns) + f"[INST] {message} [/INST]"



def run_gpt(
    prompt_template,
    stop_tokens,
    max_tokens,
    seed,
    **prompt_kwargs,
):
    """Format a prompt, stream a completion from ``client`` and return its text.

    Args:
        prompt_template: ``str.format`` template filled with ``prompt_kwargs``.
        stop_tokens: Accepted for interface compatibility; not used here.
        max_tokens: Passed to the model as ``max_new_tokens``.
        seed: Sampling seed forwarded to the model.
        **prompt_kwargs: Values substituted into ``prompt_template``.

    Returns:
        str: The streamed token texts concatenated in arrival order.
    """
    print(seed)
    now = datetime.datetime.now()

    # PREFIX supplies the shared preamble; the task-specific template follows.
    preamble = PREFIX.format(
        timestamp=now,
        purpose="Compile the provided data and complete the users task",
    )
    content = preamble + prompt_template.format(**prompt_kwargs)
    if VERBOSE:
        print(LOG_PROMPT.format(content))

    sampling = {
        "temperature": 0.9,
        "max_new_tokens": max_tokens,
        "top_p": 0.95,
        "repetition_penalty": 1.0,
        "do_sample": True,
        "seed": seed,
    }
    stream = client.text_generation(content, **sampling, stream=True, details=True, return_full_text=False)
    resp = "".join(piece.token.text for piece in stream)

    if VERBOSE:
        print(LOG_RESPONSE.format(resp))
    return resp

    
def compress_data(c, instruct, history):
    """Summarise ``history`` slice by slice and return the list of summaries.

    Args:
        c: Size estimate of ``history``; ``c / MAX_DATA`` rounded up gives the
            number of slices.
        instruct: Passed to the prompt as ``direction``.
        history: Sliceable data (a string in the visible caller); consecutive
            ``history[s:e]`` windows are sent to the model.

    Returns:
        list: One model response per slice, in order.

    NOTE(review): the slice count is derived from ``c`` while the windows are
    taken by index into ``history``. If ``c`` is a word count and ``history``
    a string, text beyond ``divi * chunk`` characters is never sent - confirm
    whether that is intended.
    """
    seed = random.randint(1,1000000000)

    print (c)
    total = int(c)
    divr = total / MAX_DATA
    whole = int(divr)
    # Round up: a partial final slice still needs its own pass.
    divi = whole if divr == whole else whole + 1
    chunk = int(total / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print (f'divi:: {divi}')
    print(f'e:: {chunk}')

    out = []
    for index in range(divi):
        s = index * chunk
        e = s + chunk
        print(f's:e :: {s}:{e}')

        resp = run_gpt(
            COMPRESS_DATA_PROMPT_SMALL,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge="",
            history=history[s:e],
        )
        out.append(resp)
        print (resp)
    return out

    
def compress_data_og(c, instruct, history):
    """Fold ``history`` into one running summary and return the final response.

    Works like ``compress_data`` but each pass receives the previous pass's
    response as ``knowledge``, so only the last response is returned.

    Args:
        c: Size estimate of ``history``; ``c / MAX_DATA`` rounded up gives the
            number of passes.
        instruct: Passed to the prompt as ``direction``.
        history: Sliceable data; consecutive ``history[s:e]`` windows are sent
            to the model.

    Returns:
        str: The model response of the last pass.
    """
    seed = random.randint(1,1000000000)

    print (c)
    total = int(c)
    divr = total / MAX_DATA
    whole = int(divr)
    # Round up: a partial final slice still needs its own pass.
    divi = whole if divr == whole else whole + 1
    chunk = int(total / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print (f'divi:: {divi}')
    print(f'e:: {chunk}')

    carried = ""
    for index in range(divi):
        s = index * chunk
        e = s + chunk
        print(f's:e :: {s}:{e}')

        resp = run_gpt(
            COMPRESS_DATA_PROMPT,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge=carried,
            history=history[s:e],
        )

        # Feed this pass's summary into the next pass.
        carried = resp
        print (resp)
    print ("final" + resp)
    return resp


# Prompt template for get_mem: filled via str.format with ``prompt`` (the user
# query) and ``keywords`` (the candidate keyword list).
RECALL_MEMORY="""The user will give you a query and a list
Your duty is to choose the words from the list that are closely related to the search query.
If there are no relevant keywords found in the provided list return 'NONE'
Respond with only a list, or NONE
Respond only in this format:
[keyword1,keyword2,keyword3]

USER QUERY:
{prompt}

KEYWORD LIST:
{keywords}
"""




def get_mem(prompt,kw):
    """Ask the model which entries of ``kw`` relate to the query ``prompt``.

    Fills ``RECALL_MEMORY`` with the query and keyword collection, streams a
    completion with a fresh random seed, prints it and returns the raw text.
    """
    seed = random.randint(1,1000000000)
    content = RECALL_MEMORY.format(keywords=kw,prompt=prompt)

    sampling = {
        "temperature": 0.6,
        "max_new_tokens": 1024,
        "top_p": 0.6,
        "repetition_penalty": 1.0,
        "do_sample": True,
        "seed": seed,
    }
    stream = client.text_generation(content, **sampling, stream=True, details=True, return_full_text=False)
    resp = "".join(piece.token.text for piece in stream)

    print (resp)
    return resp

    
def _count_words(text):
    """Approximate a word count: one plus the number of spaces, commas and newlines."""
    return 1 + sum(text.count(sep) for sep in (" ", ",", "\n"))


def summarize(inp,history,report_check,sum_check,mem_check,data=None,files=None,url=None,pdf_url=None,pdf_batch=None):
    """Gather data from the selected source(s) and summarise and/or index it.

    This is a generator used as a Gradio event handler. It first yields a
    "Working on it..." placeholder, then the final result. Every yield is the
    4-tuple ``(prompt_text, chat_history, error_box, json_out)``.

    Source precedence (later ones replace earlier ones): pasted ``data``,
    ``pdf_batch`` (comma separated PDF URLs), ``pdf_url``, ``url`` (HTML page).
    Uploaded ``files`` (.pdf / .txt) are appended to whatever was collected.

    Args:
        inp: Instruction text; defaults to "Process this data" when empty.
        history: Chat history list supplied by the UI; cleared and replaced.
        report_check: With ``sum_check``, condense the summaries into a report.
        sum_check: Summarise the data with ``compress_data``.
        mem_check: Index the data with ``save_memory``.
        data, files, url, pdf_url, pdf_batch: Data sources, see above. ``None``
            is treated like an empty value.
    """
    # Defaults make every path below yield something instead of hitting an
    # unbound local (invalid source, or both Summarize and Memory unticked).
    json_out = []
    rawp = "Provide a valid data source"
    fetch_error = ""

    if not inp:
        inp = "Process this data"
    # Unset inputs can arrive as None; normalise before any str method is used.
    data = data or ""
    url = url or ""
    pdf_url = pdf_url or ""
    pdf_batch = pdf_batch or ""

    if history is not None:
        history.clear()
    history = [(inp,"Working on it...")]
    yield "",history,error_box,json_out

    if pdf_batch.startswith("http"):
        data=""
        for batch_url in pdf_batch.split(","):
            batch_url = batch_url.strip()
            if not batch_url:
                continue
            # Per-URL handling: one unreachable file must not drop the rest.
            try:
                bb = read_pdf_online(batch_url)
                data=f'{data}\nFile Name URL ({batch_url}):\n{bb}'
            except Exception as e:
                print(e)
                data=f'{data}\nError reading URL ({batch_url})'
    if pdf_url.startswith("http"):
        print("PDF_URL")
        try:
            out = read_pdf_online(pdf_url)
        except Exception as e:
            out = e
        if isinstance(out, str):
            data=out
        else:
            # read_pdf_online reports failures as a status code or exception
            # object; do not feed those to the model as if they were content.
            data="Error"
            fetch_error = f'Error reading PDF URL: {out}'
    if url.startswith("http"):
        val, out = find_all(url)
        if not val:
            data="Error"
            fetch_error = str(out)
        else:
            data=out
    if files:
        for file in files:
            try:
                print (file)
                if file.endswith(".pdf"):
                    zz=read_pdf(file)
                    print (zz)
                    data=f'{data}\nFile Name ({file}):\n{zz}'
                elif file.endswith(".txt"):
                    zz=read_txt(file)
                    print (zz)
                    data=f'{data}\nFile Name ({file}):\n{zz}'
            except Exception as e:
                data=f'{data}\nError opening File Name ({file})'
                print (e)

    if data != "Error" and data != "":
        print(inp)
        out = str(data)
        print(f'rl:: {len(out)}')
        c = _count_words(out)
        print (f'c:: {c}')
        rawp = "Select Summarize and/or Memory to process the data"
        if mem_check:
            json_out = save_memory(inp,out)
            rawp = "Complete"
        if sum_check:
            json_out = compress_data(c,inp,out)

            out = str(json_out)
            if report_check:
                print(f'rl:: {len(out)}')
                c = _count_words(out)
                print (f'c2:: {c}')
                rawp = compress_data_og(c,inp,out)
            else:
                rawp = out
        if mem_check or sum_check:
            # Model output is not guaranteed to parse; fall back to showing
            # the raw responses rather than aborting the whole request.
            try:
                json_out = format_json(json_out)
            except Exception as e:
                print(f'format_json failed:: {e}')
    elif fetch_error:
        # Surface why the source could not be read instead of discarding it.
        rawp = f"Provide a valid data source ({fetch_error})"

    history.clear()
    history.append((inp,rawp))
    yield "", history,error_box,json_out
# Prompt template used by save_memory: filled via str.format with ``task`` and
# ``history`` (the slice of data to index).
SAVE_MEMORY = """
You are attempting to complete the task
task: {task}
Data:
{history}
Instructions:
Compile and categorize the data above into a JSON dictionary string
Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format
Required keys:
"keywords":["short", "list", "of", "important", "keywords", "found", "in", "this", "entry"],
"title":"title of entry",
"description":"A sentence summarizing the topic of this entry",
"content":"A brief paragraph summarizing the important datapoints found in this entry",
"url":"https://url.source"
"""




def format_json(inp):
    """Join model output lines and parse them into a Python object.

    Args:
        inp: Iterable of text lines (or one string, which is split into
            lines). Markdown fence lines such as ```` ``` ```` / ```` ```json ````
            are dropped, and backticks, ``#`` and ``/`` are stripped from the
            ends of every remaining line before the lines are concatenated.

    Returns:
        The parsed value (usually a dict or list).

    Raises:
        ValueError / SyntaxError: When the text is neither valid JSON nor a
            Python literal.
    """
    import ast  # local import: only needed for the literal fallback

    print("FORMATTING:::")
    print(type(inp))
    print("###########")
    print(inp)
    print("###########")
    print("###########")
    if isinstance(inp, str):
        # Iterating a str yields characters; split into real lines instead.
        inp = inp.splitlines()

    pieces = []
    for line in inp:
        line = line.strip()
        print(line)
        fence_tag = line.strip("`")
        if line.startswith("```") and (not fence_tag or fence_tag.isalnum()):
            # Pure fence line, optionally with a language tag: not payload.
            continue
        pieces.append(line.strip("`").strip("#").strip("/"))
    print("###########")
    print("###########")

    new_str = "".join(pieces).strip()
    # Remove the literal end-of-sequence token. str.strip("</s>") would strip
    # any of the characters '<', '/', 's', '>' from both ends instead.
    while new_str.endswith("</s>"):
        new_str = new_str[:-len("</s>")].rstrip()

    # SECURITY: this text is model output derived from user supplied pages and
    # files, so it must never reach eval(). json.loads handles proper JSON;
    # ast.literal_eval covers the Python-literal style (single quotes,
    # True/None) that eval() used to accept, without executing code.
    try:
        out_json = json.loads(new_str)
    except ValueError:
        out_json = ast.literal_eval(new_str)
    print(out_json)
    print("###########")
    print("###########")

    return out_json




def format_json_og(inp):
    """Loosely parse ``"key": value`` lines into a list of dicts.

    A line containing ``{`` starts a new entry and a line containing ``}``
    appends the current entry to the result. In between, a line whose key
    (the text before the first colon, ignoring quotes and a leading brace) is
    one of keywords/title/description/content/url stores everything after that
    first colon, with surrounding commas removed, under that key. Values are
    kept as raw text (quotes and leading spaces included).

    Args:
        inp: Iterable of text lines.

    Returns:
        list[dict]: One dict per closed entry.
    """
    fields = ("keywords", "title", "description", "content", "url")
    new_json=[]
    start_json={}
    print("FORMATTING:::")
    for line in inp:
        line = line.strip()
        if "{" in line:
            print (line)
            start_json={}
        if ":" in line:
            # Split on the FIRST colon only, so values that contain colons
            # (URLs, times, prose) are kept whole.
            head, value = line.split(":", 1)
            # Match on the actual key rather than a substring of the line, so
            # e.g. a content line mentioning "url" is not stored as the url.
            key = head.strip().lstrip("{[, ").strip("\"' ")
            if key in fields:
                start_json[key] = value.strip(",")
                print (line)

        if "}" in line:
            new_json.append(start_json)
            print (new_json)
    return new_json

def create_index():
    """Rebuild the keyword -> file-name index and upload it as index.json.

    Downloads ``mem-test2/index.json`` (the existing index, a one-element list
    wrapping a dict) and ``mem-test2/main.json`` (the list of stored entries),
    adds every entry's ``file_name`` under each of its ``keywords``, and
    uploads the merged index back to the dataset repository.

    Raises:
        requests.RequestException: If either download fails at network level.
        Exception: Whatever ``api.upload_file`` raises on upload failure.
    """
    uid=uuid.uuid4()

    ####### load index ###############
    r = requests.get(f'{save_data}mem-test2/index.json', timeout=30)
    print(f'status code main:: {r.status_code}')
    if r.status_code==200:
        ind = json.loads(r.text)
        print (f'ind::\n{ind}')
    else:
        print("Create new IND")
        ind = [{}]
    # Everything below addresses the index as ind[0][keyword]; start over if
    # the stored file has any other shape.
    if not (isinstance(ind, list) and ind and isinstance(ind[0], dict)):
        ind = [{}]
    keyword_map = ind[0]

    ####### load main ###############
    m = requests.get(f'{save_data}mem-test2/main.json', timeout=30)
    print(f'status code main:: {m.status_code}')
    # The fallback must test m (main.json), not r (index.json); otherwise
    # `main` stays undefined when only main.json is missing.
    main = json.loads(m.text) if m.status_code==200 else []

    for ea in main:
        try:
            print(f"KEYWORDS:: {ea['keywords']}")
            for k in ea['keywords']:
                print(k)
                print(ea['file_name'])
                if k in keyword_map:
                    print("Adding to list")
                    if ea['file_name'] not in keyword_map[k]:
                        keyword_map[k].append(ea['file_name'])
                else:
                    print("Adding new Value")
                    keyword_map[k] = [ea['file_name']]
        except Exception as e:
            # Best effort: a malformed entry is logged and skipped.
            print (e)

    tmp_path = f"tmp3-{uid}.json"
    with open(tmp_path, "w") as outfile3:
        outfile3.write(json.dumps(ind, indent=4))
    try:
        api.upload_file(
            path_or_fileobj=tmp_path,
            path_in_repo=f"/mem-test2/index.json",
            repo_id=reponame,
            #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
            token=token_self,
            repo_type="dataset",
        )
    finally:
        # The temp file is only a staging copy; do not leave one per call.
        os.remove(tmp_path)

        
    
def save_memory(purpose, history):
    """Index ``history`` with the model and persist the results remotely.

    The text is processed in consecutive slices. For each slice the function:
    asks the model (SAVE_MEMORY prompt) for an indexed description, uploads
    the raw response as ``mem-test2/<timestamp>---<s>-<ee>.json``, downloads
    ``mem-test2/main.json``, scans the response line by line for "keywords"
    and "description", appends one entry per keywords/description pair and
    re-uploads main.json, then calls ``create_index()``.

    Args:
        purpose: Forwarded to run_gpt as the ``purpose`` format argument.
        history: Data to index; converted with ``str()``.

    Returns:
        list: The (trimmed) model response for every slice.
    """
    uid=uuid.uuid4()
    history=str(history)
    c=1
    inp = str(history)
    rl = len(inp)
    print(f'rl:: {rl}')
    # c = 1 + number of separator characters in the text.
    for i in str(inp):
        if i == " " or i=="," or i=="\n" or i=="/" or i=="\\" or i=="." or i=="<":
            c +=1
    print (f'c:: {c}')

    seed=random.randint(1,1000000000)
    
    print (c)
    #tot=len(purpose)
    #print(tot)
    # divi = c / MAX_DATA rounded up = number of slices to process.
    # NOTE(review): c counts separators but the slices below index characters
    # of `inp`; text past divi*chunk characters looks like it is never
    # processed - confirm whether that is intended.
    divr=int(c)/MAX_DATA
    divi=int(divr)+1 if divr != int(divr) else int(divr)
    chunk = int(int(c)/divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print (f'divi:: {divi}')
    out_box = []
    #out=""
    s=0
    ee=chunk
    print(f'e:: {ee}')
    new_history=""
    task = f'Index this Data\n'
    for z in range(divi):
        print(f's:e :: {s}:{ee}')
        
        hist = inp[s:ee]
        
        # `purpose` is passed as a format argument, but the SAVE_MEMORY
        # template only has {task} and {history} placeholders, so str.format
        # ignores it.
        resp = run_gpt(
            SAVE_MEMORY,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=4096,
            seed=seed,
            purpose=purpose,
            task=task,
            history=hist,
        ).strip('\n')
        #new_history = resp
        #print (resp)
        #out+=resp

        #print ("final1" + resp)
        # Keep only the text from the first '[{' up to the first '</s>'. When
        # the response has no '[{' the IndexError is caught and the response
        # is used unchanged.
        try:
            resp='[{'+resp.split('[{')[1].split('</s>')[0]
            #print ("final2\n" + resp)
            #print(f"keywords:: {resp['keywords']}")
        except Exception as e:
            resp = resp
            print(e)
        # Timestamp made file-name friendly; used in the uploaded file's name.
        timestamp=str(datetime.datetime.now())
        timename=timestamp.replace(" ","--").replace(":","-").replace(".","-")
        json_object=resp
        #json_object = json.dumps(out_box)
        #json_object = json.dumps(out_box,indent=4)
        # Stage the raw response in a local temp file, then upload it.
        # NOTE(review): the temp files written here are not removed afterwards.
        with open(f"tmp-{uid}.json", "w") as outfile:
            outfile.write(json_object)
            
        # Redundant: the with-block above has already closed the file.
        outfile.close()
        api.upload_file(
        path_or_fileobj=f"tmp-{uid}.json",
        path_in_repo=f"/mem-test2/{timename}---{s}-{ee}.json",
        repo_id=reponame,
        #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
        token=token_self,
        repo_type="dataset",
        )
        lines = resp.strip().strip("\n").split("\n")
        #formatted_json=format_json(lines)

        # Load the current list of stored entries (or start a new one).
        r = requests.get(f'{save_data}mem-test2/main.json') 
        print(f'status code main:: {r.status_code}')
        # json_object is a str at this point, so indexing it with a string key
        # raises TypeError and the except branch prints the error instead.
        try:
            print(f"KEYWORDS:: {json_object['keywords']}")
        except Exception as e:
            print(f"KEYWORDS:: {e}")
        if r.status_code==200:
            
            lod = json.loads(r.text)
            #lod = eval(lod)
            print (f'lod:: {lod}')
        if not r.status_code==200:
            lod = []
        key_box=[]
        desc=""
        # Scan the response text line by line for keywords and a description.
        for i,line in enumerate(lines):

            #print(f'LINE:: {line}')
            if ":" in line:
                print(f'line:: {line}')
            
            if "keywords" in line and ":" in line:
                print(f'trying:: {line}')
                keyw=line.split(":")[1]
                print (keyw)
                try:
                    print (keyw.split("[")[1].split("]")[0])
                    keyw=keyw.split("[")[1].split("]")[0]
                    # Keep only alphanumeric characters and spaces of each keyword.
                    for ea in keyw.split(","):
                        s1=""
                        ea=ea.strip().strip("\n")
                        for ev in ea:
                            if ev.isalnum():
                                s1+=ev
                            if ev == " ":
                                s1+=ev
                            #ea=s1
                        print(s1)
                        key_box.append(s1)
                except Exception as e:
                    print(f'ERROR SAVING KEYWORD:: {e}')
            if "description" in line and ":" in line:
                #print(f'trying:: {line}')
                # Only the text between the first and second colon is kept.
                desc=line.split(":")[1]

            # Once both keywords and a description have been seen, record an
            # entry and upload the updated main.json immediately.
            if key_box and desc:
                lod.append({"file_name":f"{timename}---{s}-{ee}","keywords":key_box,"description":str(desc),"index":f"{s}:{ee}"})
                key_box = []
                desc=""
                json_object = json.dumps(lod, indent=4)
                with open(f"tmp2-{uid}.json", "w") as outfile2:
                    outfile2.write(json_object)
                # Redundant: already closed by the with-block.
                outfile2.close()
                api.upload_file(
                path_or_fileobj=f"tmp2-{uid}.json",
                path_in_repo=f"/mem-test2/main.json",
                repo_id=reponame,
                #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
                token=token_self,
                repo_type="dataset",
                )

        # Advance the window, keep the response, and rebuild the keyword index
        # after every slice.
        ee=ee+chunk
        s=s+chunk       
        out_box.append(resp)
        create_index()
    return out_box

def valid_list(inp):
    """Parse a bracketed, comma separated keyword reply into a list of strings.

    Args:
        inp: Usually the model reply, e.g. ``"[alpha, 'beta']"``. The text
            between the first ``[`` and the following ``]`` is split on
            commas; quotes and surrounding whitespace are removed and empty
            items are dropped. A list or tuple is returned as a list of
            strings.

    Returns:
        list[str]: The keywords; empty when the input has no ``[`` (for
        example the reply ``NONE``) or is of an unsupported type.
    """
    print(type(inp))
    if isinstance(inp, (list, tuple)):
        return [str(item) for item in inp]
    if not isinstance(inp, str):
        return []
    print("STRING")
    if "[" not in inp:
        # Replies such as "NONE" carry no list; this used to raise IndexError.
        return []
    body = inp.split("[", 1)[1].split("]", 1)[0]
    print(body)
    cleaned = (item.replace("'", "").replace('"', "").strip() for item in body.split(","))
    out_list = [item for item in cleaned if item]
    print(out_list)
    return out_list
    
    
def recall_memory(inp,history=None):
    """Look up stored keywords related to the query ``inp``.

    Generator used as a Gradio event handler; it yields exactly one 4-tuple
    ``(prompt_text, chat_history, error_text, json_out)``.

    Args:
        inp: The user's query.
        history: Existing chat history list; optional so the handler also
            works when only the query is wired as an input.
    """
    error_box=""
    json_out={}
    if not history:
        history=[]
    r = requests.get(f'{save_data}mem-test2/index.json', timeout=30)
    print(f'status code main:: {r.status_code}')
    if r.status_code != 200:
        print("Create new IND")
        out="MEMORY FILE NOT FOUND"
        # This function is a generator, so a returned value would never reach
        # the UI; the message has to be yielded in the same 4-slot shape.
        history.clear()
        history.append((inp,out))
        yield "", history, out, json_out
        return
    mem = json.loads(r.text)
    print (f'ind::\n{mem}')
    mem_keys = mem[0].keys()
    rawp = get_mem(inp,mem_keys)
    # Parsing is diagnostic only here; a reply without brackets (e.g. "NONE")
    # must not abort the lookup.
    try:
        valid_list(rawp)
    except (IndexError, AttributeError) as e:
        print(f'valid_list failed:: {e}')
    #valid_list(["123","333"])

    history.clear()
    history.append((inp,rawp))
    yield "", history,error_box,json_out


    
#################################
def clear_fn():
    """Return the values that reset the prompt box and the chat window."""
    empty_prompt = ""
    empty_chat = [(None, None)]
    return empty_prompt, empty_chat

# Gradio UI: a chat window, instruction box and option checkboxes on top, one
# tab per data source below, then the JSON result and the error text box.
with gr.Blocks() as app:
    gr.HTML("""<center><h1>Mixtral 8x7B TLDR Summarizer + Web</h1><h3>Summarize Data of unlimited length</h3>""")
    chatbot = gr.Chatbot(label="Mixtral 8x7B Chatbot",show_copy_button=True)
    with gr.Row():
        with gr.Column(scale=3):
            prompt=gr.Textbox(label = "Instructions (optional)")
        with gr.Column(scale=1):
            report_check=gr.Checkbox(label="Return Report", value=True)
            sum_check=gr.Checkbox(label="Summarize", value=True)
            mem_check=gr.Checkbox(label="Memory", value=True)
            #sum_mem_check=gr.Radio(label="Output",choices=["Summary","Memory"])
            button=gr.Button()
        
        #models_dd=gr.Dropdown(choices=[m for m in return_list],interactive=True)
    with gr.Row():
        stop_button=gr.Button("Stop")
        clear_btn = gr.Button("Clear")
    with gr.Row():
        with gr.Tab("Text"):
            data=gr.Textbox(label="Input Data (paste text)", lines=6)
        with gr.Tab("File"):
            file=gr.Files(label="Input File(s) (.pdf .txt)")
        with gr.Tab("Raw HTML"):
            url = gr.Textbox(label="URL")
        with gr.Tab("PDF URL"):
            pdf_url = gr.Textbox(label="PDF URL")       
        with gr.Tab("PDF Batch"):
            pdf_batch = gr.Textbox(label="PDF URL Batch (comma separated)")
        with gr.Tab("Memory"):
            mem_inp = gr.Textbox(label="Query")
            mem = gr.Button()
    json_out=gr.JSON()
    e_box=gr.Textbox()

    # recall_memory takes (inp, history); pass the chat history as well so the
    # handler receives both of its arguments.
    mem.click(recall_memory,[mem_inp,chatbot],[prompt,chatbot,e_box,json_out])
    #text=gr.JSON()
    #inp_query.change(search_models,inp_query,models_dd)
    clear_btn.click(clear_fn,None,[prompt,chatbot])
    go=button.click(summarize,[prompt,chatbot,report_check,sum_check,mem_check,data,file,url,pdf_url,pdf_batch],[prompt,chatbot,e_box,json_out])
    # The Stop button cancels a running summarize job.
    stop_button.click(None,None,None,cancels=[go])
# Runs at import time: starts the queue and serves the app.
app.queue(default_concurrency_limit=20).launch(show_api=False)