gdrive-illustration-search

Running

File size: 14,076 Bytes

fb7f2e1
e54af7b
 
06c335d
e54af7b
 
df4c3d4
 
 
 
 
 
9da3be7
e54af7b
 
df4c3d4
72ccdcf
3beb244
2d44025
fb7f2e1
2d44025
 
 
34f79f7
2d44025
 
 
 
7c77316
df4c3d4
2d44025
fb7f2e1
 
 
 
 
 
df4c3d4
2d44025
 
 
9da3be7
2d44025
 
fb7f2e1
2d44025
 
 
 
 
 
 
 
 
fb7f2e1
 
 
 
 
 
 
2d44025
 
 
fb7f2e1
 
2d44025
 
 
 
fb7f2e1
 
 
 
 
2d44025
fb7f2e1
2d44025
fb7f2e1
 
 
2d44025
 
 
 
 
fb7f2e1
 
 
 
 
2d44025
fb7f2e1
2d44025
 
fb7f2e1
 
 
 
2d44025
fb7f2e1
 
2d44025
b9cabd2
 
 
fb7f2e1
 
 
 
2d44025
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e4e0ca7
 
fb7f2e1
 
 
 
e4e0ca7
 
2d44025
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff114d8
df4c3d4
2d44025
 
fb7f2e1
 
2d44025
 
fb7f2e1
 
 
2d44025
df4c3d4
 
1267ef7
df4c3d4
2d44025
117ade9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb7f2e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117ade9
2d44025
117ade9
2d44025
 
 
dd3d2d9
2d44025
 
 
 
 
 
 
 
 
e4e0ca7
2d44025
fb7f2e1
2d44025
fb7f2e1
117ade9
fb7f2e1
 
 
df4c3d4
 
fb7f2e1
2d44025
 
e54af7b
 
2d44025
df4c3d4
 
 
 
 
 
 
 
 
 
 
e54af7b
df4c3d4
 
 
 
 
 
 
 
 
e54af7b
df4c3d4
 
 
 
fb7f2e1

import functions as funky # need to enable this for Hugging Face
import pandas as pd
import gradio as gr
import os
from datasets import load_dataset
from huggingface_hub import login
import numpy as np
from fastapi import FastAPI, Request
import uvicorn
from starlette.middleware.sessions import SessionMiddleware
import fastapi
from datetime import datetime
import re

# Authenticate this process against the Hugging Face Hub with the token
# stored in the HUB_TOKEN environment variable (KeyError if it is unset).
login(token = os.environ['HUB_TOKEN'])

# Logger that stores flagged rows in the private Hub dataset
# 'illustration_gdrive_logging_main'.
logger = gr.HuggingFaceDatasetSaver(os.environ['HUB_TOKEN'], dataset_name='illustration_gdrive_logging_main', organization=None, private=True)
# Every logged row has four text fields, in this order: clicked URL, search
# term, session hash, timestamp. Local staging directory: ./flagged_data_points.
# NOTE(review): the label "seach_term" looks like a typo of "search_term", but
# it is a runtime string that presumably names a column in the logging
# dataset -- confirm before renaming it.
logger.setup([gr.Text(label="clicked_url"), gr.Text(label="seach_term"),  gr.Text(label = 'sessionhash'), gr.Text(label = 'datetime')], './flagged_data_points')


# JavaScript handed to `app.load(_js=...)` further down. It appends a <script>
# tag to the document head that loads "file/js_functions.js".
# NOTE(review): the thumbnail HTML built below calls `magicFunc('<code>')` and
# `mdFunc(...)` from onclick handlers; presumably js_functions.js provides the
# real implementations -- confirm against that file.
logging_js = '''
function magicFunc(x){
    let script = document.createElement('script');    
    script.src = "file/js_functions.js"
    document.head.appendChild(script);
}
'''

# Load the illustration catalogue (data.csv) from the Hub dataset and drop
# exact duplicate rows.
dataset = load_dataset("bradley6597/illustration-test", data_files = 'data.csv')
df = pd.DataFrame(dataset['train']).drop_duplicates()

# Load the AI-generated captions (ai_captions_data.csv) the same way.
dataset_ai = load_dataset("bradley6597/illustration-test", data_files = 'ai_captions_data.csv')
ai_captions = pd.DataFrame(dataset_ai['train']).drop_duplicates()

# Left join on 'clean_link' so every catalogue row is kept; rows without a
# caption get an empty 'ai_description'.
df = df.merge(ai_captions, how = 'left', on = 'clean_link')
df['ai_description'] = df['ai_description'].fillna('')

ill_links = df.copy()
# Rows whose Description is exactly 'Moved' are excluded from the search.
ill_links = ill_links[ill_links['Description'] != 'Moved'].copy()
# 'code' is what is left of the Drive link once the fixed URL prefix and the
# "/view?usp=drivesdk" suffix are stripped (presumably the Drive file id).
ill_links['code'] = ill_links['link'].str.replace("https://drive.google.com/file/d/", "", regex = False)
ill_links['code'] = ill_links['code'].str.replace("/view?usp=drivesdk", "", regex = False)
# 'filename' is the last path component of 'file' (everything up to the final
# '/' is removed).
ill_links['filename'] = ill_links['file'].str.replace(".*\\/", "", regex = True)
# ill_links['image_code'] = 'https://lh3.google.com/u/0/d/' + ill_links['code'] + '=k'
# Thumbnail URL for Google account slot 0; search_index() later swaps "/u/0/"
# for the slot number chosen by the user.
ill_links['image_code'] = 'https://lh3.google.com/u/0/d/' + ill_links['code'] + '=w320-h304'
# Replace the bare thumbnail URL with the full HTML cell shown in the results
# table: a linked thumbnail + filename (onclick calls magicFunc with the code),
# a download-icon link, and a "Make Draggable" button (onclick calls mdFunc).
ill_links['image_code'] = '<center><a href="' + ill_links['link'] + '" target="_blank" onclick="magicFunc(\'' + ill_links['code'] + '\')"><img src="' + ill_links['image_code'] + '" style="max-height:400px; max-width:200px"> ' +  ill_links['filename'] + '</a><a href="https://drive.google.com/u/0/uc?id=' + ill_links['code'] + '&export=download"><img src="/file/download_icon.png"></a><button class="submit-btn" onclick="mdFunc(this.parentNode)">Make Draggable</button></center>'
# 'shared_drive' is the first path component after the Shareddrives mount
# prefix.
ill_links['shared_drive'] = ill_links['file'].str.replace("/content/drive/Shareddrives/", "", regex = False)
ill_links['shared_drive'] = ill_links['shared_drive'].str.replace("(.*?)\\/.*", "\\1", regex = True)
# The placeholder text "No Description" is blanked so it is not indexed.
ill_links['Description'] = ill_links['Description'].str.replace("No Description", "", regex = False)

# Columns under the names that funky.index_documents() is given below
# (ID / title / url / abstract ...); 'url' holds the HTML cell, not a bare URL.
ill_links['ID'] = ill_links.index
ill_links['title'] = ill_links['filename']
ill_links['url'] = ill_links['image_code']
ill_links['filepath'] = ill_links['file']
# Path with everything up to and including the first "/KS1 EYFS/" removed;
# used by search_index() for the key-stage filter.
ill_links['post_filepath'] = ill_links['filepath'].str.replace(".*?\\/KS1 EYFS\\/", "", regex = True)

# Three copies of the same rows, differing only in the text ('abstract') that
# gets indexed.
ill_links_title = ill_links.copy()
ill_links_ai = ill_links.copy()

# Main index text: filename with '-' and '_' turned into spaces, followed by
# the description with commas turned into spaces.
ill_links['abstract'] = ill_links['filename'].str.replace("\\-|\\_", " ", regex = True) + ' ' + ill_links['Description'].str.replace(",", " ", regex = False).astype(str)
# Title index text: the cleaned filename only.
ill_links_title['abstract'] = ill_links_title['filename'].str.replace('\\-|\\_', " ", regex = True)
# AI index text: the AI caption. It is read from ill_links_title, which is a
# copy of the same rows, so the values line up by index.
ill_links_ai['abstract'] = ill_links_title['ai_description']

# Build a one-row HTML table showing the first illustration's thumbnail as
# served for Google account slots 0-4, each labelled with its slot number.
# The UI shows this strip next to the account-number input.
_first_code = ill_links['code'].iloc[0]
ill_check_lst = [
    f'<p>{slot}</p>'
    + '<img onmousedown="mdFunc(this)" src="'
    + f'https://lh3.google.com/u/{slot}/d/' + _first_code + '=w320-h304'
    + '" style="max-height:400px; max-width:25%">'
    for slot in range(5)
]
# Transpose so the five cells sit side by side in a single row.
ill_check_df = pd.DataFrame(ill_check_lst).T
ill_check_html = ill_check_df.to_html(
    escape = False,
    render_links = True,
    index = False,
    header = False,
)

# Keep only the columns each index needs. The title and AI frames also carry
# 'Description'; the main frame does not.
ill_links = ill_links[['ID', 'title', 'url', 'abstract', 'filepath', 'Date Created', 'post_filepath']]
ill_links_title = ill_links_title[['ID', 'title', 'url', 'abstract', 'filepath', 'Date Created', 'Description', 'post_filepath']]
ill_links_ai = ill_links_ai[['ID', 'title', 'url', 'abstract', 'filepath', 'Date Created', 'Description', 'post_filepath']]

# Build one search index per frame. funky.index_documents() returns three
# objects, which search_index() later passes back to funky.search() as
# (tf, doc, ind). Each source frame is deleted as soon as it has been indexed
# (presumably to release memory -- it is not referenced again).
ind_main, doc_main, tf_main = funky.index_documents(ill_links)
del ill_links
ind_title, doc_title, tf_title = funky.index_documents(ill_links_title)
del ill_links_title
ind_ai, doc_ai, tf_ai = funky.index_documents(ill_links_ai)
del ill_links_ai

def same_auth(username, password):
    """Return True when the supplied credentials match the configured ones.

    The expected values are read from the ``username`` and ``password``
    environment variables on every call (``KeyError`` if either is unset).
    The function is installed as the Gradio auth callback (``app.auth``).

    Args:
        username: User name typed into the login form.
        password: Password typed into the login form.

    Returns:
        bool: True only if both values match; False otherwise, including
        when either argument is not a string.
    """
    import hmac

    expected_user = os.environ['username'].encode('utf-8')
    expected_pass = os.environ['password'].encode('utf-8')

    if not isinstance(username, str) or not isinstance(password, str):
        return False

    # compare_digest runs in time independent of where the inputs differ, so
    # response timing does not leak how much of a guess was right. Comparing
    # bytes (not str) keeps non-ASCII input from raising TypeError.
    # Both comparisons are always evaluated before they are combined.
    user_ok = hmac.compare_digest(username.encode('utf-8'), expected_user)
    pass_ok = hmac.compare_digest(password.encode('utf-8'), expected_pass)
    return user_ok and pass_ok



def _next_max_results(current, choices):
    """Return the choice one step larger than *current*, stopping at the last one."""
    position = choices.index(current)
    return choices[min(position + 1, len(choices) - 1)]


def _results_grid(urls, max_cols = 5):
    """Lay a single 'url' column out as a table that is *max_cols* cells wide.

    Item k lands in column k % max_cols, so the table reads left to right,
    top to bottom. Cells left over in the last row are empty strings.
    """
    column_of = urls.index % max_cols
    columns = [
        urls.loc[column_of == col, ['url']].reset_index(drop = True)
        for col in range(max_cols)
    ]
    return pd.concat(columns, axis = 1).fillna('')


def search_index(search_text, sd, ks, sort_by, max_results, user_num, search_title, increase = None):
    """Run a search and render the hits as an HTML grid for the Gradio UI.

    Args:
        search_text: Query string typed by the user.
        sd: Selected shared drives. A drive filter is applied only when
            exactly one drive is selected.
        ks: Selected key stages (e.g. 'EYFS', 'KS1', 'KS2'). An empty
            selection disables the key-stage filter.
        sort_by: 'Relevance', 'Date Created' or 'A-Z'.
        max_results: One of the strings in the "results to return" dropdown;
            'All' means no cap.
        user_num: Google account slot used in the thumbnail URLs
            (``/u/<n>/``); None is treated as 0.
        search_title: When truthy, only the title index is searched.
        increase: When truthy, *max_results* is bumped to the next larger
            choice before searching (used by the "Load More Results" button).

    Returns:
        tuple: (HTML string with the results table,
        ``gr.update`` for the max-results dropdown carrying the total in its
        label, ``gr.update`` toggling the "Load More Results" button).
    """
    max_results_list = ['10', '25', '50', '75', '100', '250', '500', '1000', '5000', '10000', 'All']
    if increase:
        # Saturates at 'All' instead of indexing past the end of the list.
        max_results = _next_max_results(max_results, max_results_list)
    # 'All' is not numeric, so it must never reach int().
    limit = None if max_results == 'All' else int(max_results)

    if search_title:
        output = funky.search(tf_title, doc_title, ind_title, search_text, search_type = 'AND', ranking = True)
    else:
        output = funky.search(tf_main, doc_main, ind_main, search_text, search_type = 'AND', ranking = True)
        # Don't need to order by AI as the AI ranking numbers are much lower than the default numbers
        output_ai = funky.search(tf_ai, doc_ai, ind_ai, search_text, search_type = 'AND', ranking = True)
        output.extend(output_ai)

    # Flatten the per-query result lists and drop float placeholders (NaN).
    output = [hit for group in output for hit in group if not isinstance(hit, float)]

    load_more_visible = False
    total_returned = 'No. of Results to Return (Total: 0)'
    final_df = None

    if output:
        # One row per illustration. sort=False keeps the order in which the
        # search returned the hits; the default would re-sort by the 'url'
        # HTML and make the 'ind' relevance key below meaningless.
        output_df = (pd.DataFrame(output)
                     .groupby('url', sort = False)
                     .first()
                     .reset_index()
                     .drop_duplicates())

        slot = int(user_num or 0)
        output_df['url'] = output_df['url'].str.replace("/u/0/", f"/u/{slot}/", regex = False)

        if sd and len(sd) == 1:
            on_drive = output_df['filepath'].str.contains(str(sd[0]), regex = False, na = False)
            output_df = output_df[on_drive].copy()

        # Lower-cased text used for key-stage matching. Missing values become
        # '' so the boolean masks below never contain NaN.
        abstract = output_df['abstract'].fillna('').astype(str)
        if search_title and 'Description' in output_df:
            abstract = abstract + ' ' + output_df['Description'].fillna('').astype(str)
        output_df['abstract'] = abstract.str.lower()
        output_df['post_filepath'] = output_df['post_filepath'].fillna('').astype(str).str.lower()
        # 1 when the text names no key stage at all, 0 otherwise. Computed
        # unconditionally because the 'Relevance' sort needs it.
        names_key_stage = output_df['abstract'].str.contains('eyfs|ks1|ks2', regex = True)
        output_df['missing_desc'] = np.where(names_key_stage, 0, 1)

        output_df2 = output_df
        if ks:
            keystage_filter = '|'.join(ks).lower()
            in_text = output_df['abstract'].str.contains(keystage_filter, regex = True)
            in_path = output_df['post_filepath'].str.contains(keystage_filter, regex = True)
            # Keep rows whose path matches and whose text either matches or
            # names no key stage at all.
            output_df2 = output_df[(in_text | (output_df['missing_desc'] == 1)) & in_path]
            if output_df2.shape[0] == 0:
                # Nothing survived: fall back to the path match alone.
                output_df2 = output_df[in_path]

        output_df2 = output_df2.copy()
        output_df2['ind'] = output_df2.index
        if sort_by == 'Relevance':
            output_df2 = output_df2.sort_values(by = ['missing_desc', 'ind'], ascending = [True, True])
        elif sort_by == 'Date Created':
            output_df2 = output_df2.sort_values(by = ['Date Created'], ascending = False)
        elif sort_by == 'A-Z':
            output_df2 = output_df2.sort_values(by = ['title'], ascending = True)

        total = output_df2.shape[0]
        total_returned = 'No. of Results to Return (Total: ' + str(total) + ')'

        if limit is not None:
            load_more_visible = total > limit
            output_df2 = output_df2.head(limit)
        output_df2 = output_df2[['url']].reset_index(drop = True)

        final_df = _results_grid(output_df2)

    if final_df is None or final_df.shape[0] == 0:
        final_df = pd.DataFrame(['<h3>No Results Found :(</h3>'])

    return('<center>' +
           final_df.to_html(escape = False, render_links = True, index = False, header = False) +
           '</center>',
           gr.update(label = total_returned, value = max_results),
           gr.update(visible = load_more_visible))
    

def search_logging(x: str, request: gr.Request):
    """Log a search term together with the caller's session and the time.

    Args:
        x: The search term that was submitted.
        request: Gradio request for the event; its 'access-token' cookie is
            stored as the session identifier.
    """
    # 'access-token' is not a valid identifier, hence getattr rather than
    # dotted attribute access.
    session_token = getattr(request.cookies, 'access-token')
    # Field order matches logger.setup(): clicked URL (empty for a search),
    # search term, session hash, timestamp.
    row = ['', x, session_token, str(datetime.now())]
    logger.flag(row)

# HTML for the floating "Back to Top!" button. Its onclick scrolls via
# window.parentIFrame when that object exists on the window, and via
# window.scrollTo otherwise. Styled by the #toTopBtn rule in `style`.
back_to_top_btn_html = '''
<button id="toTopBtn" onclick="'parentIFrame' in window ? window.parentIFrame.scrollTo({top: 0, behavior:'smooth'}) : window.scrollTo({ top: 0 })">
<a style="color:white; text-decoration:none;">Back to Top!</a>
</button>
'''

# Custom CSS passed to gr.Blocks(css=style). It:
#   * hides the page footer,
#   * draws a light-grey chequerboard behind images in table cells
#     (presumably so transparent areas of an illustration are visible),
#   * pins the #toTopBtn button near the bottom of the viewport,
#   * styles the .submit-btn buttons used in the result cells, including a
#     narrow-screen (max-width: 30em) variant.
style = '''
footer{
    display: none !important;
}

td img{
  background-image:
    linear-gradient(45deg, lightgrey 25%, transparent 25%), 
    linear-gradient(135deg, lightgrey 25%, transparent 25%),
    linear-gradient(45deg, transparent 75%, lightgrey 75%),
    linear-gradient(135deg, transparent 75%, lightgrey 75%);
  
  background-size: 20px 20px;
  background-position: 0 0, 10px 0, 10px -10px, 0px 10px;
}
#toTopBtn {
	position: fixed;
        bottom: 10px;
        float: right;
        right: 18.5%;
        left: 77.25%;
	height: 30px;
        max-width: 100px;
        width: 100%;
        font-size: 12px;
        border-color: rgba(217,24,120, .5);
        background-color: rgba(35,153,249,.5);
        padding: .5px;
        border-radius: 4px;
   }
   
.submit-btn{
    display:inline-block !important;
    padding:0.7em 1.4em !important;
    margin:0 0.3em 0.3em 0 !important;
    border-radius:0.15em !important;
    box-sizing: border-box !important;
    text-decoration:none !important;
    font-family:'Roboto',sans-serif !important;
    text-transform:uppercase !important;
    font-weight:400 !important;
    color:#FFFFFF !important;
    background-color:#3369ff !important;
    box-shadow:inset 0 -0.6em 0 -0.35em rgba(0,0,0,0.17) !important;
    text-align:center !important;
    position:relative !important;
}
.submit-btn:active{
    top:0.1em !important;
}
@media all and (max-width:30em){
.submit-btn{
    display:block !important;
    margin:0.4em auto !important;
    }
}
'''

# Assemble the Gradio interface: help text and account-slot check strip,
# search controls, results area, and the event wiring for search, logging
# and "load more".
with gr.Blocks(css=style) as app:
    _drive_choices = ['Illustrations - 01-10 to 07-22', 'Illustrations - Now']
    _key_stage_choices = ['EYFS', 'KS1', 'KS2']
    _sort_choices = ['Relevance', 'Date Created', 'A-Z']
    _max_return_choices = ['10', '25', '50', '75', '100', '250', '500', '1000', '5000', '10000', 'All']

    with gr.Row():
        with gr.Column(min_width = 10):
            with gr.Row():
                gr.HTML("<center><p>If you can't see the images please make sure you are signed in to your Twinkl account on Google & you have access to the Shared Drives you are searching :)</p><p>To drag images click 'Make Draggable' button and wait until it says 'Drag It!'. After this you can drag the image into a folder on your computer</p></center>")
                gr.HTML(ill_check_html)
                user_num = gr.Number(
                    value = 0,
                    label = 'Put lowest number of the alarm clock you can see',
                )
            # Search box, title-only toggle and all filters share one row.
            with gr.Row():
                search_prompt = gr.Textbox(
                    placeholder = 'search for an illustration',
                    label = 'Search',
                    elem_id = 'search_term',
                )
                title_search = gr.Checkbox(label = 'Search title only')
                shared_drive = gr.Dropdown(
                    choices = _drive_choices,
                    multiselect = True,
                    label = 'Shared Drive',
                    value = list(_drive_choices),
                )
                key_stage = gr.Dropdown(
                    choices = _key_stage_choices,
                    multiselect = True,
                    label = 'Key Stage',
                    value = list(_key_stage_choices),
                )
                sort_by = gr.Dropdown(
                    choices = _sort_choices,
                    value = 'Relevance',
                    multiselect = False,
                    label = 'Sort By',
                )
                max_return = gr.Dropdown(
                    choices = _max_return_choices,
                    value = '50',
                    multiselect = False,
                    label = 'No. of Results to Return (Total: 0)',
                )
            with gr.Row():
                search_button = gr.Button(value="Search!", interactive = True)
            with gr.Row():
                output_df = gr.HTML()
            back_top_btn = gr.HTML(back_to_top_btn_html)
            load_more_results_btn = gr.Button(
                value = 'Load More Results',
                interactive = True,
                visible = False,
            )

    # The same inputs/outputs serve the button click and the Enter key.
    _search_inputs = [search_prompt, shared_drive, key_stage, sort_by, max_return, user_num, title_search]
    _search_outputs = [output_df, max_return, load_more_results_btn]

    search_button.click(search_index, inputs=_search_inputs, outputs=_search_outputs)
    search_prompt.submit(search_index, inputs=_search_inputs, outputs=_search_outputs)
    # Each search is also written to the logging dataset.
    search_button.click(search_logging, inputs=[search_prompt], outputs=None)
    search_prompt.submit(search_logging, inputs=[search_prompt], outputs=None)
    # The button itself is passed as the extra `increase` argument; its value
    # is its (non-empty) label, so search_index sees a truthy flag.
    load_more_results_btn.click(
        search_index,
        inputs=_search_inputs + [load_more_results_btn],
        outputs=_search_outputs,
    )
    app.load(_js = logging_js)

# Require a login checked by same_auth, with no extra message on the form.
app.auth = same_auth
app.auth_message = ''

# FastAPI application that hosts the extra /track route and, further down,
# the mounted Gradio app.
fapi = FastAPI()

# Signed-cookie session support; the signing key comes from the `session_key`
# environment variable (KeyError if it is unset).
fapi.add_middleware(SessionMiddleware, secret_key=os.environ['session_key'])

@fapi.middleware("http")
async def add_session_hash(request: Request, call_next):
    """Echo the caller's 'session' cookie back on every response.

    The request is handled first; if it carried a non-empty 'session' cookie,
    the same value is set again on the response with the HttpOnly flag.

    Args:
        request: Incoming HTTP request.
        call_next: Callable that runs the rest of the middleware/route chain.

    Returns:
        The downstream response, with the cookie re-issued when present.
    """
    response = await call_next(request)
    session_cookie = request.cookies.get('session')
    if not session_cookie:
        return response
    response.set_cookie(key='session', value=session_cookie, httponly=True)
    return response

# custom get request handler with params to flag clicks
@fapi.get("/track")
async def track(url: str, q: str, request: Request):
    """Log a click on a search result.

    Args:
        url: Query parameter identifying what was clicked.
        q: Query parameter carrying the search term that was active.
        request: Incoming request; its 'access-token' cookie is stored as the
            session identifier.

    Returns:
        dict: ``{"message": "ok"}`` once the row has been handed to the logger.
    """
    # A request without the cookie used to raise KeyError and surface as an
    # HTTP 500; log it with an empty session id instead.
    session_id = request.cookies.get('access-token', '')
    # Field order matches logger.setup(): clicked URL, search term, session
    # hash, timestamp. `q or ''` keeps a None term from being logged.
    logger.flag([url, q or '', session_id, str(datetime.now())])
    return {"message": "ok"}

# Mount the Gradio app at the root path of the FastAPI app so that both the
# UI and the /track route are served by one ASGI application.
app2 = gr.mount_gradio_app(fapi, app, path="/")
# When run as a script, serve on all interfaces on port 7860.
if __name__ == "__main__":
    uvicorn.run(app2, host="0.0.0.0", port=7860)