import functions as funky # need to enable this for Hugging Face
import pandas as pd
import gradio as gr
import os
from datasets import load_dataset
from huggingface_hub import login
import numpy as np
from fastapi import FastAPI, Request
import uvicorn
from starlette.middleware.sessions import SessionMiddleware
import fastapi
from datetime import datetime
import re

login(token = os.environ['HUB_TOKEN'])
# logger = gr.HuggingFaceDatasetSaver(os.environ['HUB_TOKEN'], dataset_name='illustration_gdrive_logging_main', private=True)
# logger.setup([gr.Text(label="clicked_url"), gr.Text(label="seach_term"), gr.Text(label = 'sessionhash'), gr.Text(label = 'datetime')], './flagged_data_points')

logging_js = '''
function magicFunc(x){
    let script = document.createElement('script');
    script.src = "file/js_functions.js";
    document.head.appendChild(script);
}
'''

dataset = load_dataset("bradley6597/illustration-test", data_files = 'data.csv')
df = pd.DataFrame(dataset['train']).drop_duplicates()
dataset_ai = load_dataset("bradley6597/illustration-test", data_files = 'ai_captions_data.csv')
ai_captions = pd.DataFrame(dataset_ai['train']).drop_duplicates()
df = df.merge(ai_captions, how = 'left', on = 'clean_link')
df['ai_description'] = df['ai_description'].fillna('')

ill_links = df.copy()
ill_links = ill_links[ill_links['Description'] != 'Moved'].copy()
# Strip the Google Drive URL down to the bare file ID
ill_links['code'] = ill_links['link'].str.replace("https://drive.google.com/file/d/", "", regex = False)
ill_links['code'] = ill_links['code'].str.replace("/view?usp=drivesdk", "", regex = False)
ill_links['filename'] = ill_links['file'].str.replace(".*\\/", "", regex = True)
# ill_links['image_code'] = 'https://lh3.google.com/u/0/d/' + ill_links['code'] + '=k'
ill_links['image_code'] = 'https://lh3.google.com/u/0/d/' + ill_links['code'] + '=w320-h304'
ill_links['image_code'] = np.where(ill_links['file'].str.contains("\\.png$", regex = True), '
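# (Context: lh3.google.com serves Drive image previews; the trailing '=w320-h304'
# requests a roughly 320x304 thumbnail, while '=k' in the commented-out variant
# requests the full-size image. The np.where() call above is truncated in the source:
# it presumably selected between two HTML <img> templates depending on whether the
# file is a PNG. A purely hypothetical sketch of their shape, for illustration only:
#     '<img src="' + ill_links['image_code'] + '" loading="lazy">')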
# …(truncated in the source: the two HTML templates for the np.where() above, the
# construction of ill_links_title / ill_links_ai used below, and the header of the
# loop that built ill_check_lst; only these fragments of the loop body survive)…
#     tmp_links = f'…{i}…' + tmp_links
#     ill_check_lst.append(tmp_links)
ill_check_df = pd.DataFrame(ill_check_lst).T
ill_check_html = ill_check_df.to_html(escape = False, render_links = True, index = False, header = False)

ill_links = ill_links[['ID', 'title', 'url', 'abstract', 'filepath', 'Date Created', 'post_filepath', 'parent_id']]
ill_links_title = ill_links_title[['ID', 'title', 'url', 'abstract', 'filepath', 'Date Created', 'Description', 'post_filepath', 'parent_id']]
ill_links_ai = ill_links_ai[['ID', 'title', 'url', 'abstract', 'filepath', 'Date Created', 'Description', 'post_filepath', 'parent_id']]

# Build the three search indexes (main, title-only, AI captions), then free the frames
ind_main, doc_main, tf_main = funky.index_documents(ill_links)
del ill_links
ind_title, doc_title, tf_title = funky.index_documents(ill_links_title)
del ill_links_title
ind_ai, doc_ai, tf_ai = funky.index_documents(ill_links_ai)
del ill_links_ai

def same_auth(username, password):
    return (username == os.environ['username']) & (password == os.environ['password'])

def search_index(search_text, sd, ks, sort_by, max_results, user_num, search_title, image_type, do_not_use, increase = None):
    max_results_list = ['10', '25', '50', '75', '100', '250', '500', '1000', '5000', '10000', 'All']
    if increase:
        max_results = max_results_list[max_results_list.index(max_results) + 1]
    if search_title:
        output = funky.search(tf_title, doc_title, ind_title, search_text, search_type = 'AND', ranking = True)
    else:
        output = funky.search(tf_main, doc_main, ind_main, search_text, search_type = 'AND', ranking = True)
    # Don't need to order by AI as the AI ranking numbers are much lower than the default numbers
    output_ai = funky.search(tf_ai, doc_ai, ind_ai, search_text, search_type = 'AND', ranking = True)
    output.extend(output_ai)
    output = [x for o in output for x in o if type(x) is not float]
    load_more_visible = False
    extra_info = ''
    if len(output) > 0:
        output_df = (pd.DataFrame(output)
                     .groupby('url')
                     .first()
                     .reset_index()
                     .drop_duplicates())
        output_df['Date Created'] = pd.to_datetime(output_df['Date Created'], format = 'mixed')
        if do_not_use:
            output_df = output_df[~output_df['filepath'].str.lower().str.contains("do.*not.*use|not.*general|don\\'t.*use|do.*no.*use|numberblock", regex = True)]
        map_df = output_df[output_df['title'].str.contains('map|Map', regex = True)]
        output_df['url'] = output_df['url'].str.replace("/u/0/", f"/u/{int(user_num)}/", regex = False)
        output_df_temp = pd.DataFrame()
        if len(sd) > 0:
            for shared in sd:
                temp_df = output_df[(output_df['filepath'].str.contains(str(shared), regex = False))]
                output_df_temp = pd.concat([output_df_temp, temp_df])
            output_df = output_df_temp.sort_index()
        # if len(sd) == 1:
        #     output_df = output_df[(output_df['filepath'].str.contains(str(sd[0]), regex = False))]
        if len(ks) > 0:
            keystage_filter = '|'.join(ks).lower()
            if search_title:
                output_df['abstract'] = output_df['abstract'] + ' ' + output_df['Description']
            output_df['abstract'] = output_df['abstract'].str.lower()
            output_df['post_filepath'] = output_df['post_filepath'].str.lower()
            output_df['missing_desc'] = np.where(output_df['abstract'].str.contains('eyfs|ks1|ks2|ks3', regex = True), 0, 1)
            output_df2 = output_df[(output_df['abstract'].str.contains(keystage_filter, regex = True) | (output_df['missing_desc'] == 1))].copy()
            output_df2 = output_df2[(output_df2['post_filepath'].str.contains(keystage_filter, regex = True))]
            if output_df2.shape[0] == 0:
                output_df2 = output_df[(output_df['post_filepath'].str.contains(keystage_filter, regex = True))]
        else:
            output_df['abstract'] = output_df['abstract'].str.lower()
            output_df['post_filepath'] = output_df['post_filepath'].str.lower()
            output_df['missing_desc'] = np.where(output_df['abstract'].str.contains('eyfs|ks1|ks2|ks3', regex = True), 0, 1)
            output_df2 = output_df
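        # (Lower row indices correspond to earlier, better-placed matches. Taking each
        # parent_id group's minimum index gives every image its parent folder's best
        # position, which keeps a folder's images together in the 'Relevance' sort below.)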
        output_df2['ind'] = output_df2.index
        min_parent_score = output_df2.groupby('parent_id')['ind'].min().reset_index()
        min_parent_score.columns = ['parent_id', 'min_parent_ind']
        output_df2 = output_df2.merge(min_parent_score, how = 'left', on = 'parent_id')
        if sort_by == 'Relevance':
            output_df2 = output_df2.sort_values(by = ['missing_desc', 'min_parent_ind'], ascending = [True, True])
        elif sort_by == 'Date Created':
            output_df2 = output_df2.sort_values(by = ['Date Created'], ascending = False)
        elif sort_by == 'A-Z':
            output_df2 = output_df2.sort_values(by = ['title'], ascending = True)
        image_type_filter = '$|'.join(image_type).lower().replace("jpeg", "jpg") + '$'
        output_df2 = output_df2[output_df2['filepath'].str.contains(image_type_filter, regex = True)].reset_index(drop = True)
        total_returned = 'No. of Results to Return (Total: ' + str(output_df2.shape[0]) + ')'
        if max_results != 'All':
            if output_df2.shape[0] > int(max_results):
                load_more_visible = True
            output_df2 = output_df2.head(int(max_results))
        output_df2 = output_df2[['url']].reset_index(drop = True)
        # Lay the results out across a fixed number of columns, one DataFrame column each
        max_cols = 5
        output_df2['row'] = output_df2.index % max_cols
        for x in range(0, max_cols):
            tmp = output_df2[output_df2['row'] == x].reset_index(drop = True)
            tmp = tmp[['url']]
            if x == 0:
                final_df = tmp
            else:
                final_df = pd.concat([final_df, tmp], axis = 1)
        final_df = final_df.fillna('')
    else:
        # No results: show help text instead (the original HTML wrapping this message is truncated in the source)
        final_df = pd.DataFrame(["If you can't see the images please make sure you are signed in to your Twinkl account on Google & you have access to the Shared Drives you are searching :)",
                                 "To drag images click the 'Make Draggable' button and wait until it says 'Drag It!'. After this you can drag the image into a folder on your computer"])
# …(truncated in the source: the rest of search_index, including its return value(s), and the Gradio UI definition)…
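# ---------------------------------------------------------------------------
# Illustrative sketch only: because the UI definition is truncated above, the
# wiring below is an assumption, not the app's real layout. The component
# names, the default argument values in run_search, and the guess that
# search_index's (lost) return value is a single HTML string are all
# hypothetical; only search_index, same_auth and logging_js come from the
# surviving code.
# ---------------------------------------------------------------------------
def run_search(text):
    # Hypothetical defaults for the search_index parameters beyond the query
    return search_index(text, sd = [], ks = [], sort_by = 'Relevance',
                        max_results = '100', user_num = 0, search_title = False,
                        image_type = ['png', 'jpeg'], do_not_use = True)

with gr.Blocks(js = logging_js) as app:  # js-on-load hook via the Blocks(js=...) parameter (Gradio 4+)
    search_box = gr.Textbox(label = 'Search')  # hypothetical component
    results = gr.HTML()                        # hypothetical component
    search_box.submit(run_search, inputs = search_box, outputs = results)

app.launch(auth = same_auth)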