gdrive-illustration-search

Running

App Files Files Community

gdrive-illustration-search / app.py

bradley6597

Update app.py

ef89f97 verified 7 months ago

raw

history blame

16.4 kB

	import functions as funky # need to enable this for Hugging Face
	import pandas as pd
	import gradio as gr
	import os
	from datasets import load_dataset
	from huggingface_hub import login
	import numpy as np
	from fastapi import FastAPI, Request
	import uvicorn
	from starlette.middleware.sessions import SessionMiddleware
	import fastapi
	from datetime import datetime
	import re

	# Authenticate against the Hugging Face Hub with the token from the environment.
	_hub_token = os.environ['HUB_TOKEN']
	login(token=_hub_token)

	# Dataset-backed saver; rows are written through logger.flag(...).
	logger = gr.HuggingFaceDatasetSaver(
	    _hub_token,
	    dataset_name='illustration_gdrive_logging_main',
	    organization=None,
	    private=True,
	)
	# One text component per logged field, in the order the rows are flagged.
	_log_components = [
	    gr.Text(label=field_label)
	    for field_label in ("clicked_url", "seach_term", 'sessionhash', 'datetime')
	]
	logger.setup(_log_components, './flagged_data_points')


	# JavaScript snippet handed to app.load(_js=...): it defines magicFunc, which
	# appends a <script> element pointing at file/js_functions.js to the page head.
	logging_js = '''
	function magicFunc(x){
	let script = document.createElement('script');
	script.src = "file/js_functions.js"
	document.head.appendChild(script);
	}
	'''

	# Both CSVs live in the same Hub dataset repository.
	dataset = load_dataset("bradley6597/illustration-test", data_files='data.csv')
	dataset_ai = load_dataset("bradley6597/illustration-test", data_files='ai_captions_data.csv')

	# AI captions, de-duplicated, keyed by 'clean_link'.
	ai_captions = pd.DataFrame(dataset_ai['train']).drop_duplicates()

	# Illustration rows with the AI caption attached (left join keeps every
	# illustration); rows without a caption get an empty string.
	df = (
	    pd.DataFrame(dataset['train'])
	    .drop_duplicates()
	    .merge(ai_captions, how='left', on='clean_link')
	)
	df['ai_description'] = df['ai_description'].fillna('')

	# Build the per-illustration table used for indexing: Drive file code,
	# display filename, the HTML card shown in the results grid, and path columns.
	# Rows whose Description is 'Moved' are dropped.
	ill_links = df[df['Description'] != 'Moved'].copy()

	# Drive file id = link without the fixed URL prefix and suffix.
	ill_links['code'] = (
	    ill_links['link']
	    .str.replace("https://drive.google.com/file/d/", "", regex=False)
	    .str.replace("/view?usp=drivesdk", "", regex=False)
	)
	# Filename = everything after the last slash of the file path.
	ill_links['filename'] = ill_links['file'].str.replace(r".*\/", "", regex=True)

	# ill_links['image_code'] = 'https://lh3.google.com/u/0/d/' + ill_links['code'] + '=k'
	_thumb_src = 'https://lh3.google.com/u/0/d/' + ill_links['code'] + '=w320-h304'

	# HTML shared by every card: thumbnail linking to the Drive file, plus a download icon.
	_card_html = (
	    '<center><a href="' + ill_links['link']
	    + '" target="_blank" onclick="magicFunc(\'' + ill_links['code'] + '\')">'
	    + '<img src="' + _thumb_src + '" style="max-height:400px; max-width:200px"> '
	    + ill_links['filename']
	    + '</a><a href="https://drive.google.com/u/0/uc?id=' + ill_links['code']
	    + '&export=download"><img src="/file/download_icon.png"></a>'
	)
	# Only files ending in .png get the extra 'Make Draggable' button.
	ill_links['image_code'] = np.where(
	    ill_links['file'].str.contains(r"\.png$", regex=True),
	    _card_html + '<button class="submit-btn" onclick="mdFunc(this.parentNode)">Make Draggable</button></center>',
	    _card_html + '</center>',
	)

	# Keep only the first path component after the Shareddrives prefix.
	# The previous pattern "(.?)\/." merely deleted each slash together with the
	# character after it, so it never isolated that component.
	ill_links['shared_drive'] = (
	    ill_links['file']
	    .str.replace("/content/drive/Shareddrives/", "", regex=False)
	    .str.replace(r"(.*?)\/.*", r"\1", regex=True)
	)
	ill_links['Description'] = ill_links['Description'].str.replace("No Description", "", regex=False)

	# Column names expected by the indexing / search code further down.
	ill_links['ID'] = ill_links.index
	ill_links['title'] = ill_links['filename']
	ill_links['url'] = ill_links['image_code']
	ill_links['filepath'] = ill_links['file']
	# Path with everything up to and including the first ".../KS1 EYFS/" removed.
	ill_links['post_filepath'] = ill_links['filepath'].str.replace(r".*?\/KS1 EYFS\/", "", regex=True)

	# Three copies of the table, one per search index; they differ only in the
	# text placed in 'abstract'. The copies are taken before 'abstract' exists.
	ill_links_title = ill_links.copy()
	ill_links_ai = ill_links.copy()

	# Filename with '-' and '_' turned into spaces. The earlier pattern contained
	# an escaped pipe ("\-\|\_"), which only matched the literal text "-|_".
	_spaced_filename = ill_links['filename'].str.replace(r"[-_]", " ", regex=True)

	# Main index: spaced filename plus the description with commas turned into spaces.
	ill_links['abstract'] = (
	    _spaced_filename
	    + ' '
	    + ill_links['Description'].str.replace(",", " ", regex=False).astype(str)
	)
	# Title index: spaced filename only.
	ill_links_title['abstract'] = ill_links_title['filename'].str.replace(r"[-_]", " ", regex=True)
	# AI index: the AI-generated caption, read from this frame's own column.
	ill_links_ai['abstract'] = ill_links_ai['ai_description']

	# One-row HTML table showing the first illustration's thumbnail requested
	# under Google account indexes 0-4, each labelled with its index.
	_first_code = ill_links['code'].iloc[0]
	ill_check_lst = [
	    f'<p>{account}</p>'
	    + '<img onmousedown="mdFunc(this)" src="'
	    + f'https://lh3.google.com/u/{account}/d/' + _first_code + '=w320-h304'
	    + '" style="max-height:400px; max-width:25%">'
	    for account in range(5)
	]
	# Transpose so the five snippets sit side by side in a single row.
	ill_check_df = pd.DataFrame(ill_check_lst).T
	ill_check_html = ill_check_df.to_html(escape=False, render_links=True, index=False, header=False)

	# Columns kept for indexing. The title and AI tables additionally carry
	# 'Description'; column order is the same as before.
	_lead_cols = ['ID', 'title', 'url', 'abstract', 'filepath', 'Date Created']
	_tail_cols = ['post_filepath', 'parent_id']

	ill_links = ill_links[_lead_cols + _tail_cols]
	ill_links_title = ill_links_title[_lead_cols + ['Description'] + _tail_cols]
	ill_links_ai = ill_links_ai[_lead_cols + ['Description'] + _tail_cols]

	# Build one index per table and drop each source frame once it is indexed.
	ind_main, doc_main, tf_main = funky.index_documents(ill_links)
	del ill_links
	ind_title, doc_title, tf_title = funky.index_documents(ill_links_title)
	del ill_links_title
	ind_ai, doc_ai, tf_ai = funky.index_documents(ill_links_ai)
	del ill_links_ai

	def same_auth(username, password):
	    """Return True when the supplied credentials match the configured ones.

	    The expected values are read from the ``username`` and ``password``
	    environment variables on every call.

	    Args:
	        username: Login name entered by the user.
	        password: Password entered by the user.

	    Returns:
	        bool: True only if both values match; False otherwise, including
	        when either argument is not a string.

	    Raises:
	        KeyError: If either environment variable is not set.
	    """
	    # Local import keeps this block self-contained.
	    import hmac

	    if not isinstance(username, str) or not isinstance(password, str):
	        return False

	    # Compare as bytes so non-ASCII input is accepted by compare_digest,
	    # and evaluate both comparisons before combining them so the time taken
	    # does not depend on which field was wrong.
	    user_ok = hmac.compare_digest(
	        username.encode('utf-8'), os.environ['username'].encode('utf-8')
	    )
	    pass_ok = hmac.compare_digest(
	        password.encode('utf-8'), os.environ['password'].encode('utf-8')
	    )
	    return user_ok and pass_ok


	def search_index(search_text, sd, ks, sort_by, max_results, user_num, search_title, image_type, increase = None):
	    """Search the illustration indexes and render the hits as an HTML grid.

	    Args:
	        search_text: Query string passed to ``funky.search``.
	        sd: Selected shared drives. A filter on ``filepath`` is applied only
	            when exactly one drive is selected.
	        ks: Selected key stages; joined into a case-insensitive alternation
	            that is matched against the abstract and ``post_filepath``.
	        sort_by: 'Relevance', 'Date Created' or 'A-Z'.
	        max_results: One of the strings in ``max_results_list``.
	        user_num: Google account index substituted for the ``/u/0/`` part of
	            the result HTML; ``None`` is treated as 0.
	        search_title: When truthy, only the title index is searched.
	        image_type: Selected image types, e.g. ``['PNG', 'JPEG']``.
	        increase: When truthy, ``max_results`` is moved to the next larger
	            choice before searching.

	    Returns:
	        tuple: ``(html, max_results_update, load_more_update)`` where ``html``
	        is the rendered grid and the other two are ``gr.update`` payloads for
	        the result-count dropdown and the "Load More Results" button.
	    """
	    # NOTE(review): nesting was reconstructed from data flow because the
	    # listing lost its indentation. Running the AI-caption search only when
	    # "search title only" is off is an assumption - confirm against the
	    # deployed file.
	    max_results_list = ['10', '25', '50', '75', '100', '250', '500', '1000', '5000', '10000', 'All']
	    map_notice = '<div id="mapBorder"><strong>If real-world maps are needed please check they are from the folder: Illustrations Now > Maps - Using the New Guidance (2024)</strong></div><br>'
	    no_results_html = '<h3>No Results Found :(</h3>'

	    if increase:
	        # Step to the next page size, but stay on the last entry instead of
	        # indexing past the end of the list.
	        next_pos = min(max_results_list.index(max_results) + 1, len(max_results_list) - 1)
	        max_results = max_results_list[next_pos]

	    # An emptied number box arrives as None.
	    account_num = int(user_num) if user_num is not None else 0

	    if search_title:
	        output = funky.search(tf_title, doc_title, ind_title, search_text, search_type = 'AND', ranking = True)
	    else:
	        output = funky.search(tf_main, doc_main, ind_main, search_text, search_type = 'AND', ranking = True)
	        # Don't need to order by AI as the AI ranking numbers are much lower than the default numbers
	        output_ai = funky.search(tf_ai, doc_ai, ind_ai, search_text, search_type = 'AND', ranking = True)
	        output.extend(output_ai)

	    # Flatten the per-search result lists, skipping float placeholders.
	    output = [x for o in output for x in o if not isinstance(x, float)]
	    load_more_visible = False
	    extra_info = map_notice if 'map' in search_text else ''

	    if len(output) > 0:
	        # One row per result card. sort=False keeps the order in which the
	        # search returned the hits; the default would reorder rows by the
	        # 'url' text, and the 'Relevance' sort below relies on row position.
	        # NOTE(review): presumes funky.search returns hits best-first.
	        output_df = (pd.DataFrame(output)
	                     .groupby('url', sort = False)
	                     .first()
	                     .reset_index()
	                     .drop_duplicates())
	        output_df['Date Created'] = pd.to_datetime(output_df['Date Created'], format = 'mixed')
	        if output_df['title'].str.contains('map|Map', regex = True, na = False).any():
	            extra_info = map_notice

	        output_df['url'] = output_df['url'].str.replace("/u/0/", f"/u/{account_num}/", regex = False)
	        # NOTE(review): with two or more drives selected no drive filter is
	        # applied at all - kept as in the original; confirm this is intended.
	        if len(sd) == 1:
	            output_df = output_df[output_df['filepath'].str.contains(str(sd[0]), regex = False)].copy()

	        if len(ks) > 0 and search_title:
	            # Title-only hits carry just the filename in 'abstract'; append
	            # the description so key stages mentioned there are seen too.
	            # Missing descriptions must not turn the whole abstract into NaN.
	            output_df['abstract'] = output_df['abstract'] + ' ' + output_df['Description'].fillna('')

	        output_df['abstract'] = output_df['abstract'].str.lower()
	        output_df['post_filepath'] = output_df['post_filepath'].str.lower()
	        # 1 when the text names no key stage at all, 0 otherwise.
	        output_df['missing_desc'] = np.where(output_df['abstract'].str.contains('eyfs|ks1|ks2', regex = True), 0, 1)

	        if len(ks) > 0:
	            keystage_filter = '|'.join(ks).lower()
	            in_text = output_df['abstract'].str.contains(keystage_filter, regex = True)
	            in_folder = output_df['post_filepath'].str.contains(keystage_filter, regex = True)
	            # Keep rows in a matching folder whose text either names a
	            # selected key stage or names none.
	            output_df2 = output_df[(in_text | (output_df['missing_desc'] == 1)) & in_folder].copy()
	            if output_df2.shape[0] == 0:
	                # Nothing survived: fall back to the folder match alone.
	                output_df2 = output_df[in_folder].copy()
	        else:
	            output_df2 = output_df

	        # Row position stands in for the relevance score; every row inherits
	        # the best (lowest) position found among rows sharing its parent_id.
	        output_df2['ind'] = output_df2.index
	        min_parent_score = output_df2.groupby('parent_id')['ind'].min().reset_index()
	        min_parent_score.columns = ['parent_id', 'min_parent_ind']
	        output_df2 = output_df2.merge(min_parent_score, how = 'left', on = 'parent_id')

	        if sort_by == 'Relevance':
	            output_df2 = output_df2.sort_values(by = ['missing_desc', 'min_parent_ind'], ascending = [True, True])
	        elif sort_by == 'Date Created':
	            output_df2 = output_df2.sort_values(by = ['Date Created'], ascending = False)
	        elif sort_by == 'A-Z':
	            output_df2 = output_df2.sort_values(by = ['title'], ascending = True)

	        # e.g. ['PNG', 'JPEG'] -> 'png$|jpg$', matched against the file path.
	        image_type_filter = '$|'.join(image_type).lower().replace("jpeg", "jpg") + '$'

	        output_df2 = output_df2[output_df2['filepath'].str.contains(image_type_filter, regex = True)].reset_index(drop = True)
	        total_returned = 'No. of Results to Return (Total: ' + str(output_df2.shape[0]) + ')'

	        if max_results != 'All' and output_df2.shape[0] > int(max_results):
	            load_more_visible = True
	            output_df2 = output_df2.head(int(max_results))
	        output_df2 = output_df2[['url']].reset_index(drop = True)

	        # Deal the cards out across five columns: card i goes to column i % 5.
	        max_cols = 5
	        grid_columns = [
	            output_df2.loc[output_df2.index % max_cols == col, ['url']].reset_index(drop = True)
	            for col in range(max_cols)
	        ]
	        final_df = pd.concat(grid_columns, axis = 1).fillna('')
	    else:
	        final_df = pd.DataFrame([no_results_html])
	        total_returned = 'No. of Results to Return (Total: 0)'

	    # The filters above may have removed every hit.
	    if final_df.shape[0] == 0:
	        final_df = pd.DataFrame([no_results_html])

	    return('<center>' +
	           extra_info +
	           final_df.to_html(escape = False, render_links = True, index = False, header = False) +
	           '</center>',
	           gr.update(label = total_returned, value = max_results),
	           gr.update(visible = load_more_visible))


	def search_logging(x: str, request: gr.Request):
	    """Append a submitted search term to the logging dataset.

	    Args:
	        x: The search text.
	        request: The Gradio request for the event; its cookies are read for
	            the ``access-token`` value that is logged as the session id.
	    """
	    # The cookie name contains a hyphen, so it is read with getattr rather
	    # than dotted access. Default to '' so a request without the cookie is
	    # still logged instead of raising AttributeError.
	    session_id = getattr(request.cookies, 'access-token', '')
	    # Row layout: clicked_url (empty for a search), search term, session, timestamp.
	    logger.flag(['', x, session_id, str(datetime.now())])

	# HTML for the "Back to Top!" button. Its onclick scrolls through
	# window.parentIFrame when that object exists, otherwise through window.
	back_to_top_btn_html = '''
	<button id="toTopBtn" onclick="'parentIFrame' in window ? window.parentIFrame.scrollTo({top: 0, behavior:'smooth'}) : window.scrollTo({ top: 0 })">
	<a style="color:white; text-decoration:none;">Back to Top!</a>
	</button>
	'''

	# CSS passed to gr.Blocks(css=...). It hides the footer, draws a grey
	# checkerboard behind table images, positions the #toTopBtn button, styles
	# the .submit-btn buttons and gives #mapBorder an orange rounded border.
	style = '''
	footer{
	display: none !important;
	}

	td img{
	background-image:
	linear-gradient(45deg, lightgrey 25%, transparent 25%),
	linear-gradient(135deg, lightgrey 25%, transparent 25%),
	linear-gradient(45deg, transparent 75%, lightgrey 75%),
	linear-gradient(135deg, transparent 75%, lightgrey 75%);

	background-size: 20px 20px;
	background-position: 0 0, 10px 0, 10px -10px, 0px 10px;
	}
	#toTopBtn {
	position: fixed;
	bottom: 10px;
	float: right;
	right: 18.5%;
	left: 77.25%;
	height: 30px;
	max-width: 100px;
	width: 100%;
	font-size: 12px;
	border-color: rgba(217,24,120, .5);
	background-color: rgba(35,153,249,.5);
	padding: .5px;
	border-radius: 4px;
	}

	.submit-btn{
	display:inline-block !important;
	padding:0.7em 1.4em !important;
	margin:0 0.3em 0.3em 0 !important;
	border-radius:0.15em !important;
	box-sizing: border-box !important;
	text-decoration:none !important;
	font-family:'Roboto',sans-serif !important;
	text-transform:uppercase !important;
	font-weight:400 !important;
	color:#FFFFFF !important;
	background-color:#3369ff !important;
	box-shadow:inset 0 -0.6em 0 -0.35em rgba(0,0,0,0.17) !important;
	text-align:center !important;
	position:relative !important;
	}
	.submit-btn:active{
	top:0.1em !important;
	}
	@media all and (max-width:30em){
	.submit-btn{
	display:block !important;
	margin:0.4em auto !important;
	}
	}
	#mapBorder {
	border-radius: 25px;
	border: 2px solid orange;
	}
	'''

	# Gradio UI: help text, search controls, results pane and event wiring.
	# NOTE(review): the nesting of the with-blocks is not visible in this
	# listing (indentation was lost); confirm the Row/Column layout against
	# the deployed file before editing.
	with gr.Blocks(css=style) as app:
	with gr.Row():
	with gr.Column(min_width = 10):
	with gr.Row():
	gr.HTML("<center><p>If you can't see the images please make sure you are signed in to your Twinkl account on Google & you have access to the Shared Drives you are searching :)</p><p>To drag images click 'Make Draggable' button and wait until it says 'Drag It!'. After this you can drag the image into a folder on your computer</p></center>")
	# Row of test thumbnails, one per Google account index 0-4.
	gr.HTML(ill_check_html)
	# search_index substitutes this number for the "/u/0/" part of result URLs.
	user_num = gr.Number(value = 0, label = 'Put lowest number of the alarm clock you can see')
	with gr.Row():
	search_prompt = gr.Textbox(placeholder = 'search for an illustration', label = 'Search', elem_id = 'search_term')
	title_search = gr.Checkbox(label = 'Search title only')
	# with gr.Row():
	shared_drive = gr.Dropdown(choices = ['Illustrations - 01-10 to 07-22', 'Illustrations - Now', 'Shutter Stock Images', 'Beyond - Illustrations'], multiselect = True, label = 'Shared Drive', value = ['Illustrations - 01-10 to 07-22', 'Illustrations - Now'])
	key_stage = gr.Dropdown(choices = ['EYFS', 'KS1', 'KS2'], multiselect = True, label = 'Key Stage', value = ['EYFS', 'KS1', 'KS2'])
	image_type = gr.Dropdown(choices = ['JPEG', 'PNG', 'TIF'], multiselect = True, label = 'Image Type', value = ['PNG', 'JPEG', 'TIF'])

	sort_by = gr.Dropdown(choices = ['Relevance', 'Date Created', 'A-Z'], value = 'Relevance', multiselect = False, label = 'Sort By')
	# The label of this dropdown is rewritten by search_index to show the total hit count.
	max_return = gr.Dropdown(choices = ['10', '25', '50', '75', '100', '250', '500', '1000', '5000', '10000', 'All'], value = '50', multiselect = False, label = 'No. of Results to Return (Total: 0)')
	with gr.Row():
	search_button = gr.Button(value="Search!", interactive = True)
	with gr.Row():
	output_df = gr.HTML()
	back_top_btn = gr.HTML(back_to_top_btn_html)
	# Hidden until search_index reports more hits than the current page size.
	load_more_results_btn = gr.Button(value = 'Load More Results', interactive = True, visible = False)
	# Clicking "Search!" and pressing Enter in the textbox run the same search.
	search_button.click(search_index, inputs=[search_prompt, shared_drive, key_stage, sort_by, max_return, user_num, title_search, image_type], outputs=[output_df, max_return, load_more_results_btn])
	search_prompt.submit(search_index, inputs=[search_prompt, shared_drive, key_stage, sort_by, max_return, user_num, title_search, image_type], outputs=[output_df, max_return, load_more_results_btn])
	# Each search is also recorded through search_logging.
	search_button.click(search_logging, inputs=[search_prompt], outputs=None)
	search_prompt.submit(search_logging, inputs=[search_prompt], outputs=None)
	# The button itself is passed as the extra `increase` input, so search_index
	# moves max_return to the next larger choice before searching again.
	load_more_results_btn.click(search_index, inputs=[search_prompt, shared_drive, key_stage, sort_by, max_return, user_num, title_search, image_type, load_more_results_btn], outputs=[output_df, max_return, load_more_results_btn])
	# Run the logging JavaScript snippet when the page loads.
	app.load(_js = logging_js)

	# Credentials are checked by same_auth; no message is shown on the login form.
	app.auth = (same_auth)
	app.auth_message = ''

	# FastAPI application that the Gradio UI is mounted on further down.
	fapi = FastAPI()

	# Session middleware; its secret key comes from the environment.
	_session_secret = os.environ['session_key']
	fapi.add_middleware(SessionMiddleware, secret_key=_session_secret)

	@fapi.middleware("http")
	async def add_session_hash(request: Request, call_next):
	    """Copy the incoming 'session' cookie onto the response as HTTP-only.

	    Requests without a (non-empty) 'session' cookie pass through unchanged.
	    """
	    response = await call_next(request)
	    session_cookie = request.cookies.get('session')
	    if not session_cookie:
	        return response
	    response.set_cookie(key='session', value=session_cookie, httponly=True)
	    return response

	# custom get request handler with params to flag clicks
	@fapi.get("/track")
	async def track(url: str, q: str, request: Request):
	    """Record a click on a result in the logging dataset.

	    Args:
	        url: The clicked URL (query parameter).
	        q: The search term that was active (query parameter).
	        request: The incoming request; its ``access-token`` cookie is logged
	            as the session id.

	    Returns:
	        dict: ``{"message": "ok"}``.
	    """
	    if q is None:
	        q = ''

	    # A request without the cookie is logged with an empty session id
	    # instead of failing with KeyError.
	    session_id = request.cookies.get('access-token', '')
	    logger.flag([url, q, session_id, str(datetime.now())])
	    return {"message": "ok"}

	# Attach the Gradio Blocks app to the FastAPI app at the site root.
	app2 = gr.mount_gradio_app(fapi, app, path="/")

	# When executed as a script, serve the combined app with uvicorn.
	if __name__ == "__main__":
	    uvicorn.run(app2, port=7860, host="0.0.0.0")