Bradley
committed on
Commit
•
2d44025
0
Parent(s):
Duplicate from bradley6597/illustration-testing
Browse files- .gitattributes +35 -0
- README.md +13 -0
- app.py +196 -0
- functions.py +165 -0
- requirements.txt +8 -0
- style.css +14 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
small_data.csv filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Illustration Testing
|
3 |
+
emoji: ⚡
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: red
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.20.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
duplicated_from: bradley6597/illustration-testing
|
11 |
+
---
|
12 |
+
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import functions as funky
|
2 |
+
import pandas as pd
|
3 |
+
import gradio as gr
|
4 |
+
import os
|
5 |
+
from datasets import load_dataset
|
6 |
+
from huggingface_hub import login
|
7 |
+
import numpy as np
|
8 |
+
from fastapi import FastAPI, Request
|
9 |
+
import uvicorn
|
10 |
+
from starlette.middleware.sessions import SessionMiddleware
|
11 |
+
import fastapi
|
12 |
+
|
13 |
+
# Authenticate with the Hugging Face Hub so the private dataset below can be loaded.
login(token = os.environ['HUB_TOKEN'])


# Dataset-backed click logger: each flag appends (clicked_url, search_term,
# session hash) to a private HF dataset.
logger = gr.HuggingFaceDatasetSaver(os.environ['HUB_TOKEN'], dataset_name='illustration_gdrive_logging', organization=None, private=True)
logger.setup([gr.Text(label="clicked_url"), gr.Text(label="seach_term"), gr.Text(label = 'sessionhash')], './flagged_data_points')

# JS injected on page load: defines a global magicFunc(x) that reports the
# clicked Drive file id plus the current search-box text to the /track endpoint.
logging_js = '''
function magicFunc(x){
let script = document.createElement('script');
script.innerHTML = "async function magicFunc(x){let z = document.getElementById('search_term').getElementsByTagName('textarea')[0].value; await fetch('/track?url=' + x + '&q=' + z)}";
document.head.appendChild(script);
}
'''

# Source data: one row per illustration stored on Google Drive.
dataset = load_dataset("bradley6597/illustration-test")
df = pd.DataFrame(dataset['train']).drop_duplicates()

# Build the searchable link table. 'url' ends up as an HTML thumbnail that
# opens the Drive file in a new tab and reports the click via magicFunc.
ill_links = df.copy()
ill_links = ill_links[ill_links['Description'] != 'Moved'].copy()
# Extract the Drive file id out of the share link.
ill_links['code'] = ill_links['link'].str.replace("https://drive.google.com/file/d/", "", regex = False)
ill_links['code'] = ill_links['code'].str.replace("/view?usp=drivesdk", "", regex = False)
# ill_links['image_code'] = 'https://lh3.google.com/u/0/d/' + ill_links['code'] + '=k'
# Thumbnail URL served by Google at a fixed 320x304 size; /u/0/ is the account
# index and is swapped per-user later.
ill_links['image_code'] = 'https://lh3.google.com/u/0/d/' + ill_links['code'] + '=w320-h304'
ill_links['image_code'] = '<center><a href="' + ill_links['link'] + '" target="_blank" onclick="magicFunc(\'' + ill_links['code'] + '\')"><img src="' + ill_links['image_code'] + '" style="max-height:400px; max-width:200px"></a></center>'
# Basename of the file (strip the leading directory path).
ill_links['filename'] = ill_links['file'].str.replace(".*\\/", "", regex = True)
# First path component after the Shared Drives mount = the shared drive name.
ill_links['shared_drive'] = ill_links['file'].str.replace("/content/drive/Shareddrives/", "", regex = False)
ill_links['shared_drive'] = ill_links['shared_drive'].str.replace("(.*?)\\/.*", "\\1", regex = True)
ill_links['Description'] = ill_links['Description'].str.replace("No Description", "", regex = False)

# Second copy used for "title only" search (abstract excludes the description).
ill_links_title = ill_links.copy()

# Columns expected by funky.index_documents: ID, title, url, abstract, filepath.
ill_links['ID'] = ill_links.index
ill_links_title['ID'] = ill_links_title.index
ill_links['title'] = ill_links['filename']
ill_links_title['title'] = ill_links_title['filename']
ill_links['url'] = ill_links['image_code']
ill_links_title['url'] = ill_links_title['image_code']
# Main index searches filename words plus the description text.
ill_links['abstract'] = ill_links['filename'].str.replace("\\-|\\_", " ", regex = True) + ' ' + ill_links['Description'].str.replace(",", " ", regex = False).astype(str)
ill_links_title['abstract'] = ill_links_title['filename'].str.replace('\\-|\\_', " ", regex = True)
ill_links['filepath'] = ill_links['file']
ill_links_title['filepath'] = ill_links_title['file']
# Path relative to the "KS1 EYFS" folder, used for key-stage filtering.
ill_links['post_filepath'] = ill_links['filepath'].str.replace(".*?\\/KS1 EYFS\\/", "", regex = True)
ill_links_title['post_filepath'] = ill_links_title['filepath'].str.replace(".*?\\/KS1 EYFS\\/", "", regex = True)
ill_links = ill_links[['ID', 'title', 'url', 'abstract', 'Date Created', 'filepath', 'post_filepath']] if False else ill_links[['ID', 'title', 'url', 'abstract', 'filepath', 'Date Created', 'post_filepath']]
ill_links_title = ill_links_title[['ID', 'title', 'url', 'abstract', 'filepath', 'Date Created', 'Description', 'post_filepath']]

# A row of five copies of the first thumbnail, one per Google account index
# /u/0/../u/4/, so the user can tell which account index serves images for them.
ill_check_lst = []
for i in range(0, 5):
    tmp_links = ill_links['url'].iloc[0].replace("/u/0/", f"/u/{i}/")
    tmp_links = tmp_links.replace('max-width:200px', 'max-width:25%')
    tmp_links = tmp_links.replace("<center>", "")
    tmp_links = tmp_links.replace("</center>", "")
    tmp_links = f'<p>{i}</p>' + tmp_links
    ill_check_lst.append(tmp_links)
ill_check_df = pd.DataFrame(ill_check_lst).T
ill_check_html = ill_check_df.to_html(escape = False, render_links = True, index = False, header = False)

# Two inverted indexes: full (title + description) and title-only.
ind_main, doc_main, tf_main = funky.index_documents(ill_links)
ind_title, doc_title, tf_title = funky.index_documents(ill_links_title)
74 |
+
def same_auth(username, password):
    """Gradio auth callback: every user shares one credential pair read from env.

    Both comparisons are evaluated eagerly (bitwise `&` on bools) so a missing
    env var always surfaces, matching the original behavior.
    """
    user_ok = username == os.environ['username']
    pass_ok = password == os.environ['password']
    return user_ok & pass_ok
76 |
+
|
77 |
+
|
78 |
+
def search_index(search_text, sd, ks, sort_by, max_results, user_num, search_title):
    """Run a search and render the results as an HTML grid.

    Parameters:
    - search_text: raw query string from the textbox
    - sd: selected shared drives (list)
    - ks: selected key stages (list of 'EYFS'/'KS1'/'KS2')
    - sort_by: 'Relevance' | 'Date Created' | 'A-Z'
    - max_results: max number of rows to show (string from dropdown)
    - user_num: Google account index to substitute into /u/<n>/ image URLs
    - search_title: if True, search the title-only index

    Returns an HTML string (a 5-column table of thumbnails, or a
    "No Results Found" message).
    """
    if search_title:
        output = funky.search(tf_title, doc_title, ind_title, search_text, search_type = 'AND', ranking = True)
    else:
        output = funky.search(tf_main, doc_main, ind_main, search_text, search_type='AND', ranking = True)
    # Ranked results are (document, score) pairs; drop the float scores and
    # keep the document Series, then rebuild a frame.
    output = [x for o in output for x in o if type(x) is not float]
    output_df = pd.DataFrame(output).reset_index(drop = True)

    if output_df.shape[0] > 0:

        # Point the thumbnails at the account index this user verified above.
        output_df['url'] = output_df['url'].str.replace("/u/0/", f"/u/{int(user_num)}/", regex = False)
        # Only filter by shared drive when exactly one drive is selected.
        if len(sd) == 1:
            output_df = output_df[(output_df['filepath'].str.contains(str(sd[0]), regex = False))]
        if len(ks) > 0:
            # e.g. 'eyfs|ks1' — regex alternation over the chosen key stages.
            keystage_filter = '|'.join(ks).lower()
            if search_title:
                # Title-only frame keeps Description separate; fold it in so the
                # key-stage filter can still match it.
                output_df['abstract'] = output_df['abstract'] + ' ' + output_df['Description']

            output_df['abstract'] = output_df['abstract'].str.lower()
            output_df['post_filepath'] = output_df['post_filepath'].str.lower()
            # Rows whose text mentions no key stage at all are kept (can't tell
            # which stage they belong to) but ranked after tagged rows.
            output_df['missing_desc'] = np.where(output_df['abstract'].str.contains('eyfs|ks1|ks2', regex = True), 0, 1)
            output_df2 = output_df[(output_df['abstract'].str.contains(keystage_filter, regex = True) | (output_df['missing_desc'] == 1))].copy()
            output_df2 = output_df2[(output_df2['post_filepath'].str.contains(keystage_filter, regex = True))]
            # Fall back to a folder-path-only match if the combined filter
            # removed everything.
            if output_df2.shape[0] == 0:
                output_df2 = output_df[(output_df['post_filepath'].str.contains(keystage_filter, regex = True))]

        # NOTE(review): if ks is empty, output_df2 (and missing_desc) are never
        # created and the lines below raise — the UI defaults make ks non-empty,
        # but this path is unguarded. Confirm intended.
        output_df2['ind'] = output_df2.index
        if sort_by == 'Relevance':
            # Original search order is relevance; missing_desc pushes untagged
            # rows to the back.
            output_df2 = output_df2.sort_values(by = ['missing_desc', 'ind'], ascending = [True, True])
        elif sort_by == 'Date Created':
            output_df2 = output_df2.sort_values(by = ['Date Created'], ascending = False)
        elif sort_by == 'A-Z':
            output_df2 = output_df2.sort_values(by = ['title'], ascending = True)

        output_df2 = output_df2.head(int(max_results))
        output_df2 = output_df2[['url']].reset_index(drop = True)

        # Reflow the flat result list into a 5-column grid: column x holds
        # every 5th result starting at x.
        max_cols = 5
        output_df2['row'] = output_df2.index % max_cols
        for x in range(0, max_cols):
            tmp = output_df2[output_df2['row'] == x].reset_index(drop = True)
            tmp = tmp[['url']]
            if x == 0:
                final_df = tmp
            else:
                final_df = pd.concat([final_df, tmp], axis = 1)

        # Ragged last row: blank out the NaNs produced by the column concat.
        final_df = final_df.fillna('')
    else:
        final_df = pd.DataFrame(['<h3>No Results Found :(</h3>'])

    if final_df.shape[0] == 0 :
        final_df = pd.DataFrame(['<h3>No Results Found :(</h3>'])

    return('<center>' +
           final_df.to_html(escape = False, render_links = True, index = False, header = False) +
           '</center>')
135 |
+
|
136 |
+
|
137 |
+
def log_clicks(x):
    """Debug helper: echo a clicked value to stdout.

    Not wired to any event in the visible code — presumably superseded by the
    /track endpoint; confirm before removing.
    """
    print(x)
139 |
+
|
140 |
+
|
141 |
+
# UI layout: account-check strip on top, then search controls, then the
# HTML results grid.
with gr.Blocks(css="style.css") as app:
    with gr.Row():
        with gr.Column(min_width = 10):
            with gr.Row():
                gr.HTML("<center><p>If you can't see the images please make sure you are signed in to your Twinkl account on Google & you have access to the Shared Drives you are searching :)</p></center>")
                # Same thumbnail rendered for account indexes 0-4 so the user
                # can report which /u/<n>/ works for them.
                gr.HTML(ill_check_html)
                user_num = gr.Number(value = 0, label = 'Put lowest number of the alarm clock you can see')
    with gr.Row():
        search_prompt = gr.Textbox(placeholder = 'search for an illustration', label = 'Search', elem_id = 'search_term')
        title_search = gr.Checkbox(label = 'Search title only')
    # with gr.Row():
    shared_drive = gr.Dropdown(choices = ['Illustrations - 01-10 to 07-22', 'Illustrations - Now'], multiselect = True, label = 'Shared Drive', value = ['Illustrations - 01-10 to 07-22', 'Illustrations - Now'])
    key_stage = gr.Dropdown(choices = ['EYFS', 'KS1', 'KS2'], multiselect = True, label = 'Key Stage', value = ['EYFS', 'KS1', 'KS2'])
    sort_by = gr.Dropdown(choices = ['Relevance', 'Date Created', 'A-Z'], value = 'Relevance', multiselect = False, label = 'Sort By')
    max_return = gr.Dropdown(choices = ['10', '25', '50', '75', '100', '250', '500'], value = '10', multiselect = False, label = 'No. of Results to Return')
    with gr.Row():
        search_button = gr.Button(value="Search!")
    with gr.Row():
        output_df = gr.HTML()
    # Button click and Enter-in-textbox both trigger the same search.
    search_button.click(search_index, inputs=[search_prompt, shared_drive, key_stage, sort_by, max_return, user_num, title_search], outputs=output_df)
    search_prompt.submit(search_index, inputs=[search_prompt, shared_drive, key_stage, sort_by, max_return, user_num, title_search], outputs=output_df)
    # Inject the click-tracking JS once the page loads.
    app.load(_js = logging_js)

# Shared-credential login for the whole app (see same_auth).
app.auth = (same_auth)
app.auth_message = ''
166 |
+
|
167 |
+
|
168 |
+
# Outer FastAPI app: hosts the /track logging endpoint and (below) the
# mounted Gradio app.
fapi = FastAPI()

# Signed session cookie support; the secret comes from the Space's env.
fapi.add_middleware(SessionMiddleware, secret_key=os.environ['session_key'])
171 |
+
|
172 |
+
@fapi.middleware("http")
async def add_session_hash(request: Request, call_next):
    """HTTP middleware: run the request, then mirror the incoming 'session'
    cookie back onto the response as an http-only cookie so the session
    persists across requests."""
    resp = await call_next(request)
    session_cookie = request.cookies.get('session')
    if session_cookie:
        resp.set_cookie(key='session', value=session_cookie, httponly=True)
    return resp
179 |
+
|
180 |
+
# custom get request handler with params to flag clicks
@ fapi.get("/track")
async def track(url: str, q: str, request: Request):
    """Record a result click: url is the Drive file id, q is the search text.

    Called from the injected magicFunc JS; writes a row to the HF logging
    dataset via `logger`.
    """
    if q is None:
        q = ''
    # NOTE(review): request.cookies['access-token'] raises KeyError when the
    # cookie is absent — presumably Gradio's auth cookie is always set here,
    # but a .get() with a default would be safer. Confirm.
    logger.flag([url, q, request.cookies['access-token']])
    return {"message": "ok"}
189 |
+
|
190 |
+
|
191 |
+
# mount Gradio app to FastAPI app (Gradio served at /, /track stays on fapi)
app2 = gr.mount_gradio_app(fapi, app, path="/")
# serve the app on the standard HF Spaces port
if __name__ == "__main__":
    uvicorn.run(app2, host="0.0.0.0", port=7860)
196 |
+
|
functions.py
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import string
|
3 |
+
from collections import Counter
|
4 |
+
import math
|
5 |
+
from tqdm import tqdm
|
6 |
+
from itertools import combinations
|
7 |
+
from nltk.stem import PorterStemmer
|
8 |
+
|
9 |
+
|
10 |
+
# top 25 most common words in English and "wikipedia":
# https://en.wikipedia.org/wiki/Most_common_words_in_English
stop_words = set(['the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
                  'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
                  'do', 'at', 'this', 'but', 'his', 'by', 'from', 'wikipedia'])
# Pre-compiled character class matching any single ASCII punctuation character.
punct = re.compile(f'[{re.escape(string.punctuation)}]')
16 |
+
|
17 |
+
def tokenize(text):
    """Split *text* into tokens on runs of whitespace."""
    return text.split()
20 |
+
|
21 |
+
def lowercase_filter(tokens):
    """Return a new list with every token lower-cased."""
    return [tok.lower() for tok in tokens]
24 |
+
|
25 |
+
def punctuation_filter(tokens):
    """Strip every punctuation character from each token.

    Tokens that were all punctuation become empty strings; callers filter
    those out later (see analyze)."""
    return [punct.sub('', tok) for tok in tokens]
28 |
+
|
29 |
+
def stopword_filter(tokens):
    """Drop tokens present in the module-level stop_words set."""
    return [tok for tok in tokens if tok not in stop_words]
32 |
+
|
33 |
+
def stem_filter(tokens):
    """Reduce each token to its Porter stem (e.g. 'running' -> 'run')."""
    stemmer = PorterStemmer()
    return [stemmer.stem(tok) for tok in tokens]
37 |
+
|
38 |
+
def analyze(text):
    """Full text-normalisation pipeline.

    Tokenize, lower-case, strip punctuation, remove stopwords, then stem;
    tokens that end up empty are dropped."""
    pipeline = (tokenize, lowercase_filter, punctuation_filter,
                stopword_filter, stem_filter)
    tokens = text
    for stage in pipeline:
        tokens = stage(tokens)
    return [tok for tok in tokens if tok]
46 |
+
|
47 |
+
|
48 |
+
# Setup an index and document structure to reference later
def index_documents(df):
    """Build an inverted index over *df*.

    Expects columns 'ID', 'title' and 'abstract'. Returns a tuple of:
    - ind: token -> set of document IDs containing it
    - doc: document ID -> the row (pandas Series) for that document
    - term_frequencies: Counter of each analyzed token over the whole corpus

    Side effect: adds a 'title_abs' column to *df*.
    """
    ind = {}
    doc = {}
    for i in tqdm(range(0, df.shape[0])):
        # First occurrence of an ID wins as the canonical document row.
        if df['ID'].iloc[i] not in doc:
            doc[df['ID'].iloc[i]] = df.iloc[i]
        full_text = ' '.join([df['title'].iloc[i], df['abstract'].iloc[i]])
        for token in analyze(full_text):
            if token not in ind:
                ind[token] = set()
            ind[token].add(df['ID'].iloc[i])
        # Coarse progress line alongside the tqdm bar.
        if i % 5000 == 0:
            print(f'Indexed {i} documents', end='\r')
    # Corpus-wide term frequencies, used as the TF part of the ranking score.
    df['title_abs'] = df['title'] + ' ' + df['abstract']
    all_text = ' '.join(df['title_abs'])
    term_frequencies = Counter(analyze(all_text))
    return(ind, doc, term_frequencies)
66 |
+
|
67 |
+
|
68 |
+
def rank(termfreq, doc, ind, analyzed_query, documents):
    """Score *documents* against *analyzed_query* and sort best-first.

    Each query token contributes corpus_term_frequency * idf, where
    idf = log10(total_docs / docs_containing_token). Note the score depends
    only on the query tokens, so every document in *documents* receives the
    same score and the original order is preserved by the stable sort.

    Returns a list of (document, score) pairs, highest score first;
    empty list if *documents* is empty.
    """
    if not documents:
        return []
    total_docs = len(doc)
    scored = []
    for document in documents:
        score = 0.0
        for token in analyzed_query:
            postings = ind.get(token, set())
            if not postings:
                # Token appears nowhere in the corpus — contributes nothing.
                continue
            score += termfreq.get(token, 0) * math.log10(total_docs / len(postings))
        scored.append((document, score))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored
82 |
+
|
83 |
+
|
84 |
+
|
85 |
+
def search(tf, doc, ind, query, search_type='AND', ranking=False):
    """
    Search; this will return documents that contain words from the query,
    and rank them if requested (sets are fast, but unordered).

    Parameters:
      - tf: the term frequencies. Taken from indexing documents
      - doc: documents. Taken from indexing documents
      - ind: index. Taken from indexing documents
      - query: the query string. Supports '-word' exclusion and "quoted"
        must-match phrases.
      - search_type: ('AND', 'OR') do all query terms have to match, or just one
      - score: (True, False) if True, rank results based on TF-IDF score
    """
    if search_type not in ('AND', 'OR'):
        return []

    analyzed_query = analyze(query)
    # Tokens prefixed with '-' are exclusions: their postings are subtracted.
    minus_query = [x[1:] for x in query.split() if x[0] == '-']
    minus_query = [q for mq in minus_query for q in analyze(mq)]

    # Double-quoted phrases are "must contain" terms.
    specific_query = re.findall('"([^"]*)"', query)
    specific_query = ' '.join(specific_query)
    specific_query = [x.replace('"', '') for x in specific_query.split()]
    specific_query = [q for sq in specific_query for q in analyze(sq)]

    # Posting sets (document-ID sets) for each token group.
    results = [ind.get(token, set()) for token in analyzed_query]
    minus_results = [ind.get(token, set()) for token in minus_query]
    specific_results = [ind.get(token, set()) for token in specific_query]

    # Remove excluded documents from every posting set, then drop emptied sets.
    if len(minus_results) > 0:
        for j in range(0, len(results)):
            for i in range(0, len(minus_results)):
                results[j] = results[j] - minus_results[i]
        results = [r for r in results if len(r) > 0]

    if len(results) > 0:
        if search_type == 'AND':
            # Deal with users who use "" to get specific results
            if len(specific_results) > 0:
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Relax the AND: try every combination of x query terms,
                    # from all terms down to pairs, always intersected with the
                    # quoted must-match terms; stop at the first size that
                    # yields any documents.
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for c in range(0, len(all_combos)):
                            combo_len_list.append(len(set.intersection(*all_combos[c], *specific_results)))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    # NOTE(review): if the loop body never runs (single-term
                    # query), combo_len_list is unbound here — confirm this
                    # path is unreachable in practice.
                    if max(combo_len_list) > 0:
                        max_index = combo_len_list.index(max(combo_len_list))
                        documents = [doc[doc_id] for doc_id in set.intersection(*all_combos[max_index])]
            else:
                # all tokens must be in the document
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Iterate from length of search query backwards until some documents are returned.
                    # Looks at all combinations
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for c in range(0, len(all_combos)):
                            combo_len_list.append(len(set.intersection(*all_combos[c])))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    max_index = combo_len_list.index(max(combo_len_list))
                    documents = [doc[doc_id] for doc_id in set.intersection(*all_combos[max_index])]
                    # Last resort: union — any document matching any term.
                    if len(documents) == 0:
                        documents = [doc[doc_id] for doc_id in set.union(*results)]
        if search_type == 'OR':
            # only one token has to be in the document
            documents = [doc[doc_id] for doc_id in set.union(*results)]

        if ranking:
            return(rank(tf, doc, ind, analyzed_query, documents))
    else:
        documents = []
    return documents
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas
|
2 |
+
tqdm
|
3 |
+
numpy
|
4 |
+
nltk
|
5 |
+
starlette==0.25.0
|
6 |
+
gradio==3.19.1
|
7 |
+
fastapi==0.92.0
|
8 |
+
itsdangerous==2.0.1
|
style.css
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
footer{
|
2 |
+
display: none !important;
|
3 |
+
}
|
4 |
+
|
5 |
+
td img{
|
6 |
+
background-image:
|
7 |
+
linear-gradient(45deg, lightgrey 25%, transparent 25%),
|
8 |
+
linear-gradient(135deg, lightgrey 25%, transparent 25%),
|
9 |
+
linear-gradient(45deg, transparent 75%, lightgrey 75%),
|
10 |
+
linear-gradient(135deg, transparent 75%, lightgrey 75%);
|
11 |
+
|
12 |
+
background-size: 20px 20px;
|
13 |
+
background-position: 0 0, 10px 0, 10px -10px, 0px 10px;
|
14 |
+
}
|