Bradley
committed on
Commit
•
2d44025
0
Parent(s):
Duplicate from bradley6597/illustration-testing
Browse files- .gitattributes +35 -0
- README.md +13 -0
- app.py +196 -0
- functions.py +165 -0
- requirements.txt +8 -0
- style.css +14 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
small_data.csv filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Illustration Testing
|
3 |
+
emoji: ⚡
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: red
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.20.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
duplicated_from: bradley6597/illustration-testing
|
11 |
+
---
|
12 |
+
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import functions as funky
|
2 |
+
import pandas as pd
|
3 |
+
import gradio as gr
|
4 |
+
import os
|
5 |
+
from datasets import load_dataset
|
6 |
+
from huggingface_hub import login
|
7 |
+
import numpy as np
|
8 |
+
from fastapi import FastAPI, Request
|
9 |
+
import uvicorn
|
10 |
+
from starlette.middleware.sessions import SessionMiddleware
|
11 |
+
import fastapi
|
12 |
+
|
13 |
+
# Authenticate with the Hugging Face Hub so the private dataset below can be loaded.
login(token = os.environ['HUB_TOKEN'])


# Dataset-backed click logger: each flag appends (clicked_url, search_term,
# session hash) to a private HF dataset.
logger = gr.HuggingFaceDatasetSaver(os.environ['HUB_TOKEN'], dataset_name='illustration_gdrive_logging', organization=None, private=True)
logger.setup([gr.Text(label="clicked_url"), gr.Text(label="seach_term"), gr.Text(label = 'sessionhash')], './flagged_data_points')

# JS injected on page load: defines a global magicFunc(x) that reports the
# clicked Drive file id plus the current search-box text to the /track endpoint.
logging_js = '''
function magicFunc(x){
let script = document.createElement('script');
script.innerHTML = "async function magicFunc(x){let z = document.getElementById('search_term').getElementsByTagName('textarea')[0].value; await fetch('/track?url=' + x + '&q=' + z)}";
document.head.appendChild(script);
}
'''

# Source data: one row per illustration stored on Google Drive.
dataset = load_dataset("bradley6597/illustration-test")
df = pd.DataFrame(dataset['train']).drop_duplicates()

# Build the searchable link table. 'url' ends up as an HTML thumbnail that
# opens the Drive file in a new tab and reports the click via magicFunc.
ill_links = df.copy()
ill_links = ill_links[ill_links['Description'] != 'Moved'].copy()
# Extract the Drive file id out of the share link.
ill_links['code'] = ill_links['link'].str.replace("https://drive.google.com/file/d/", "", regex = False)
ill_links['code'] = ill_links['code'].str.replace("/view?usp=drivesdk", "", regex = False)
# ill_links['image_code'] = 'https://lh3.google.com/u/0/d/' + ill_links['code'] + '=k'
# Thumbnail URL served by Google at a fixed 320x304 size; /u/0/ is the account
# index and is swapped per-user later.
ill_links['image_code'] = 'https://lh3.google.com/u/0/d/' + ill_links['code'] + '=w320-h304'
ill_links['image_code'] = '<center><a href="' + ill_links['link'] + '" target="_blank" onclick="magicFunc(\'' + ill_links['code'] + '\')"><img src="' + ill_links['image_code'] + '" style="max-height:400px; max-width:200px"></a></center>'
# Basename of the file (strip the leading directory path).
ill_links['filename'] = ill_links['file'].str.replace(".*\\/", "", regex = True)
# First path component after the Shared Drives mount = the shared drive name.
ill_links['shared_drive'] = ill_links['file'].str.replace("/content/drive/Shareddrives/", "", regex = False)
ill_links['shared_drive'] = ill_links['shared_drive'].str.replace("(.*?)\\/.*", "\\1", regex = True)
ill_links['Description'] = ill_links['Description'].str.replace("No Description", "", regex = False)

# Second copy used for "title only" search (abstract excludes the description).
ill_links_title = ill_links.copy()

# Columns expected by funky.index_documents: ID, title, url, abstract, filepath.
ill_links['ID'] = ill_links.index
ill_links_title['ID'] = ill_links_title.index
ill_links['title'] = ill_links['filename']
ill_links_title['title'] = ill_links_title['filename']
ill_links['url'] = ill_links['image_code']
ill_links_title['url'] = ill_links_title['image_code']
# Main index searches filename words plus the description text.
ill_links['abstract'] = ill_links['filename'].str.replace("\\-|\\_", " ", regex = True) + ' ' + ill_links['Description'].str.replace(",", " ", regex = False).astype(str)
ill_links_title['abstract'] = ill_links_title['filename'].str.replace('\\-|\\_', " ", regex = True)
ill_links['filepath'] = ill_links['file']
ill_links_title['filepath'] = ill_links_title['file']
# Path relative to the "KS1 EYFS" folder, used for key-stage filtering.
ill_links['post_filepath'] = ill_links['filepath'].str.replace(".*?\\/KS1 EYFS\\/", "", regex = True)
ill_links_title['post_filepath'] = ill_links_title['filepath'].str.replace(".*?\\/KS1 EYFS\\/", "", regex = True)
ill_links = ill_links[['ID', 'title', 'url', 'abstract', 'Date Created', 'filepath', 'post_filepath']] if False else ill_links[['ID', 'title', 'url', 'abstract', 'filepath', 'Date Created', 'post_filepath']]
ill_links_title = ill_links_title[['ID', 'title', 'url', 'abstract', 'filepath', 'Date Created', 'Description', 'post_filepath']]

# A row of five copies of the first thumbnail, one per Google account index
# /u/0/../u/4/, so the user can tell which account index serves images for them.
ill_check_lst = []
for i in range(0, 5):
    tmp_links = ill_links['url'].iloc[0].replace("/u/0/", f"/u/{i}/")
    tmp_links = tmp_links.replace('max-width:200px', 'max-width:25%')
    tmp_links = tmp_links.replace("<center>", "")
    tmp_links = tmp_links.replace("</center>", "")
    tmp_links = f'<p>{i}</p>' + tmp_links
    ill_check_lst.append(tmp_links)
ill_check_df = pd.DataFrame(ill_check_lst).T
ill_check_html = ill_check_df.to_html(escape = False, render_links = True, index = False, header = False)

# Two inverted indexes: full (title + description) and title-only.
ind_main, doc_main, tf_main = funky.index_documents(ill_links)
ind_title, doc_title, tf_title = funky.index_documents(ill_links_title)
74 |
+
def same_auth(username, password):
    """Gradio auth callback: every user shares one credential pair read from env.

    Both comparisons are evaluated eagerly (bitwise `&` on bools) so a missing
    env var always surfaces, matching the original behavior.
    """
    user_ok = username == os.environ['username']
    pass_ok = password == os.environ['password']
    return user_ok & pass_ok
76 |
+
|
77 |
+
|
78 |
+
def search_index(search_text, sd, ks, sort_by, max_results, user_num, search_title):
    """Run a search and render the results as an HTML grid.

    Parameters:
    - search_text: raw query string from the textbox
    - sd: selected shared drives (list)
    - ks: selected key stages (list of 'EYFS'/'KS1'/'KS2')
    - sort_by: 'Relevance' | 'Date Created' | 'A-Z'
    - max_results: max number of rows to show (string from dropdown)
    - user_num: Google account index to substitute into /u/<n>/ image URLs
    - search_title: if True, search the title-only index

    Returns an HTML string (a 5-column table of thumbnails, or a
    "No Results Found" message).
    """
    if search_title:
        output = funky.search(tf_title, doc_title, ind_title, search_text, search_type = 'AND', ranking = True)
    else:
        output = funky.search(tf_main, doc_main, ind_main, search_text, search_type='AND', ranking = True)
    # Ranked results are (document, score) pairs; drop the float scores and
    # keep the document Series, then rebuild a frame.
    output = [x for o in output for x in o if type(x) is not float]
    output_df = pd.DataFrame(output).reset_index(drop = True)

    if output_df.shape[0] > 0:

        # Point the thumbnails at the account index this user verified above.
        output_df['url'] = output_df['url'].str.replace("/u/0/", f"/u/{int(user_num)}/", regex = False)
        # Only filter by shared drive when exactly one drive is selected.
        if len(sd) == 1:
            output_df = output_df[(output_df['filepath'].str.contains(str(sd[0]), regex = False))]
        if len(ks) > 0:
            # e.g. 'eyfs|ks1' — regex alternation over the chosen key stages.
            keystage_filter = '|'.join(ks).lower()
            if search_title:
                # Title-only frame keeps Description separate; fold it in so the
                # key-stage filter can still match it.
                output_df['abstract'] = output_df['abstract'] + ' ' + output_df['Description']

            output_df['abstract'] = output_df['abstract'].str.lower()
            output_df['post_filepath'] = output_df['post_filepath'].str.lower()
            # Rows whose text mentions no key stage at all are kept (can't tell
            # which stage they belong to) but ranked after tagged rows.
            output_df['missing_desc'] = np.where(output_df['abstract'].str.contains('eyfs|ks1|ks2', regex = True), 0, 1)
            output_df2 = output_df[(output_df['abstract'].str.contains(keystage_filter, regex = True) | (output_df['missing_desc'] == 1))].copy()
            output_df2 = output_df2[(output_df2['post_filepath'].str.contains(keystage_filter, regex = True))]
            # Fall back to a folder-path-only match if the combined filter
            # removed everything.
            if output_df2.shape[0] == 0:
                output_df2 = output_df[(output_df['post_filepath'].str.contains(keystage_filter, regex = True))]

        # NOTE(review): if ks is empty, output_df2 (and missing_desc) are never
        # created and the lines below raise — the UI defaults make ks non-empty,
        # but this path is unguarded. Confirm intended.
        output_df2['ind'] = output_df2.index
        if sort_by == 'Relevance':
            # Original search order is relevance; missing_desc pushes untagged
            # rows to the back.
            output_df2 = output_df2.sort_values(by = ['missing_desc', 'ind'], ascending = [True, True])
        elif sort_by == 'Date Created':
            output_df2 = output_df2.sort_values(by = ['Date Created'], ascending = False)
        elif sort_by == 'A-Z':
            output_df2 = output_df2.sort_values(by = ['title'], ascending = True)

        output_df2 = output_df2.head(int(max_results))
        output_df2 = output_df2[['url']].reset_index(drop = True)

        # Reflow the flat result list into a 5-column grid: column x holds
        # every 5th result starting at x.
        max_cols = 5
        output_df2['row'] = output_df2.index % max_cols
        for x in range(0, max_cols):
            tmp = output_df2[output_df2['row'] == x].reset_index(drop = True)
            tmp = tmp[['url']]
            if x == 0:
                final_df = tmp
            else:
                final_df = pd.concat([final_df, tmp], axis = 1)

        # Ragged last row: blank out the NaNs produced by the column concat.
        final_df = final_df.fillna('')
    else:
        final_df = pd.DataFrame(['<h3>No Results Found :(</h3>'])

    if final_df.shape[0] == 0 :
        final_df = pd.DataFrame(['<h3>No Results Found :(</h3>'])

    return('<center>' +
           final_df.to_html(escape = False, render_links = True, index = False, header = False) +
           '</center>')
135 |
+
|
136 |
+
|
137 |
+
def log_clicks(x):
    """Debug helper: echo a clicked value to stdout.

    Not wired to any event in the visible code — presumably superseded by the
    /track endpoint; confirm before removing.
    """
    print(x)
139 |
+
|
140 |
+
|
141 |
+
# UI layout: account-check strip on top, then search controls, then the
# HTML results grid.
with gr.Blocks(css="style.css") as app:
    with gr.Row():
        with gr.Column(min_width = 10):
            with gr.Row():
                gr.HTML("<center><p>If you can't see the images please make sure you are signed in to your Twinkl account on Google & you have access to the Shared Drives you are searching :)</p></center>")
                # Same thumbnail rendered for account indexes 0-4 so the user
                # can report which /u/<n>/ works for them.
                gr.HTML(ill_check_html)
                user_num = gr.Number(value = 0, label = 'Put lowest number of the alarm clock you can see')
    with gr.Row():
        search_prompt = gr.Textbox(placeholder = 'search for an illustration', label = 'Search', elem_id = 'search_term')
        title_search = gr.Checkbox(label = 'Search title only')
    # with gr.Row():
    shared_drive = gr.Dropdown(choices = ['Illustrations - 01-10 to 07-22', 'Illustrations - Now'], multiselect = True, label = 'Shared Drive', value = ['Illustrations - 01-10 to 07-22', 'Illustrations - Now'])
    key_stage = gr.Dropdown(choices = ['EYFS', 'KS1', 'KS2'], multiselect = True, label = 'Key Stage', value = ['EYFS', 'KS1', 'KS2'])
    sort_by = gr.Dropdown(choices = ['Relevance', 'Date Created', 'A-Z'], value = 'Relevance', multiselect = False, label = 'Sort By')
    max_return = gr.Dropdown(choices = ['10', '25', '50', '75', '100', '250', '500'], value = '10', multiselect = False, label = 'No. of Results to Return')
    with gr.Row():
        search_button = gr.Button(value="Search!")
    with gr.Row():
        output_df = gr.HTML()
    # Button click and Enter-in-textbox both trigger the same search.
    search_button.click(search_index, inputs=[search_prompt, shared_drive, key_stage, sort_by, max_return, user_num, title_search], outputs=output_df)
    search_prompt.submit(search_index, inputs=[search_prompt, shared_drive, key_stage, sort_by, max_return, user_num, title_search], outputs=output_df)
    # Inject the click-tracking JS once the page loads.
    app.load(_js = logging_js)

# Shared-credential login for the whole app (see same_auth).
app.auth = (same_auth)
app.auth_message = ''
166 |
+
|
167 |
+
|
168 |
+
# Outer FastAPI app: hosts the /track logging endpoint and (below) the
# mounted Gradio app.
fapi = FastAPI()

# Signed session cookie support; the secret comes from the Space's env.
fapi.add_middleware(SessionMiddleware, secret_key=os.environ['session_key'])
171 |
+
|
172 |
+
@fapi.middleware("http")
async def add_session_hash(request: Request, call_next):
    """HTTP middleware: run the request, then mirror the incoming 'session'
    cookie back onto the response as an http-only cookie so the session
    persists across requests."""
    resp = await call_next(request)
    session_cookie = request.cookies.get('session')
    if session_cookie:
        resp.set_cookie(key='session', value=session_cookie, httponly=True)
    return resp
179 |
+
|
180 |
+
# custom get request handler with params to flag clicks
@ fapi.get("/track")
async def track(url: str, q: str, request: Request):
    """Record a result click: url is the Drive file id, q is the search text.

    Called from the injected magicFunc JS; writes a row to the HF logging
    dataset via `logger`.
    """
    if q is None:
        q = ''
    # NOTE(review): request.cookies['access-token'] raises KeyError when the
    # cookie is absent — presumably Gradio's auth cookie is always set here,
    # but a .get() with a default would be safer. Confirm.
    logger.flag([url, q, request.cookies['access-token']])
    return {"message": "ok"}
189 |
+
|
190 |
+
|
191 |
+
# mount Gradio app to FastAPI app (Gradio served at /, /track stays on fapi)
app2 = gr.mount_gradio_app(fapi, app, path="/")
# serve the app on the standard HF Spaces port
if __name__ == "__main__":
    uvicorn.run(app2, host="0.0.0.0", port=7860)
196 |
+
|
functions.py
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import string
|
3 |
+
from collections import Counter
|
4 |
+
import math
|
5 |
+
from tqdm import tqdm
|
6 |
+
from itertools import combinations
|
7 |
+
from nltk.stem import PorterStemmer
|
8 |
+
|
9 |
+
|
10 |
+
# top 25 most common words in English and "wikipedia":
# https://en.wikipedia.org/wiki/Most_common_words_in_English
stop_words = set(['the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
                  'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
                  'do', 'at', 'this', 'but', 'his', 'by', 'from', 'wikipedia'])
# Pre-compiled character class matching any single ASCII punctuation character.
punct = re.compile(f'[{re.escape(string.punctuation)}]')
16 |
+
|
17 |
+
def tokenize(text):
    """Split *text* into tokens on runs of whitespace."""
    return text.split()
20 |
+
|
21 |
+
def lowercase_filter(tokens):
    """Return a new list with every token lower-cased."""
    return [tok.lower() for tok in tokens]
24 |
+
|
25 |
+
def punctuation_filter(tokens):
    """Strip every punctuation character from each token.

    Tokens that were all punctuation become empty strings; callers filter
    those out later (see analyze)."""
    return [punct.sub('', tok) for tok in tokens]
28 |
+
|
29 |
+
def stopword_filter(tokens):
    """Drop tokens present in the module-level stop_words set."""
    return [tok for tok in tokens if tok not in stop_words]
32 |
+
|
33 |
+
def stem_filter(tokens):
    """Reduce each token to its Porter stem (e.g. 'running' -> 'run')."""
    stemmer = PorterStemmer()
    return [stemmer.stem(tok) for tok in tokens]
37 |
+
|
38 |
+
def analyze(text):
    """Full text-normalisation pipeline.

    Tokenize, lower-case, strip punctuation, remove stopwords, then stem;
    tokens that end up empty are dropped."""
    pipeline = (tokenize, lowercase_filter, punctuation_filter,
                stopword_filter, stem_filter)
    tokens = text
    for stage in pipeline:
        tokens = stage(tokens)
    return [tok for tok in tokens if tok]
46 |
+
|
47 |
+
|
48 |
+
# Setup an index and document structure to reference later
def index_documents(df):
    """Build an inverted index over *df*.

    Expects columns 'ID', 'title' and 'abstract'. Returns a tuple of:
    - ind: token -> set of document IDs containing it
    - doc: document ID -> the row (pandas Series) for that document
    - term_frequencies: Counter of each analyzed token over the whole corpus

    Side effect: adds a 'title_abs' column to *df*.
    """
    ind = {}
    doc = {}
    for i in tqdm(range(0, df.shape[0])):
        # First occurrence of an ID wins as the canonical document row.
        if df['ID'].iloc[i] not in doc:
            doc[df['ID'].iloc[i]] = df.iloc[i]
        full_text = ' '.join([df['title'].iloc[i], df['abstract'].iloc[i]])
        for token in analyze(full_text):
            if token not in ind:
                ind[token] = set()
            ind[token].add(df['ID'].iloc[i])
        # Coarse progress line alongside the tqdm bar.
        if i % 5000 == 0:
            print(f'Indexed {i} documents', end='\r')
    # Corpus-wide term frequencies, used as the TF part of the ranking score.
    df['title_abs'] = df['title'] + ' ' + df['abstract']
    all_text = ' '.join(df['title_abs'])
    term_frequencies = Counter(analyze(all_text))
    return(ind, doc, term_frequencies)
66 |
+
|
67 |
+
|
68 |
+
def rank(termfreq, doc, ind, analyzed_query, documents):
    """Score *documents* against *analyzed_query* and sort best-first.

    Each query token contributes corpus_term_frequency * idf, where
    idf = log10(total_docs / docs_containing_token). Note the score depends
    only on the query tokens, so every document in *documents* receives the
    same score and the original order is preserved by the stable sort.

    Returns a list of (document, score) pairs, highest score first;
    empty list if *documents* is empty.
    """
    if not documents:
        return []
    total_docs = len(doc)
    scored = []
    for document in documents:
        score = 0.0
        for token in analyzed_query:
            postings = ind.get(token, set())
            if not postings:
                # Token appears nowhere in the corpus — contributes nothing.
                continue
            score += termfreq.get(token, 0) * math.log10(total_docs / len(postings))
        scored.append((document, score))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored
82 |
+
|
83 |
+
|
84 |
+
|
85 |
+
def search(tf, doc, ind, query, search_type='AND', ranking=False):
    """
    Search; this will return documents that contain words from the query,
    and rank them if requested (sets are fast, but unordered).

    Parameters:
      - tf: the term frequencies. Taken from indexing documents
      - doc: documents. Taken from indexing documents
      - ind: index. Taken from indexing documents
      - query: the query string. Supports '-word' exclusion and "quoted"
        must-match phrases.
      - search_type: ('AND', 'OR') do all query terms have to match, or just one
      - score: (True, False) if True, rank results based on TF-IDF score
    """
    if search_type not in ('AND', 'OR'):
        return []

    analyzed_query = analyze(query)
    # Tokens prefixed with '-' are exclusions: their postings are subtracted.
    minus_query = [x[1:] for x in query.split() if x[0] == '-']
    minus_query = [q for mq in minus_query for q in analyze(mq)]

    # Double-quoted phrases are "must contain" terms.
    specific_query = re.findall('"([^"]*)"', query)
    specific_query = ' '.join(specific_query)
    specific_query = [x.replace('"', '') for x in specific_query.split()]
    specific_query = [q for sq in specific_query for q in analyze(sq)]

    # Posting sets (document-ID sets) for each token group.
    results = [ind.get(token, set()) for token in analyzed_query]
    minus_results = [ind.get(token, set()) for token in minus_query]
    specific_results = [ind.get(token, set()) for token in specific_query]

    # Remove excluded documents from every posting set, then drop emptied sets.
    if len(minus_results) > 0:
        for j in range(0, len(results)):
            for i in range(0, len(minus_results)):
                results[j] = results[j] - minus_results[i]
        results = [r for r in results if len(r) > 0]

    if len(results) > 0:
        if search_type == 'AND':
            # Deal with users who use "" to get specific results
            if len(specific_results) > 0:
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Relax the AND: try every combination of x query terms,
                    # from all terms down to pairs, always intersected with the
                    # quoted must-match terms; stop at the first size that
                    # yields any documents.
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for c in range(0, len(all_combos)):
                            combo_len_list.append(len(set.intersection(*all_combos[c], *specific_results)))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    # NOTE(review): if the loop body never runs (single-term
                    # query), combo_len_list is unbound here — confirm this
                    # path is unreachable in practice.
                    if max(combo_len_list) > 0:
                        max_index = combo_len_list.index(max(combo_len_list))
                        documents = [doc[doc_id] for doc_id in set.intersection(*all_combos[max_index])]
            else:
                # all tokens must be in the document
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Iterate from length of search query backwards until some documents are returned.
                    # Looks at all combinations
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for c in range(0, len(all_combos)):
                            combo_len_list.append(len(set.intersection(*all_combos[c])))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    max_index = combo_len_list.index(max(combo_len_list))
                    documents = [doc[doc_id] for doc_id in set.intersection(*all_combos[max_index])]
                    # Last resort: union — any document matching any term.
                    if len(documents) == 0:
                        documents = [doc[doc_id] for doc_id in set.union(*results)]
        if search_type == 'OR':
            # only one token has to be in the document
            documents = [doc[doc_id] for doc_id in set.union(*results)]

        if ranking:
            return(rank(tf, doc, ind, analyzed_query, documents))
    else:
        documents = []
    return documents
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas
|
2 |
+
tqdm
|
3 |
+
numpy
|
4 |
+
nltk
|
5 |
+
starlette==0.25.0
|
6 |
+
gradio==3.19.1
|
7 |
+
fastapi==0.92.0
|
8 |
+
itsdangerous==2.0.1
|
style.css
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
footer{
|
2 |
+
display: none !important;
|
3 |
+
}
|
4 |
+
|
5 |
+
td img{
|
6 |
+
background-image:
|
7 |
+
linear-gradient(45deg, lightgrey 25%, transparent 25%),
|
8 |
+
linear-gradient(135deg, lightgrey 25%, transparent 25%),
|
9 |
+
linear-gradient(45deg, transparent 75%, lightgrey 75%),
|
10 |
+
linear-gradient(135deg, transparent 75%, lightgrey 75%);
|
11 |
+
|
12 |
+
background-size: 20px 20px;
|
13 |
+
background-position: 0 0, 10px 0, 10px -10px, 0px 10px;
|
14 |
+
}
|