# tribunal_search / app.py
# acd424's picture
# Update app.py
# 2b75384
from sentence_transformers import SentenceTransformer, util
import torch
import os
import gradio as gr
import json
import re
import numpy as np
import pickle
from datetime import datetime
from huggingface_hub import Repository
from datasets import load_dataset
############# Read in the data #############
# The Hub token is taken from the HF_token environment variable and handed
# to load_dataset; os.environ.get yields None when the variable is unset.
access_token_1 = os.environ.get("HF_token")
dataset = load_dataset("acd424/tribunal_data", use_auth_token=access_token_1)

# Look the train split up once, then pull out the four columns the app uses:
#   embeddings - "embed" column, compared against the query embedding
#   corpus     - "reason_text" column
#   files      - "file_name" column, used to build the gov.uk links
#   all_cats   - "all_cats" column, shown next to every match
train_split = dataset["train"]
embeddings, corpus, files, all_cats = (
    train_split[column]
    for column in ("embed", "reason_text", "file_name", "all_cats")
)
print("The data has loaded")
'''# for saving a log
DATASET_REPO_URL = "https://huggingface.co/datasets/acd424/tribunal_data"
DATA_FILENAME = "queries_and_responces.txt"
DATA_FILE = os.path.join("data", DATA_FILENAME)
repo = Repository(
local_dir="data", clone_from=DATASET_REPO_URL, use_auth_token=access_token_1
)
'''
################### Functions ##########################
def semantic_search(query, corpus=corpus, corpus_embeddings=embeddings, k=5, min_score=0.5):
    """Return a text report of the ``k`` corpus entries most similar to ``query``.

    The query is embedded with the module-level ``embedder`` and compared to
    every row of ``corpus_embeddings`` using cosine similarity.

    Parameters
    ----------
    query: str
        The query the user wants to search with
    corpus: list
        One entry per embedded chunk. The entry of each match is appended to
        ``https://www.gov.uk/employment-tribunal-decisions/``, so callers are
        expected to pass identifiers usable in that URL (the caller in this
        file passes the file names).
    corpus_embeddings: list or tensor
        One embedding vector per chunk, in the same order as ``corpus`` and
        the module-level ``all_cats``
    k: int
        The number of results to report (default is 5)
    min_score: float
        If the best cosine similarity is below this value, a fallback message
        is returned instead of any matches (default is 0.5)

    Returns
    -------
    str
        One section per match (rank, link, categories, similarity score),
        the fallback message when the best score is below ``min_score``,
        or an empty string when ``corpus_embeddings`` is empty.
    """
    # Nothing to compare against: report no matches instead of failing later.
    if len(corpus_embeddings) == 0:
        return ""

    top_k = min(k, len(corpus_embeddings))
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # cos_sim returns a (1, n) matrix for a single query; keep its only row.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

    # The best score decides whether the query is worth answering at all.
    # max() stays in torch, so it does not matter which device the scores
    # are on, and it avoids sorting the whole score vector.
    if cos_scores.max() < min_score:
        return "The query is either not detailed enough or is perhaps not an appropriate query"

    # topk already yields the scores in descending order together with
    # their positions, so no further sorting is needed per match.
    top_scores, top_indices = torch.topk(cos_scores, k=top_k)

    sections = []
    for rank, (score, idx) in enumerate(zip(top_scores, top_indices), start=1):
        position = int(idx)  # plain int for indexing the Python-side columns
        sections.extend(
            (
                f" Match: {rank} \n",
                f"https://www.gov.uk/employment-tribunal-decisions/{corpus[position]}",
                "\n ======== With Catergories ========= \n",
                all_cats[position],
                f"\n ============================================ {score} \n\n\n",
            )
        )
    return "".join(sections)
def produce_tribunal_out(query, corpus=files, tribunal_embeddings=embeddings):
    """Run the tribunal search for ``query`` and wrap the report for Gradio.

    Parameters
    ----------
    query: str
        Text typed by the user
    corpus: list
        Forwarded to ``semantic_search`` as its ``corpus`` argument
    tribunal_embeddings: list
        Forwarded to ``semantic_search`` as its ``corpus_embeddings`` argument

    Returns
    -------
    The value of ``gr.update(value=...)`` carrying the search report
    """
    # Search first; the timestamp below is taken once the search has finished.
    report = semantic_search(
        query=query, corpus=corpus, corpus_embeddings=tribunal_embeddings
    )

    # Log the query to stdout together with the current local time.
    timestamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    log_line = f"{timestamp}|{query}\n"
    print(log_line)

    '''# write to file
    with open(DATA_FILE, "a") as f:
    f.write(f"{time_now}|{query}\n")
    commit_url = repo.push_to_hub()
    print(commit_url)
    '''
    return gr.update(value=report)
############### Specify models
# Checkpoint name handed to SentenceTransformer; the resulting model is what
# semantic_search uses to embed the user's query.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
############### The App
# Layout: a query box, a search button and a results box. Clicking the button
# sends the query text to produce_tribunal_out and shows its report.
with gr.Blocks() as demo:
    gr.Markdown("Employment tribunal demo")
    # text input from user
    inp = gr.Textbox(
        placeholder="Type your employment tribunal query here - describe your problem",
        label="Tribunal Query",
    )
    # initiate the functions
    process_btn = gr.Button("Search records from UK employment tribunals")
    Suggested_text = gr.Textbox(
        value="Suggestions will appear here", label="", lines=20
    )
    # NOTE(review): newer Gradio releases appear to expect show_copy_button as
    # a Textbox constructor argument rather than via .style() - confirm
    # against the pinned Gradio version before upgrading.
    Suggested_text.style(show_copy_button=True)
    process_btn.click(
        fn=produce_tribunal_out,
        inputs=[inp],
        outputs=Suggested_text,
    )

# Login credentials come from the environment so they can be changed without
# editing the source. SECURITY: the fallbacks below are the values that were
# previously hard-coded and are therefore visible to anyone who can read this
# file - set APP_USERNAME / APP_PASSWORD in the deployment and rotate them.
app_username = os.environ.get("APP_USERNAME") or "admin"
app_password = os.environ.get("APP_PASSWORD") or "dataisking"
demo.launch(auth=(app_username, app_password))
##########################################