Spaces:
Runtime error
Runtime error
from sentence_transformers import SentenceTransformer, util | |
import torch | |
import os | |
import gradio as gr | |
import json | |
import re | |
import numpy as np | |
import pickle | |
from datetime import datetime | |
from huggingface_hub import Repository | |
from datasets import load_dataset | |
############# Read in the data #############
# Access token for the Hugging Face Hub, read from the "HF_token" environment
# variable; os.environ.get returns None when the variable is not set.
access_token_1 = os.environ.get("HF_token")

# `token` is the current keyword for authenticating against the Hub; the older
# `use_auth_token` keyword was deprecated in datasets 2.14 and later removed.
dataset = load_dataset("acd424/tribunal_data", token=access_token_1)

# Pull each column of the "train" split into a module-level name used by the
# search functions below.
train_split = dataset["train"]
embeddings = train_split["embed"]  # presumably one embedding vector per chunk
corpus = train_split["reason_text"]
files = train_split["file_name"]
all_cats = train_split["all_cats"]
print("The data has loaded")

# Disabled: configuration for saving a log of queries back to the dataset repo.
# DATASET_REPO_URL = "https://huggingface.co/datasets/acd424/tribunal_data"
# DATA_FILENAME = "queries_and_responces.txt"
# DATA_FILE = os.path.join("data", DATA_FILENAME)
# repo = Repository(
#     local_dir="data", clone_from=DATASET_REPO_URL, use_auth_token=access_token_1
# )
################### Functions ########################## | |
def semantic_search(
    query, corpus=corpus, corpus_embeddings=embeddings, k=5, min_score=0.5
):
    """Find the corpus entries closest to the query using cosine similarity.

    Parameters
    ----------
    query: str
        The query the user wants to search with
    corpus: list
        One entry per chunk; the entry of each match is appended to the
        employment-tribunal-decisions URL in the output
    corpus_embeddings: list or tensor
        The embedding vectors of the chunks, in the same order as ``corpus``
    k: int
        The number of results to include (default is 5)
    min_score: float
        If the best cosine similarity is below this value, a "query not
        appropriate" message is returned instead of matches (default is 0.5)

    Returns
    -------
    str
        Either the formatted top-k matches (link, categories and score for
        each one) or a message saying the query could not be matched
    """
    no_match_message = "The query is either not detailed enough or is perhaps not an appropriate query"

    # Nothing to search: avoid indexing into an empty score vector.
    if len(corpus_embeddings) == 0:
        return no_match_message

    top_k = min(k, len(corpus_embeddings))
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every chunk; row 0 is the only
    # row because a single query is encoded.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

    if cos_scores.max() < min_score:
        return no_match_message

    # torch.topk returns the k best scores already sorted in descending order
    # together with their indices, so no separate argsort is needed.
    top_scores, top_indices = torch.topk(cos_scores, k=top_k)

    pieces = []
    ranked = zip(top_scores.tolist(), top_indices.tolist())
    for n, (score, idx) in enumerate(ranked, start=1):
        pieces.append(f" Match: {n} \n")
        pieces.append(
            f"https://www.gov.uk/employment-tribunal-decisions/{corpus[idx]}"
        )
        pieces.append("\n ======== With Catergories ========= \n")
        pieces.append(all_cats[idx])
        pieces.append(
            f"\n ============================================ {score} \n\n\n"
        )
    return "".join(pieces)
def produce_tribunal_out(query, corpus=files, tribunal_embeddings=embeddings):
    """Run the semantic search for a user query and wrap it for Gradio.

    The query is passed to ``semantic_search`` together with ``corpus`` and
    ``tribunal_embeddings``; a timestamped line containing the query is
    printed, and the search output is returned as a ``gr.update`` whose
    ``value`` is the result text.
    """
    # Search first, then timestamp, matching the order of the log line.
    matches_text = semantic_search(
        query=query,
        corpus=corpus,
        corpus_embeddings=tribunal_embeddings,
    )

    timestamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    log_line = f"{timestamp}|{query}\n"
    print(log_line)

    # Disabled: write the query to the log file and push it to the hub.
    # with open(DATA_FILE, "a") as f:
    #     f.write(f"{time_now}|{query}\n")
    # commit_url = repo.push_to_hub()
    # print(commit_url)

    return gr.update(value=matches_text)
############### Specify models
# Name of the sentence-transformers model used to encode user queries.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
############### The App
with gr.Blocks() as demo:
    gr.Markdown("Employment tribunal demo")

    # text input from user
    inp = gr.Textbox(
        placeholder="Type your employment tribunal query here - describe your problem",
        label="Tribunal Query",
    )

    # button that triggers the search
    process_btn = gr.Button("Search records from UK employment tribunals")

    # Output box for the search results. The copy button is requested through
    # the constructor because Component.style() is deprecated in Gradio 3.x
    # and removed in Gradio 4.x.
    Suggested_text = gr.Textbox(
        value="Suggestions will appear here",
        label="",
        lines=20,
        show_copy_button=True,
    )

    # wire the button to the search function
    process_btn.click(
        fn=produce_tribunal_out,
        inputs=[inp],
        outputs=Suggested_text,
    )

# NOTE(review): the login credentials are hard-coded in source; consider
# loading them from a secret / environment variable instead.
demo.launch(auth=("admin", "dataisking"))
########################################## | |