# coolkrishds's picture
# Update app.py
# 496ff80
import os

import numpy as np
import pandas as pd
import requests
import torch
from datasets import load_dataset
from sentence_transformers.util import semantic_search
# Required packages (install before running; prefix with "!" inside a notebook):
#   pip install datasets
#   pip install retry
#   pip install -U sentence-transformers
# FAQ questions that make up the search corpus. The final lookup indexes this
# list with the `corpus_id` values returned by the semantic search, so the
# order of the entries matters.
texts = [
    "How do I get a replacement Medicare card?",
    "What is the monthly premium for Medicare Part B?",
    "How do I terminate my Medicare Part B (medical insurance)?",
    "How do I sign up for Medicare?",
    "Can I sign up for Medicare Part B if I am working and have health insurance through an employer?",
    "How do I sign up for Medicare Part B if I already have Part A?",
    "What are Medicare late enrollment penalties?",
    "What is Medicare and who can get it?",
    "How can I get help with my Medicare Part A and Part B premiums?",
    "What are the different parts of Medicare?",
    "Will my Medicare premiums be higher because of my higher income?",
    "What is TRICARE ?",
    "Should I sign up for Medicare Part B if I have Veterans' Benefits?",
]
# Sentence-embedding model queried through the hosted feature-extraction
# pipeline endpoint.
model_id = "sentence-transformers/all-MiniLM-L6-v2"
# SECURITY: the access token is read from the HF_TOKEN environment variable
# instead of being committed to the repository. The token that used to be
# hard-coded here has been exposed and should be revoked.
hf_token = os.environ.get("HF_TOKEN", "")
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
# Only send an Authorization header when a token is configured; a
# "Bearer " header with no token would be an invalid credential.
# NOTE(review): unauthenticated requests may be rejected or rate-limited by
# the API -- confirm, and set HF_TOKEN in the deployment's secrets.
headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}
# def query(texts):
# response = requests.post(api_url, headers=headers, json={"inputs": texts, "options":{"wait_for_model":True}})
# return response.json()
#@retry(tries=3, delay=10)
def query(texts, timeout=60):
    """Request embeddings for ``texts`` from the feature-extraction endpoint.

    Posts ``{"inputs": texts}`` to the module-level ``api_url`` using the
    module-level ``headers`` and decodes the JSON response.

    Args:
        texts: The inputs to embed (the callers in this file pass a list of
            strings).
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up, so a stalled connection cannot hang the script forever.

    Returns:
        The decoded JSON body when it is a list. Presumably one embedding
        vector per input -- confirm against the API documentation.

    Raises:
        RuntimeError: If the body is not valid JSON, if the API reports an
            error (for example while the model is still loading, in which
            case re-running the query later may succeed), or if the payload
            has any other unexpected shape.
    """
    response = requests.post(
        api_url, headers=headers, json={"inputs": texts}, timeout=timeout
    )
    try:
        result = response.json()
    except ValueError as err:
        # requests' JSON decode errors derive from ValueError; surface them
        # with the HTTP status instead of a bare parser message.
        raise RuntimeError(
            f"Inference API returned a non-JSON response (HTTP {response.status_code})."
        ) from err

    if isinstance(result, list):
        return result
    if isinstance(result, dict) and "error" in result:
        # Report what the API actually said rather than assuming the model
        # is loading; auth and rate-limit failures also arrive here.
        raise RuntimeError(f"Inference API error: {result['error']}")
    # Never fall through and return None: callers feed the result to torch.
    raise RuntimeError(f"Unexpected response from the Inference API: {result!r}")
# Embed the FAQ questions and keep a local copy of the vectors. This must be
# the API result, not the raw texts: embeddings.csv is meant to hold numeric
# vectors that can be turned into a float tensor (see the alternative below).
output = query(texts)
print("output done")
embeddings = pd.DataFrame(output)
embeddings.to_csv("embeddings.csv", index=False)
print("embeddings done")

# Corpus embeddings used for the search are loaded from a dataset on the
# Hugging Face Hub.
# NOTE(review): assumes row i of this dataset is the embedding of texts[i],
# since the final lookup indexes `texts` by corpus_id -- confirm.
faqs_embeddings = load_dataset('ITESM/embedded_faqs_medicare')
dataset_embeddings = torch.from_numpy(faqs_embeddings["train"].to_pandas().to_numpy()).to(torch.float)
print("dataset_embeddings done")
# Alternative: use the locally saved embeddings instead of the Hub dataset:
#   embeddings_new = pd.read_csv("embeddings.csv")
#   dataset_embeddings = torch.from_numpy(embeddings_new.to_numpy()).to(torch.float)

# Embed the user question with the same model as the corpus.
question = ["How can Medicare help me?"]
output = query(question)
print("output done")
query_embeddings = torch.FloatTensor(output)
print(f"The size of our embedded dataset is {dataset_embeddings.shape} and of our embedded query is {query_embeddings.shape}.")

# Search top 5 matching query
hits = semantic_search(query_embeddings, dataset_embeddings, top_k=5)
# hits[0] holds the matches for the single question; each match carries the
# index of the matched corpus row under 'corpus_id'.
print([texts[hit['corpus_id']] for hit in hits[0]])