# Hugging Face Spaces page header captured with the source (Space status: Sleeping).
import os

import numpy as np
import pandas as pd
import requests
import torch
from datasets import load_dataset
from sentence_transformers.util import semantic_search
# Dependencies to install before running (prefix with "!" inside a notebook):
#   pip install datasets
#   pip install retry
#   pip install -U sentence-transformers
# FAQ corpus. Order matters: the final lookup indexes this list with the
# `corpus_id` values returned by semantic_search, so entries are assumed to be
# in the same order as the rows of the embedded dataset (confirm before
# reordering or editing).
texts = [
    "How do I get a replacement Medicare card?",
    "What is the monthly premium for Medicare Part B?",
    "How do I terminate my Medicare Part B (medical insurance)?",
    "How do I sign up for Medicare?",
    "Can I sign up for Medicare Part B if I am working and have health insurance through an employer?",
    "How do I sign up for Medicare Part B if I already have Part A?",
    "What are Medicare late enrollment penalties?",
    "What is Medicare and who can get it?",
    "How can I get help with my Medicare Part A and Part B premiums?",
    "What are the different parts of Medicare?",
    "Will my Medicare premiums be higher because of my higher income?",
    "What is TRICARE ?",
    "Should I sign up for Medicare Part B if I have Veterans' Benefits?",
]
# Sentence-embedding model queried through the hosted feature-extraction pipeline.
model_id = "sentence-transformers/all-MiniLM-L6-v2"
# Read the access token from the environment (e.g. a Space secret named
# HF_TOKEN) instead of hard-coding it: a token committed to source is public
# and must be treated as compromised and revoked.
hf_token = os.environ.get("HF_TOKEN", "")
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
# Only send an Authorization header when a token is actually configured.
headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}
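# Alternative implementation that passes the "wait_for_model" option and
# returns the decoded JSON without checking it for errors:
# def query(texts):
#     response = requests.post(api_url, headers=headers, json={"inputs": texts, "options": {"wait_for_model": True}})
#     return response.json()
# Optional automatic retry (needs `from retry import retry`):
# @retry(tries=3, delay=10)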
def query(texts):
    """Request embeddings for *texts* from the hosted feature-extraction API.

    Args:
        texts: The strings to embed, sent as the ``inputs`` field of the
            JSON request body.

    Returns:
        The decoded JSON response when it is a list (presumably one
        embedding per input string).

    Raises:
        RuntimeError: If the API answers with an error payload or with any
            other non-list JSON value.
        requests.exceptions.RequestException: If the HTTP request itself
            fails or times out.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.post(
        api_url, headers=headers, json={"inputs": texts}, timeout=30
    )
    result = response.json()
    if isinstance(result, list):
        return result
    # Look the key up by name: the error field is not guaranteed to be the
    # first key of the payload.
    if isinstance(result, dict) and "error" in result:
        raise RuntimeError(
            "The model is currently loading, please re-run the query."
            f" (API error: {result['error']})"
        )
    # Never fall through and hand the caller an implicit None.
    raise RuntimeError(f"Unexpected response from the inference API: {result!r}")
# Embed the FAQ corpus and save it locally. The original built the frame from
# the raw question strings, so "embeddings.csv" never contained embeddings.
output = query(texts)
print("output done")
embeddings = pd.DataFrame(output)
embeddings.to_csv("embeddings.csv", index=False)
print("embeddings done")

# Load the embeddings that were uploaded as a Hugging Face dataset and turn
# the "train" split into a float tensor.
faqs_embeddings = load_dataset('ITESM/embedded_faqs_medicare')
dataset_embeddings = torch.from_numpy(
    faqs_embeddings["train"].to_pandas().to_numpy()
).to(torch.float)
print("dataset_embeddings done")

# Alternative: use the locally saved CSV instead of the hosted dataset.
# embeddings_new = pd.read_csv("embeddings.csv")
# dataset_embeddings = torch.from_numpy(embeddings_new.to_numpy()).to(torch.float)

# Embed the user question with the same model.
question = ["How can Medicare help me?"]
output = query(question)
print("output done")
query_embeddings = torch.FloatTensor(output)
print(f"The size of our embedded dataset is {dataset_embeddings.shape} and of our embedded query is {query_embeddings.shape}.")

# Search top 5 matching query
hits = semantic_search(query_embeddings, dataset_embeddings, top_k=5)
# hits[0] holds the matches for the single question; map each back to its text.
print([texts[hit['corpus_id']] for hit in hits[0]])