Spaces:
Sleeping
Sleeping
File size: 2,845 Bytes
b30d739 dcf40f8 b30d739 496ff80 b30d739 d7ecb3b b30d739 7fa0302 b30d739 7fa0302 a2b1661 e04ec03 b30d739 7fa0302 e04ec03 dcf40f8 b30d739 7fa0302 b30d739 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import os

import numpy as np
import pandas as pd
import requests
import torch
from datasets import load_dataset
from sentence_transformers.util import semantic_search
# Dependencies (install before running):
#   pip install datasets
#   pip install retry
#   pip install -U sentence-transformers
# FAQ questions that form the search corpus. The final lookup in this script
# indexes this list by `corpus_id`, so the order of the entries matters
# (presumably it mirrors the row order of the embedded dataset — TODO confirm).
texts = [
    "How do I get a replacement Medicare card?",
    "What is the monthly premium for Medicare Part B?",
    "How do I terminate my Medicare Part B (medical insurance)?",
    "How do I sign up for Medicare?",
    "Can I sign up for Medicare Part B if I am working and have health insurance through an employer?",
    "How do I sign up for Medicare Part B if I already have Part A?",
    "What are Medicare late enrollment penalties?",
    "What is Medicare and who can get it?",
    "How can I get help with my Medicare Part A and Part B premiums?",
    "What are the different parts of Medicare?",
    "Will my Medicare premiums be higher because of my higher income?",
    "What is TRICARE ?",
    "Should I sign up for Medicare Part B if I have Veterans' Benefits?",
]
# Sentence-embedding model requested from the hosted feature-extraction pipeline.
model_id = "sentence-transformers/all-MiniLM-L6-v2"
# The access token is read from the HF_TOKEN environment variable instead of
# being hard-coded: a token committed to source is exposed to every reader of
# the file. Any token that was previously committed here should be revoked.
hf_token = os.environ.get("HF_TOKEN", "")
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
# Only send an Authorization header when a token is configured; without one
# the request goes out unauthenticated (the API may reject or rate-limit it).
headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}
# def query(texts):
# response = requests.post(api_url, headers=headers, json={"inputs": texts, "options":{"wait_for_model":True}})
# return response.json()
#@retry(tries=3, delay=10)
def query(texts):
    """Request embeddings for ``texts`` from the hosted inference endpoint.

    POSTs ``{"inputs": texts}`` as JSON to the module-level ``api_url`` using
    the module-level ``headers`` and decodes the JSON reply.

    Args:
        texts: The input sentences to embed (this script passes lists of
            strings).

    Returns:
        The decoded JSON response when it is a list (presumably one
        embedding vector per input — confirm against the API docs).

    Raises:
        RuntimeError: If the reply is not a list, e.g. an ``{"error": ...}``
            payload returned while the model is still loading.
    """
    response = requests.post(api_url, headers=headers, json={"inputs": texts})
    result = response.json()
    if isinstance(result, list):
        return result
    # Look the key up by name rather than by position so an error payload is
    # recognised regardless of key order, and surface the API's own message.
    if isinstance(result, dict) and "error" in result:
        raise RuntimeError(
            "The model is currently loading, please re-run the query."
            f" (API error: {result['error']})"
        )
    # Never fall through and return None: callers convert the result to a tensor.
    raise RuntimeError(f"Unexpected response from the inference API: {result!r}")
# NOTE(review): this writes the raw question texts (a single "inputs" column)
# to "embeddings.csv", not embedding vectors — confirm whether `query(texts)`
# was intended here. Behaviour is kept as-is.
output = dict(inputs=texts)
print("output done")
embeddings = pd.DataFrame(output)
embeddings.to_csv("embeddings.csv", index=False)
print("embeddings done")

# Load the precomputed FAQ embeddings from the Hugging Face Hub dataset and
# convert the "train" split into a float tensor.
faqs_embeddings = load_dataset('ITESM/embedded_faqs_medicare')
dataset_embeddings = torch.from_numpy(
    faqs_embeddings["train"].to_pandas().to_numpy()
).to(torch.float)
print("dataset_embeddings done")
# Alternative: read the vectors from a local CSV of numeric embeddings instead
# of the Hub, e.g.
#   embeddings_new = pd.read_csv("embeddings.csv")
#   dataset_embeddings = torch.from_numpy(embeddings_new.to_numpy()).to(torch.float)

# Embed the user's question through the inference endpoint.
question = ["How can Medicare help me?"]
output = query(question)
print("output done")
query_embeddings = torch.FloatTensor(output)
print(f"The size of our embedded dataset is {dataset_embeddings.shape} and of our embedded query is {query_embeddings.shape}.")

# Search the top 5 corpus entries matching the query and print their texts.
hits = semantic_search(query_embeddings, dataset_embeddings, top_k=5)
print([texts[hit['corpus_id']] for hit in hits[0]])