"""Semantic search over Medicare FAQ questions with Hugging Face embeddings.

The script embeds a fixed list of FAQ questions through the Hugging Face
Inference API, saves those embeddings to ``embeddings.csv``, loads a
pre-computed embedding dataset from the Hugging Face Hub, embeds one user
question, and prints the five FAQ questions closest to it.

The API token is read from the ``HF_TOKEN`` environment variable.
"""
import os

import numpy as np
import pandas as pd
import requests
import torch
from datasets import load_dataset
from sentence_transformers.util import semantic_search

# Installable
# pip install datasets
# pip install retry
# pip install -U sentence-transformers

texts = [
    "How do I get a replacement Medicare card?",
    "What is the monthly premium for Medicare Part B?",
    "How do I terminate my Medicare Part B (medical insurance)?",
    "How do I sign up for Medicare?",
    "Can I sign up for Medicare Part B if I am working and have health insurance through an employer?",
    "How do I sign up for Medicare Part B if I already have Part A?",
    "What are Medicare late enrollment penalties?",
    "What is Medicare and who can get it?",
    "How can I get help with my Medicare Part A and Part B premiums?",
    "What are the different parts of Medicare?",
    "Will my Medicare premiums be higher because of my higher income?",
    "What is TRICARE ?",
    "Should I sign up for Medicare Part B if I have Veterans' Benefits?",
]

model_id = "sentence-transformers/all-MiniLM-L6-v2"

# SECURITY: an access token used to be committed here in plain text. Secrets
# must not live in source control, so it is now taken from the environment.
# The previously committed token should be revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token."
    )

api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

# Upper bound (seconds) on one API call so a stalled connection cannot hang
# the script forever.
REQUEST_TIMEOUT_SECONDS = 60


def query(texts):
    """Return the embeddings the Inference API produces for ``texts``.

    Args:
        texts: The sentences to embed, sent as the ``inputs`` field of the
            JSON request body.

    Returns:
        The decoded JSON response when it is a list (one entry per input).

    Raises:
        RuntimeError: If the API answers with an ``error`` payload (for
            example while the model is still loading) or with any other
            non-list payload.
    """
    # NOTE: adding "options": {"wait_for_model": True} to the payload, or
    # wrapping this function in a retry decorator, are alternative ways to
    # cope with a model that is still loading.
    response = requests.post(
        api_url,
        headers=headers,
        json={"inputs": texts},
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    result = response.json()
    if isinstance(result, list):
        return result
    if isinstance(result, dict) and "error" in result:
        # Surface the API's own message instead of assuming the cause.
        raise RuntimeError(
            f"Inference API returned an error: {result['error']}. "
            "If the model is still loading, please re-run the query."
        )
    # Never fall through and return None: downstream tensor construction
    # would fail with a far less helpful message.
    raise RuntimeError(f"Unexpected response from the Inference API: {result!r}")


# Embed the FAQ questions. (Previously this stored the raw question strings
# instead of calling the API, so embeddings.csv did not contain embeddings.)
output = query(texts)
print("output done")

embeddings = pd.DataFrame(output)
embeddings.to_csv("embeddings.csv", index=False)
print("embeddings done")

# Load the embeddings that were uploaded as a Hugging Face dataset.
# NOTE(review): the final lookup assumes the rows of this dataset are in the
# same order as `texts` -- confirm against the uploaded data.
faqs_embeddings = load_dataset('ITESM/embedded_faqs_medicare')
dataset_embeddings = torch.from_numpy(
    faqs_embeddings["train"].to_pandas().to_numpy()
).to(torch.float)
print("dataset_embeddings done")

# Alternative: use the locally saved file instead of the Hub dataset, e.g.
#   torch.from_numpy(pd.read_csv("embeddings.csv").to_numpy()).to(torch.float)

question = ["How can Medicare help me?"]
output = query(question)
print("output done")

query_embeddings = torch.FloatTensor(output)
print(f"The size of our embedded dataset is {dataset_embeddings.shape} and of our embedded query is {query_embeddings.shape}.")

# Search top 5 matching query
hits = semantic_search(query_embeddings, dataset_embeddings, top_k=5)
# hits[0] holds the matches for the single query; each match carries the
# index of the matched corpus row under 'corpus_id'.
print([texts[hit['corpus_id']] for hit in hits[0]])