Spaces:

coolkrishds
/

embeddingDemoProject1

Sleeping

App Files Files Community

embeddingDemoProject1 / app.py

coolkrishds

Update app.py

496ff80 almost 2 years ago

raw

history blame contribute delete

2.85 kB

	import requests
	import pandas as pd
	import numpy as np
	import torch
	from datasets import load_dataset
	from sentence_transformers.util import semantic_search


	# Dependencies to install before running (prefix with "!" inside a notebook):
	#   pip install datasets
	#   pip install retry
	#   pip install -U sentence-transformers


	# FAQ questions that make up the search corpus. Order matters: the final
	# lookup indexes this list by the `corpus_id` returned from semantic_search.
	texts = [
	    "How do I get a replacement Medicare card?",
	    "What is the monthly premium for Medicare Part B?",
	    "How do I terminate my Medicare Part B (medical insurance)?",
	    "How do I sign up for Medicare?",
	    "Can I sign up for Medicare Part B if I am working and have health insurance through an employer?",
	    "How do I sign up for Medicare Part B if I already have Part A?",
	    "What are Medicare late enrollment penalties?",
	    "What is Medicare and who can get it?",
	    "How can I get help with my Medicare Part A and Part B premiums?",
	    "What are the different parts of Medicare?",
	    "Will my Medicare premiums be higher because of my higher income?",
	    "What is TRICARE ?",
	    "Should I sign up for Medicare Part B if I have Veterans' Benefits?",
	]

	import os

	# Sentence-embedding model served through the Hugging Face Inference API.
	model_id = "sentence-transformers/all-MiniLM-L6-v2"

	# SECURITY: an access token used to be hard-coded on this line. A token that
	# has been committed to a public repository must be treated as leaked and
	# revoked. The token is now read from the HF_TOKEN environment variable
	# (e.g. a Space secret); it is an empty string when the variable is unset.
	hf_token = os.environ.get("HF_TOKEN", "")

	# Feature-extraction endpoint for the chosen model.
	api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"

	# Only send an Authorization header when a token is configured; a bare
	# "Bearer " value would be a malformed credential.
	headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}

	# def query(texts):
	# response = requests.post(api_url, headers=headers, json={"inputs": texts, "options":{"wait_for_model":True}})
	# return response.json()

	#@retry(tries=3, delay=10)
	def query(texts, timeout=60):
	    """Request embeddings for *texts* from the Inference API.

	    Posts ``{"inputs": texts}`` to the module-level ``api_url`` using the
	    module-level ``headers`` and returns the decoded JSON payload when it is
	    a list.

	    Args:
	        texts: Value sent as the ``inputs`` field of the request body.
	        timeout: Seconds to wait for the HTTP request before giving up, so a
	            stalled connection cannot hang the script forever.

	    Returns:
	        The list decoded from the JSON response.

	    Raises:
	        RuntimeError: If the response body is not JSON, or is JSON but not a
	            list (for example an ``{"error": ...}`` payload). The message
	            carries the HTTP status and the error detail from the API.
	    """
	    response = requests.post(
	        api_url,
	        headers=headers,
	        json={"inputs": texts},
	        timeout=timeout,
	    )

	    try:
	        result = response.json()
	    except ValueError as err:
	        # JSON decoding errors raised by requests subclass ValueError.
	        raise RuntimeError(
	            f"Inference API returned a non-JSON response (HTTP {response.status_code})."
	        ) from err

	    if isinstance(result, list):
	        return result

	    # Anything other than a list is a failure. Previously a dict without a
	    # leading "error" key fell through and returned None, and every error was
	    # reported as "model loading" regardless of the real cause.
	    detail = result.get("error", result) if isinstance(result, dict) else result
	    raise RuntimeError(
	        f"Inference API request failed (HTTP {response.status_code}): {detail}. "
	        "If the model is still loading, please re-run the query."
	    )

	# Wrap the FAQ questions in a single-column table ("inputs") and write it to
	# embeddings.csv in the working directory, without the index column.
	# NOTE(review): despite the file name, this stores the raw question text, not
	# embedding vectors — presumably `query(texts)` was intended; confirm.
	output = {"inputs": texts}

	print("output done")

	embeddings = pd.DataFrame(output)
	embeddings.to_csv(path_or_buf="embeddings.csv", index=False)

	print("embeddings done")

	# Load the embeddings that were uploaded as a Hugging Face dataset and
	# convert the "train" split into a float tensor (split -> pandas -> numpy
	# -> torch).
	faqs_embeddings = load_dataset("ITESM/embedded_faqs_medicare")
	dataset_embeddings = torch.from_numpy(
	    faqs_embeddings["train"].to_pandas().to_numpy()
	).float()

	print("dataset_embeddings done")
	# embeddings_new = pd.read_csv(embeddings.csv)
	# dataset_embeddings = torch.from_numpy(embeddings_new.to_pandas().to_numpy()).to(torch.float)

	# Embed the user question through the Inference API.
	question = ["How can Medicare help me?"]
	output = query(question)

	print("output done")

	query_embeddings = torch.FloatTensor(output)
	print(f"The size of our embedded dataset is {dataset_embeddings.shape} and of our embedded query is {query_embeddings.shape}.")

	# Find the 5 corpus entries closest to the question; hits[0] holds the
	# results for the first (and only) query, each carrying a `corpus_id`.
	hits = semantic_search(query_embeddings, dataset_embeddings, top_k=5)

	# Map each corpus_id back to its FAQ text and print the matches.
	print([texts[hit['corpus_id']] for hit in hits[0]])