Spaces:
Runtime error
Runtime error
nkasmanoff
committed on
Commit
•
e5e23ef
1
Parent(s):
fb6da00
Update dataset_recommender.py
Browse files
- dataset_recommender.py +22 -10
dataset_recommender.py
CHANGED
@@ -1,16 +1,20 @@
|
|
1 |
from langchain.chains import RetrievalQA
|
2 |
-
from langchain.llms import OpenAI
|
3 |
-
from langchain.embeddings import OpenAIEmbeddings
|
4 |
from vectorize_dataset import load_descriptions_data, create_db
|
5 |
-
from helpers import clean_up_tags, get_dataset_metadata
|
6 |
-
|
|
|
|
|
|
|
7 |
|
8 |
|
9 |
class DatasetRecommender:
|
10 |
-
def __init__(self,
|
|
|
|
|
|
|
11 |
self.llm_backbone = llm_backbone
|
12 |
self.embeddings_backbone = embeddings_backbone
|
13 |
-
self.hf_df = load_descriptions_data()
|
14 |
self.db = create_db(self.hf_df, self.embeddings_backbone)
|
15 |
self.datasets_url_base = "https://huggingface.co/datasets/"
|
16 |
# expose this index in a retriever interface
|
@@ -27,10 +31,18 @@ class DatasetRecommender:
|
|
27 |
return {'message': response_text, 'datasets': linked_datasets}
|
28 |
|
29 |
def get_similar_datasets(self, query_url):
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
similar_documents = self.db.similarity_search(cleaned_description)
|
35 |
similar_datasets = [f"{self.datasets_url_base}{x.metadata['id']}" for x in similar_documents if x.metadata['id'] not in query_url]
|
36 |
return {'datasets': similar_datasets}
|
|
|
1 |
from langchain.chains import RetrievalQA
|
|
|
|
|
2 |
from vectorize_dataset import load_descriptions_data, create_db
|
3 |
+
from helpers import clean_up_tags, get_dataset_metadata, get_dataset_readme
|
4 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
5 |
+
from langchain import HuggingFaceHub
|
6 |
+
from langchain.chat_models import ChatOpenAI
|
7 |
+
from langchain.embeddings import OpenAIEmbeddings
|
8 |
|
9 |
|
10 |
class DatasetRecommender:
|
11 |
+
def __init__(self, dataset = "nkasmanoff/hf-dataset-cards",
|
12 |
+
llm_backbone = ChatOpenAI(),
|
13 |
+
embeddings_backbone = OpenAIEmbeddings()):
|
14 |
+
self.dataset = dataset
|
15 |
self.llm_backbone = llm_backbone
|
16 |
self.embeddings_backbone = embeddings_backbone
|
17 |
+
self.hf_df = load_descriptions_data(dataset=self.dataset)
|
18 |
self.db = create_db(self.hf_df, self.embeddings_backbone)
|
19 |
self.datasets_url_base = "https://huggingface.co/datasets/"
|
20 |
# expose this index in a retriever interface
|
|
|
31 |
return {'message': response_text, 'datasets': linked_datasets}
|
32 |
|
33 |
def get_similar_datasets(self, query_url):
|
34 |
+
if self.dataset == "nkasmanoff/hf-dataset-cards":
|
35 |
+
retrieved_metadata = get_dataset_readme(query_url)
|
36 |
+
if 'README' not in retrieved_metadata:
|
37 |
+
return {'error': 'no description found for this dataset.'}
|
38 |
+
|
39 |
+
cleaned_description = retrieved_metadata['README']
|
40 |
+
else:
|
41 |
+
retrieved_metadata = get_dataset_metadata(query_url)
|
42 |
+
if 'description' not in retrieved_metadata:
|
43 |
+
return {'error': 'no description found for this dataset.'}
|
44 |
+
cleaned_description = retrieved_metadata['description'] + clean_up_tags(retrieved_metadata['tags'])
|
45 |
+
|
46 |
similar_documents = self.db.similarity_search(cleaned_description)
|
47 |
similar_datasets = [f"{self.datasets_url_base}{x.metadata['id']}" for x in similar_documents if x.metadata['id'] not in query_url]
|
48 |
return {'datasets': similar_datasets}
|