nkasmanoff committed on
Commit
e5e23ef
1 Parent(s): fb6da00

Update dataset_recommender.py

Browse files
Files changed (1) hide show
  1. dataset_recommender.py +22 -10
dataset_recommender.py CHANGED
@@ -1,16 +1,20 @@
1
  from langchain.chains import RetrievalQA
2
- from langchain.llms import OpenAI
3
- from langchain.embeddings import OpenAIEmbeddings
4
  from vectorize_dataset import load_descriptions_data, create_db
5
- from helpers import clean_up_tags, get_dataset_metadata
6
-
 
 
 
7
 
8
 
9
  class DatasetRecommender:
10
- def __init__(self, llm_backbone = OpenAI(), embeddings_backbone = OpenAIEmbeddings()):
 
 
 
11
  self.llm_backbone = llm_backbone
12
  self.embeddings_backbone = embeddings_backbone
13
- self.hf_df = load_descriptions_data()
14
  self.db = create_db(self.hf_df, self.embeddings_backbone)
15
  self.datasets_url_base = "https://huggingface.co/datasets/"
16
  # expose this index in a retriever interface
@@ -27,10 +31,18 @@ class DatasetRecommender:
27
  return {'message': response_text, 'datasets': linked_datasets}
28
 
29
  def get_similar_datasets(self, query_url):
30
- retrieved_metadata = get_dataset_metadata(query_url)
31
- if 'description' not in retrieved_metadata:
32
- return {'error': 'no description found for this dataset.'}
33
- cleaned_description = retrieved_metadata['description'] + clean_up_tags(retrieved_metadata['tags'])
 
 
 
 
 
 
 
 
34
  similar_documents = self.db.similarity_search(cleaned_description)
35
  similar_datasets = [f"{self.datasets_url_base}{x.metadata['id']}" for x in similar_documents if x.metadata['id'] not in query_url]
36
  return {'datasets': similar_datasets}
 
1
  from langchain.chains import RetrievalQA
 
 
2
  from vectorize_dataset import load_descriptions_data, create_db
3
+ from helpers import clean_up_tags, get_dataset_metadata, get_dataset_readme
4
+ from langchain.embeddings import HuggingFaceEmbeddings
5
+ from langchain import HuggingFaceHub
6
+ from langchain.chat_models import ChatOpenAI
7
+ from langchain.embeddings import OpenAIEmbeddings
8
 
9
 
10
  class DatasetRecommender:
11
+ def __init__(self, dataset = "nkasmanoff/hf-dataset-cards",
12
+ llm_backbone = ChatOpenAI(),
13
+ embeddings_backbone = OpenAIEmbeddings()):
14
+ self.dataset = dataset
15
  self.llm_backbone = llm_backbone
16
  self.embeddings_backbone = embeddings_backbone
17
+ self.hf_df = load_descriptions_data(dataset=self.dataset)
18
  self.db = create_db(self.hf_df, self.embeddings_backbone)
19
  self.datasets_url_base = "https://huggingface.co/datasets/"
20
  # expose this index in a retriever interface
 
31
  return {'message': response_text, 'datasets': linked_datasets}
32
 
33
  def get_similar_datasets(self, query_url):
34
+ if self.dataset == "nkasmanoff/hf-dataset-cards":
35
+ retrieved_metadata = get_dataset_readme(query_url)
36
+ if 'README' not in retrieved_metadata:
37
+ return {'error': 'no description found for this dataset.'}
38
+
39
+ cleaned_description = retrieved_metadata['README']
40
+ else:
41
+ retrieved_metadata = get_dataset_metadata(query_url)
42
+ if 'description' not in retrieved_metadata:
43
+ return {'error': 'no description found for this dataset.'}
44
+ cleaned_description = retrieved_metadata['description'] + clean_up_tags(retrieved_metadata['tags'])
45
+
46
  similar_documents = self.db.similarity_search(cleaned_description)
47
  similar_datasets = [f"{self.datasets_url_base}{x.metadata['id']}" for x in similar_documents if x.metadata['id'] not in query_url]
48
  return {'datasets': similar_datasets}