nkasmanoff committed on
Commit
8ac2414
1 Parent(s): 4492eec

Update vectorize_dataset.py

Browse files
Files changed (1) hide show
  1. vectorize_dataset.py +13 -6
vectorize_dataset.py CHANGED
@@ -5,12 +5,19 @@ from langchain.vectorstores import Chroma
5
  from langchain.document_loaders import DataFrameLoader
6
 
7
 
8
- def load_descriptions_data():
9
- hf_datasets = load_dataset('nkasmanoff/huggingface-datasets')
10
- hf_df = hf_datasets['train'].to_pandas()
11
- hf_df['tags_cleaned'] = hf_df['tags'].apply(clean_up_tags)
12
- hf_df.dropna(subset=['description'],inplace=True)
13
- hf_df['description_full'] = hf_df['description'].fillna('') + ' ' + hf_df['tags_cleaned']
 
 
 
 
 
 
 
14
  hf_df = hf_df[hf_df['description_full'] != ' ']
15
  hf_df = hf_df[['id','description_full']]
16
 
 
5
  from langchain.document_loaders import DataFrameLoader
6
 
7
 
8
+ def load_descriptions_data(dataset='nkasmanoff/hf-dataset-cards'):
9
+ if dataset == 'hf-dataset-cards':
10
+ hf_datasets = load_dataset(dataset)
11
+ hf_df = hf_datasets['train'].to_pandas()
12
+ hf_df.dropna(subset=['README'],inplace=True)
13
+ hf_df['description_full'] = hf_df['README']
14
+
15
+ else:
16
+ hf_datasets = load_dataset('nkasmanoff/huggingface-datasets')
17
+ hf_df = hf_datasets['train'].to_pandas()
18
+ hf_df['tags_cleaned'] = hf_df['tags'].apply(clean_up_tags)
19
+ hf_df.dropna(subset=['description'],inplace=True)
20
+ hf_df['description_full'] = hf_df['description'].fillna('') + ' ' + hf_df['tags_cleaned']
21
  hf_df = hf_df[hf_df['description_full'] != ' ']
22
  hf_df = hf_df[['id','description_full']]
23