derek-thomas HF staff committed on
Commit
779c2fa
1 Parent(s): 7fa626d

Updating column names

Browse files
Files changed (1) hide show
  1. src/build_nomic.py +10 -3
src/build_nomic.py CHANGED
@@ -10,11 +10,16 @@ NOMIC_KEY = os.getenv('NOMIC_KEY')
10
  nomic.login(NOMIC_KEY)
11
 
12
 
 
 
 
 
 
13
  def build_nomic(dataset):
14
  df = dataset['train'].to_pandas()
15
 
16
- non_embedding_columns = ['date_utc', 'title', 'flair', 'content', 'poster', 'permalink', 'id', 'content_length',
17
- 'score', 'percentile_ranges']
18
 
19
  # Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
20
  percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
@@ -30,9 +35,11 @@ def build_nomic(dataset):
30
  # This assigns each score to its corresponding percentile range
31
  df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
32
 
 
 
33
  # Create Atlas project
34
  project = atlas.map_data(embeddings=np.stack(df['embedding'].values),
35
  data=df[non_embedding_columns].to_dict(orient='records'),
36
  id_field='id',
37
  identifier='BORU Subreddit Neural Search',
38
- )
 
10
  nomic.login(NOMIC_KEY)
11
 
12
 
13
def count_words(text):
    """Return the number of whitespace-separated words in *text*.

    Consecutive whitespace (spaces, tabs, newlines) is treated as a
    single separator, so an empty or all-whitespace string counts as 0.
    """
    return len(text.split())
16
+
17
+
18
  def build_nomic(dataset):
19
  df = dataset['train'].to_pandas()
20
 
21
+ non_embedding_columns = ['date_utc', 'title', 'flair', 'content', 'poster', 'permalink', 'id', 'word_count',
22
+ 'score', 'score_percentile']
23
 
24
  # Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
25
  percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
 
35
  # This assigns each score to its corresponding percentile range
36
  df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
37
 
38
+ df['word_count'] = df['content'].apply(count_words)
39
+
40
  # Create Atlas project
41
  project = atlas.map_data(embeddings=np.stack(df['embedding'].values),
42
  data=df[non_embedding_columns].to_dict(orient='records'),
43
  id_field='id',
44
  identifier='BORU Subreddit Neural Search',
45
+ )