derek-thomas HF staff committed on
Commit
fdc091a
1 Parent(s): 8ba4837

Adding html and topic modeling on subreddit

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -1
  2. src/build_nomic.py +29 -5
requirements.txt CHANGED
@@ -7,4 +7,5 @@ tqdm==4.66.1
7
  beautifulsoup4==4.12.2
8
  lxml==4.9.3
9
  rich==13.3.4
10
- nomic==3.0.15
 
 
7
  beautifulsoup4==4.12.2
8
  lxml==4.9.3
9
  rich==13.3.4
10
+ nomic==3.0.15
11
+ markdown==3.6
src/build_nomic.py CHANGED
@@ -1,12 +1,15 @@
1
  # https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map
2
  import os
3
- import pandas as pd
4
  import time
5
 
 
6
  import nomic
7
- from nomic import atlas
8
- from nomic.dataset import AtlasClass
9
  import numpy as np
 
 
 
 
10
 
11
  from src.my_logger import setup_logger
12
 
@@ -20,6 +23,11 @@ def count_words(text):
20
  return len(words)
21
 
22
 
 
 
 
 
 
23
  def delete_old_nomic():
24
  logger.info(f"Trying to delete old version of nomic Atlas...")
25
  try:
@@ -32,11 +40,12 @@ def delete_old_nomic():
32
  except:
33
  logger.info(f"Failed to delete old version of nomic Atlas.")
34
 
 
35
  def build_nomic(dataset):
36
  df = dataset['train'].to_pandas()
37
 
38
- non_embedding_columns = ['date_utc', 'title', 'flair', 'content', 'poster', 'permalink', 'id', 'word_count',
39
- 'score', 'score_percentile']
40
 
41
  # Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
42
  percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
@@ -53,6 +62,20 @@ def build_nomic(dataset):
53
  df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
54
 
55
  df['word_count'] = df['content'].apply(count_words)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  delete_old_nomic()
58
 
@@ -62,5 +85,6 @@ def build_nomic(dataset):
62
  data=df[non_embedding_columns].to_dict(orient='records'),
63
  id_field='id',
64
  identifier='BORU Subreddit Neural Search',
 
65
  )
66
  logger.info(f"Succeeded in creating new version of nomic Atlas: {project.slug}")
 
1
  # https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map
2
  import os
3
+ import re
4
  import time
5
 
6
+ import markdown
7
  import nomic
 
 
8
  import numpy as np
9
+ import pandas as pd
10
+ from nomic import atlas, Nomic
11
+ from nomic.dataset import AtlasClass
12
+ from nomic.data_inference import NomicTopicOptions
13
 
14
  from src.my_logger import setup_logger
15
 
 
23
  return len(words)
24
 
25
 
26
+ def convert_markdown_to_html(markdown_text):
27
+ html = markdown.markdown(markdown_text)
28
+ return html
29
+
30
+
31
  def delete_old_nomic():
32
  logger.info(f"Trying to delete old version of nomic Atlas...")
33
  try:
 
40
  except:
41
  logger.info(f"Failed to delete old version of nomic Atlas.")
42
 
43
+
44
  def build_nomic(dataset):
45
  df = dataset['train'].to_pandas()
46
 
47
+ non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'permalink', 'id', 'word_count',
48
+ 'score', 'score_percentile', 'html_content', 'subreddit']
49
 
50
  # Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
51
  percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
 
62
  df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
63
 
64
  df['word_count'] = df['content'].apply(count_words)
65
+ df['html_content'] = df['content'].apply(convert_markdown_to_html)
66
+
67
+ # Regex to extract subreddit
68
+ subreddit_re = re.compile(r'r/(\w+)')
69
+ def extract_subreddit(text):
70
+ match = subreddit_re.search(text)
71
+ if match:
72
+ return match.group(1)
73
+ return ''
74
+
75
+ # Apply the function
76
+ df['subreddit'] = df['content'].apply(extract_subreddit)
77
+
78
+ topic_options = NomicTopicOptions(build_topic_model=True, community_description_target_field='subreddit')
79
 
80
  delete_old_nomic()
81
 
 
85
  data=df[non_embedding_columns].to_dict(orient='records'),
86
  id_field='id',
87
  identifier='BORU Subreddit Neural Search',
88
+ topic_model=topic_options
89
  )
90
  logger.info(f"Succeeded in creating new version of nomic Atlas: {project.slug}")