alexpantex committed on
Commit
61bfd6f
·
verified ·
1 Parent(s): dc985ea

Upload scripts/preprocess.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/preprocess.py +2 -1
scripts/preprocess.py CHANGED
@@ -4,7 +4,7 @@ sys.path.append(sys.path[0].replace('scripts', ''))
4
  from urllib.request import urlretrieve
5
  import pandas as pd
6
 
7
- from config.data_paths import RAW_DATA_PATH, PROCESSED_DATA_PATH
8
  import re
9
 
10
  from scripts.utils import load_config
@@ -28,6 +28,7 @@ def clean_corpus():
28
  Utility function to clean and preprocess the prompt corpus.
29
  """
30
  if not os.path.isfile(os.path.join(PROCESSED_DATA_PATH, 'prompt_corpus_clean.parquet')): # to speed up the process
 
31
  df = pd.read_parquet(PROMPTS_URL).sample(10000, random_state=123)
32
  assert 'prompt' in df.columns, "Parquet file must contain a 'prompt' column."
33
  df = df[df['prompt'].notna()][['prompt']] # drop missing rows
 
4
  from urllib.request import urlretrieve
5
  import pandas as pd
6
 
7
+ from config.data_paths import PROCESSED_DATA_PATH
8
  import re
9
 
10
  from scripts.utils import load_config
 
28
  Utility function to clean and preprocess the prompt corpus.
29
  """
30
  if not os.path.isfile(os.path.join(PROCESSED_DATA_PATH, 'prompt_corpus_clean.parquet')): # to speed up the process
31
+ os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
32
  df = pd.read_parquet(PROMPTS_URL).sample(10000, random_state=123)
33
  assert 'prompt' in df.columns, "Parquet file must contain a 'prompt' column."
34
  df = df[df['prompt'].notna()][['prompt']] # drop missing rows