Spaces:
Sleeping
Sleeping
Upload scripts/preprocess.py with huggingface_hub
Browse files
- scripts/preprocess.py +2 -1
scripts/preprocess.py
CHANGED
@@ -4,7 +4,7 @@ sys.path.append(sys.path[0].replace('scripts', ''))
|
|
4 |
from urllib.request import urlretrieve
|
5 |
import pandas as pd
|
6 |
|
7 |
-
from config.data_paths import
|
8 |
import re
|
9 |
|
10 |
from scripts.utils import load_config
|
@@ -28,6 +28,7 @@ def clean_corpus():
|
|
28 |
Utility function to clean and preprocess the prompt corpus.
|
29 |
"""
|
30 |
if not os.path.isfile(os.path.join(PROCESSED_DATA_PATH, 'prompt_corpus_clean.parquet')): # to speed up the process
|
|
|
31 |
df = pd.read_parquet(PROMPTS_URL).sample(10000, random_state=123)
|
32 |
assert 'prompt' in df.columns, "Parquet file must contain a 'prompt' column."
|
33 |
df = df[df['prompt'].notna()][['prompt']] # drop missing rows
|
|
|
4 |
from urllib.request import urlretrieve
|
5 |
import pandas as pd
|
6 |
|
7 |
+
from config.data_paths import PROCESSED_DATA_PATH
|
8 |
import re
|
9 |
|
10 |
from scripts.utils import load_config
|
|
|
28 |
Utility function to clean and preprocess the prompt corpus.
|
29 |
"""
|
30 |
if not os.path.isfile(os.path.join(PROCESSED_DATA_PATH, 'prompt_corpus_clean.parquet')): # to speed up the process
|
31 |
+
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
|
32 |
df = pd.read_parquet(PROMPTS_URL).sample(10000, random_state=123)
|
33 |
assert 'prompt' in df.columns, "Parquet file must contain a 'prompt' column."
|
34 |
df = df[df['prompt'].notna()][['prompt']] # drop missing rows
|