File size: 463 Bytes
4b8f61c
5eee2bb
 
22a07fd
4b8f61c
 
5eee2bb
4b8f61c
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
import pandas as pd
import os
from datasets import load_dataset, DownloadConfig
from helpers.utils import extract_audio_identifier

# Dataset identifier handed to `load_dataset` below.
DATA_FILE = "sawadogosalif/MooreFRCollections_BibleOnlyText"


def _identifier_as_series(source_url):
    """Wrap the result of ``extract_audio_identifier`` in a ``pd.Series``.

    Returning a Series for every row lets ``Series.apply`` expand the result
    into columns, which the two-column assignment below relies on.
    """
    return pd.Series(extract_audio_identifier(source_url))


# The access token comes from the HF_TOKEN environment variable; if it is not
# set, the lookup raises KeyError as soon as this module is executed.
_hub_download_config = DownloadConfig(token=os.environ["HF_TOKEN"])

# Load the "train" split and convert it to a pandas DataFrame.
data = load_dataset(
    DATA_FILE,
    split="train",
    download_config=_hub_download_config,
).to_pandas()

# Fill the "chapter" and "page" columns from each row's "moore_source_url".
# NOTE(review): assumes extract_audio_identifier yields exactly two values
# per URL, in (chapter, page) order — confirm against helpers.utils.
data[["chapter", "page"]] = data["moore_source_url"].apply(_identifier_as_series)

# NOTE(review): presumably the storage bucket name used by other modules —
# it is not referenced in the visible part of this file.
BUCKET_NAME = "moore-collection"