import argparse
import logging
import os
from typing import Optional

from core.bark.generate_audio_semantic_dataset import (
    generate_wav_semantic_dataset,
    BarkGenerationConfig,
)
from core.utils import upload_file_to_hf, zip_folder


logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


def parse_dataset_args(args_list=None):
    """Parse arguments specific to dataset creation."""
    parser = argparse.ArgumentParser(description="Audio Semantic Dataset Creation")

    parser.add_argument(
        "--text-file",
        type=str,
        default="data/test_data.txt",
        help="Path to text file for dataset generation (default: data/test_data.txt)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=2,
        help="Batch size for processing (default: 2)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./dataset",
        help="Output directory for generated files (default: ./dataset)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=256,
        help="Maximum tokens per example (default: 256)",
    )
    parser.add_argument(
        "--use-small-model",
        action="store_true",
        help="Use small model for generation",
    )
    parser.add_argument(
        "--save-raw-audio",
        action="store_true",
        help="Store generated audio as .wav instead of .npz",
    )
    parser.add_argument(
        "--publish-hf",
        action="store_true",
        help="Publish dataset to HuggingFace Hub",
    )
    parser.add_argument(
        "--repo-id",
        type=str,
        help="HuggingFace repo ID to publish to",
    )
    parser.add_argument(
        "--path-in-repo",
        type=str,
        help="Path in HF repo",
        default=None,
    )
    parser.add_argument(
        "--silent", action="store_true", help="Suppress progress output"
    )

    return parser.parse_args(args_list)
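

# Example CLI invocation of this module (the filename `create_dataset.py` is an
# assumption; substitute the actual path of this script in the repository):
#
#   python create_dataset.py \
#       --text-file data/test_data.txt \
#       --batch-size 2 \
#       --output-dir ./dataset \
#       --use-small-model \
#       --publish-hf --repo-id <user>/<dataset-repo>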


def create_audio_semantic_dataset(
    text_file: str,
    output_dir: str = "./dataset",
    batch_size: int = 1,
    max_tokens: int = 256,
    use_small_model: bool = False,
    save_raw_audio: bool = False,
    publish_hf: bool = False,
    repo_id: Optional[str] = None,
    path_in_repo: Optional[str] = None,
    silent: bool = False,
) -> None:
    """Create audio semantic dataset from text file.

    Can be called directly with parameters or via command line using parse_dataset_args().

    Args:
        text_file: Path to input text file
        output_dir: Directory to save generated dataset
        batch_size: Batch size for processing
        max_tokens: Maximum tokens per example
        use_small_model: Whether to use small model
        save_raw_audio: Save as raw audio (.wav) instead of .npz
        publish_hf: Whether to publish to HuggingFace Hub
        repo_id: HF repo ID to publish to
        path_in_repo: Path in HF repo
        silent: Suppress progress output
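
    Example:
        Programmatic use, with values taken from this module's defaults:

            create_audio_semantic_dataset(
                text_file="data/test_data.txt",
                output_dir="./dataset",
                batch_size=2,
            )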
    """
    if not os.path.isfile(text_file):
        raise FileNotFoundError(f"Text file not found: {text_file}")

    os.makedirs(output_dir, exist_ok=True)

    logger.info(f"Starting dataset generation from {text_file}")
    generation_config = BarkGenerationConfig(
        temperature=None,
        generate_coarse_temperature=None,
        generate_fine_temperature=None,
        use_small_model=use_small_model,
    )

    # NOTE: max_tokens is accepted by this function but is not forwarded to
    # generate_wav_semantic_dataset, so it currently has no effect here.
    generate_wav_semantic_dataset(
        text_file_path=text_file,
        generation_config=generation_config,
        batch_size=batch_size,
        save_path=output_dir,
        save_data_as_raw_audio=save_raw_audio,
        silent=silent,
    )
    logger.info("Dataset generation completed")

    if publish_hf and repo_id:
        logger.info("Publishing dataset to the Hugging Face Hub")
        zip_path = "./dataset.zip"
        success = zip_folder(output_dir, zip_path)
        if not success:
            raise RuntimeError(f"Unable to zip folder {output_dir}")
        upload_file_to_hf(zip_path, repo_id, "dataset", path_in_repo=path_in_repo)
    elif publish_hf:
        logger.warning(
            "publish_hf was set but no repo_id was provided; skipping upload"
        )


if __name__ == "__main__":
    args = parse_dataset_args()
    create_audio_semantic_dataset(
        text_file=args.text_file,
        output_dir=args.output_dir,
        batch_size=args.batch_size,
        max_tokens=args.max_tokens,
        use_small_model=args.use_small_model,
        save_raw_audio=args.save_raw_audio,
        publish_hf=args.publish_hf,
        repo_id=args.repo_id,
        path_in_repo=args.path_in_repo,
        silent=args.silent,
    )