Mfischthal commited on
Commit
758613a
·
verified ·
1 Parent(s): fd3e3ef

Upload 6 files

Browse files
Files changed (4) hide show
  1. app.py +138 -119
  2. requirements.txt +1 -1
  3. teacher.py +2 -1
  4. validators.py +1 -0
app.py CHANGED
@@ -1,135 +1,154 @@
1
  import os
2
  import gradio as gr
3
- from typing import List, Dict, Any
 
 
4
  from data_io import load_from_hub_or_upload
5
  from teacher import call_teacher, MODEL, INSTRUCTION
6
  from validators import validate_output
7
  from exporters import to_jsonl, to_hf_dataset
8
 
 
9
  SESSION: Dict[str, Any] = {
10
  "passages": [],
11
  "records": [],
12
  "dataset_id": None,
13
  }
14
 
15
- DESCRIPTION = """### Dialogue→Speaker Dataset Builder
16
- A Gradio app that prepares passages, generates `Speaker N:`-structured dialogue via the OpenAI API, lets you review & edit, and exports JSONL / HF Datasets."""
17
-
18
- with gr.Blocks(title="Dialogue→Speaker Dataset Builder") as demo:
19
- gr.Markdown("# Dialogue→Speaker Dataset Builder")
20
- gr.Markdown(DESCRIPTION)
21
-
22
- with gr.Tab("Data"):
23
- src_mode = gr.Radio(["HF Dataset", "Upload .txt"], value="HF Dataset", label="Source")
24
- hf_id = gr.Textbox(value="Navanjana/Gutenberg_books", label="HF dataset id (train split)")
25
- upload = gr.File(file_types=[".txt"], label="Upload a .txt file")
26
- sample = gr.Number(value=200, label="Sample passages (0 = all)")
27
- min_words = gr.Number(value=80, label="Min words per passage")
28
- chunk = gr.Number(value=1200, label="Chunk size (chars)")
29
- btn_prep = gr.Button("Prepare passages")
30
- info_data = gr.Markdown()
31
-
32
- with gr.Tab("Generation"):
33
- model_box = gr.Textbox(value=os.getenv("OPENAI_MODEL", MODEL), label="OpenAI model")
34
- temperature = gr.Slider(0, 1, value=0.0, step=0.1, label="Temperature")
35
- btn_gen = gr.Button("Generate with OpenAI")
36
- progress_gen = gr.Markdown()
37
- rec_table = gr.Dataframe(headers=["#", "status", "chars"], row_count=(0, "dynamic"))
38
-
39
- with gr.Tab("Review"):
40
- idx = gr.Number(value=0, label="Record #")
41
- inp = gr.Textbox(lines=12, label="Input passage", interactive=False)
42
- out = gr.Textbox(lines=12, label="Output (edit)")
43
- status = gr.Dropdown(["accepted","needs_work","unreviewed"], value="unreviewed", label="Status")
44
- btn_load = gr.Button("Load record")
45
- btn_save = gr.Button("Save changes")
46
- review_msg = gr.Markdown()
47
-
48
- with gr.Tab("Export"):
49
- btn_jsonl = gr.Button("Download JSONL")
50
- dl_path = gr.Textbox(label="JSONL path")
51
- push_repo = gr.Textbox(value="", label="HF Dataset repo (e.g. yourname/gutenberg_dialogue_v1)")
52
- private_toggle = gr.Checkbox(value=True, label="Private repo")
53
- btn_push = gr.Button("Push to Hugging Face Hub")
54
- export_msg = gr.Markdown()
55
-
56
- with gr.Tab("Settings"):
57
- instr = gr.Textbox(value=INSTRUCTION, lines=14, label="Canonical instruction (read-only)", interactive=False)
58
- gr.Markdown("Set `OPENAI_API_KEY` & optional `OPENAI_MODEL` in Space Secrets.")
59
-
60
- def on_prepare(src_mode, hf_id, upload, sample, min_words, chunk):
61
- passages, dataset_id = load_from_hub_or_upload(src_mode, hf_id, upload, int(sample), int(min_words), int(chunk))
62
- SESSION["passages"] = passages
63
- SESSION["dataset_id"] = dataset_id
64
- SESSION["records"] = []
65
- return f"Prepared {len(passages)} passages from: {dataset_id}"
66
-
67
- def on_generate(model_name, temperature):
68
- if not SESSION["passages"]:
69
- return "No passages prepared yet.", []
70
- os.environ["OPENAI_MODEL"] = model_name
71
- rows, records, ok, bad = [], [], 0, 0
72
- for i, p in enumerate(SESSION["passages"]):
73
- y = call_teacher(p, temperature=float(temperature))
74
- status = "unreviewed"
75
- if y and validate_output(y):
76
- ok += 1
77
- else:
78
- bad += 1
79
- y = y or ""
80
- status = "needs_work"
81
- rec = {
82
- "task": "dialogue_format",
83
- "instruction": INSTRUCTION,
84
- "input": p,
85
- "output": y,
86
- "meta": {
87
- "chars": len(p),
88
- "model": os.getenv("OPENAI_MODEL", model_name),
89
- "status": status,
90
- "source": "LLM",
91
- "dataset_id": SESSION["dataset_id"]
92
- }
93
  }
94
- records.append(rec)
95
- rows.append([i, status, len(p)])
96
- SESSION["records"] = records
97
- return f"Generated {ok} valid, {bad} need work.", rows
98
-
99
- def on_load(idx):
100
- i = int(idx)
101
- r = SESSION["records"][i]
102
- return r["input"], r["output"], r["meta"]["status"]
103
-
104
- def on_save(idx, output, status):
105
- i = int(idx)
106
- SESSION["records"][i]["output"] = output
107
- SESSION["records"][i]["meta"]["status"] = status
108
- return f"Saved record #{i} as {status}."
109
-
110
- def on_export_jsonl():
111
- path = "workspace/dataset.jsonl"
112
- to_jsonl(SESSION["records"], path)
113
- return path
114
-
115
- def on_push(push_repo, private_toggle):
116
- if not push_repo:
117
- return "Provide a repo name like 'yourname/gutenberg_dialogue_v1'"
118
- ds = to_hf_dataset(
119
- SESSION["records"],
120
- save_to="workspace/hf_dataset",
121
- push_repo=push_repo,
122
- private=bool(private_toggle),
123
- token=os.getenv("HF_TOKEN")
124
- )
125
- return f"Pushed {len(ds)} records to {push_repo}"
126
-
127
- btn_prep.click(on_prepare, [src_mode, hf_id, upload, sample, min_words, chunk], [info_data])
128
- btn_gen.click(on_generate, [model_box, temperature], [progress_gen, rec_table])
129
- btn_load.click(on_load, [idx], [inp, out, status])
130
- btn_save.click(on_save, [idx, out, status], [review_msg])
131
- btn_jsonl.click(on_export_jsonl, [], [dl_path])
132
- btn_push.click(on_push, [push_repo, private_toggle], [export_msg])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  if __name__ == "__main__":
135
- demo.launch()
 
1
  import os
2
  import gradio as gr
3
+ from typing import List, Dict, Any, Tuple
4
+
5
+ # Local imports
6
  from data_io import load_from_hub_or_upload
7
  from teacher import call_teacher, MODEL, INSTRUCTION
8
  from validators import validate_output
9
  from exporters import to_jsonl, to_hf_dataset
10
 
11
# ---------------- State ----------------
# Module-level mutable store shared by every callback in this file.
# NOTE(review): a single module-level dict means all concurrent users of
# the Space share one session — confirm this is acceptable for deployment.
SESSION: Dict[str, Any] = {
    "passages": [],     # prepared text passages awaiting generation (set by on_prepare)
    "records": [],      # generated/reviewed records (set by on_generate, edited by on_save)
    "dataset_id": None, # identifier of the source dataset or uploaded file
}
17
 
18
# Markdown blurb rendered at the top of the UI (below the H1 title).
DESCRIPTION = (
    "### Dialogue→Speaker Dataset Builder\n"
    "Prepare passages, generate `Speaker N:` dialogue via the OpenAI API, "
    "review & edit, and export JSONL / HF Datasets."
)
23
+
24
+ # ---------------- Callbacks ----------------
25
def on_prepare(src_mode: str, hf_id: str, upload, sample: float, min_words: float, chunk: float) -> str:
    """Load source passages and reset the session.

    Pulls passages either from a Hugging Face dataset or from an uploaded
    .txt file, stores them in SESSION, discards any previously generated
    records, and returns a one-line status message for the UI.
    """
    # gr.Number can hand back None (cleared field); coerce to the UI defaults.
    def _as_int(value, fallback: int) -> int:
        return int(value) if value else fallback

    passages, dataset_id = load_from_hub_or_upload(
        src_mode,
        hf_id,
        upload,
        _as_int(sample, 0),
        _as_int(min_words, 80),
        _as_int(chunk, 1200),
    )
    SESSION["passages"] = passages
    SESSION["dataset_id"] = dataset_id
    SESSION["records"] = []
    return f"Prepared {len(passages)} passages from: {dataset_id}"
34
+
35
def on_generate(model_name: str, temperature: float) -> Tuple[str, list]:
    """Run the teacher model over every prepared passage.

    Builds one record per passage (instruction/input/output plus metadata),
    stores the full list in SESSION["records"], and returns a summary
    message together with table rows of [index, status, passage length].
    """
    passages = SESSION["passages"]
    if not passages:
        return "No passages prepared yet.", []

    # The teacher module reads the model name from this env var.
    os.environ["OPENAI_MODEL"] = model_name

    records: list = []
    rows: list = []
    ok = 0
    bad = 0
    temp = float(temperature)

    for position, passage in enumerate(passages):
        answer = call_teacher(passage, temperature=temp)
        if answer and validate_output(answer):
            ok += 1
            state = "unreviewed"
        else:
            bad += 1
            answer = answer or ""
            state = "needs_work"

        records.append({
            "task": "dialogue_format",
            "instruction": INSTRUCTION,
            "input": passage,
            "output": answer,
            "meta": {
                "chars": len(passage),
                "model": os.getenv("OPENAI_MODEL", model_name),
                "status": state,
                "source": "LLM",
                "dataset_id": SESSION["dataset_id"],
            },
        })
        rows.append([position, state, len(passage)])

    SESSION["records"] = records
    return f"Generated {ok} valid, {bad} need work.", rows
68
+
69
def on_load(idx: float) -> Tuple[str, str, str]:
    """Fetch record #idx for review: (input passage, output text, status).

    Raises IndexError if idx is outside the generated records — Gradio
    surfaces that as an error toast in the UI.
    """
    record = SESSION["records"][int(idx)]
    return record["input"], record["output"], record["meta"]["status"]
73
+
74
def on_save(idx: float, output: str, status: str) -> str:
    """Write the edited output and review status back into record #idx."""
    position = int(idx)
    record = SESSION["records"][position]
    record["output"] = output
    record["meta"]["status"] = status
    return f"Saved record #{position} as {status}."
79
+
80
def on_export_jsonl() -> str:
    """Serialize all session records to a JSONL file and return its path."""
    path = "workspace/dataset.jsonl"
    # Fix: on a fresh Space the workspace/ directory does not exist yet,
    # which would make the write inside to_jsonl fail with FileNotFoundError.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    to_jsonl(SESSION["records"], path)
    return path
84
+
85
def on_push(push_repo: str, private_toggle: bool) -> str:
    """Build a HF dataset from the session records and push it to the Hub.

    Returns a usage hint when no repo name was given, otherwise a summary
    of how many records were pushed.
    """
    if not push_repo:
        return "Provide a repo name like 'yourname/gutenberg_dialogue_v1'"
    dataset = to_hf_dataset(
        SESSION["records"],
        save_to="workspace/hf_dataset",
        push_repo=push_repo,
        private=bool(private_toggle),
        token=os.getenv("HF_TOKEN"),  # Space secret; None falls back to cached login
    )
    return f"Pushed {len(dataset)} records to {push_repo}"
96
+
97
# ---------------- UI ----------------
def build_ui() -> gr.Blocks:
    """Assemble the Gradio Blocks interface and wire callbacks.

    Component creation order matters: Gradio lays widgets out in the order
    they are instantiated inside each Blocks/Tab context, so statements in
    this function must not be reordered.
    """
    with gr.Blocks(title="Dialogue→Speaker Dataset Builder", theme=gr.themes.Default()) as demo:
        gr.Markdown("# Dialogue→Speaker Dataset Builder")
        gr.Markdown(DESCRIPTION)

        # Tab 1: choose a source and chunk it into passages.
        with gr.Tab("Data"):
            src_mode = gr.Radio(["HF Dataset", "Upload .txt"], value="HF Dataset", label="Source")
            hf_id = gr.Textbox(value="Navanjana/Gutenberg_books", label="HF dataset id (train split)")
            upload = gr.File(file_types=[".txt"], label="Upload a .txt file")
            sample = gr.Number(value=200, label="Sample passages (0 = all)")
            min_words = gr.Number(value=80, label="Min words per passage")
            chunk = gr.Number(value=1200, label="Chunk size (chars)")
            btn_prep = gr.Button("Prepare passages")
            info_data = gr.Markdown()  # status line filled by on_prepare

        # Tab 2: run the teacher model over the prepared passages.
        with gr.Tab("Generation"):
            model_box = gr.Textbox(value=os.getenv("OPENAI_MODEL", MODEL), label="OpenAI model")
            temperature = gr.Slider(0, 1, value=0.0, step=0.1, label="Temperature")
            btn_gen = gr.Button("Generate with OpenAI")
            progress_gen = gr.Markdown()  # summary line filled by on_generate
            rec_table = gr.Dataframe(value=[], headers=["#", "status", "chars"], row_count=0, col_count=3, interactive=False)

        # Tab 3: load one record by index, edit its output, save it back.
        with gr.Tab("Review"):
            idx = gr.Number(value=0, label="Record #")
            inp = gr.Textbox(lines=12, label="Input passage", interactive=False)
            out = gr.Textbox(lines=12, label="Output (edit)")
            status = gr.Dropdown(["accepted","needs_work","unreviewed"], value="unreviewed", label="Status")
            btn_load = gr.Button("Load record")
            btn_save = gr.Button("Save changes")
            review_msg = gr.Markdown()

        # Tab 4: export to JSONL on disk or push to the Hugging Face Hub.
        with gr.Tab("Export"):
            btn_jsonl = gr.Button("Download JSONL")
            dl_path = gr.Textbox(label="JSONL path")
            push_repo = gr.Textbox(value="", label="HF Dataset repo (e.g. yourname/gutenberg_dialogue_v1)")
            private_toggle = gr.Checkbox(value=True, label="Private repo")
            btn_push = gr.Button("Push to Hugging Face Hub")
            export_msg = gr.Markdown()

        # Tab 5: read-only view of the canonical instruction + setup notes.
        with gr.Tab("Settings"):
            instr = gr.Textbox(value=INSTRUCTION, lines=14, label="Canonical instruction (read-only)", interactive=False)
            gr.Markdown("Set `OPENAI_API_KEY` & optional `OPENAI_MODEL` in Space Secrets.")

        # Wire callbacks: (handler, input components, output components).
        btn_prep.click(on_prepare, [src_mode, hf_id, upload, sample, min_words, chunk], [info_data])
        btn_gen.click(on_generate, [model_box, temperature], [progress_gen, rec_table])
        btn_load.click(on_load, [idx], [inp, out, status])
        btn_save.click(on_save, [idx, out, status], [review_msg])
        btn_jsonl.click(on_export_jsonl, [], [dl_path])
        btn_push.click(on_push, [push_repo, private_toggle], [export_msg])

    return demo

# Built at import time so `gradio app.py` / Spaces can find `demo`.
demo = build_ui()

if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- gradio>=4.44.0
2
  datasets>=3.0.0
3
  ftfy
4
  regex
 
1
+ gradio>=4.44.1
2
  datasets>=3.0.0
3
  ftfy
4
  regex
teacher.py CHANGED
@@ -19,7 +19,8 @@ Requirements:
19
  """
20
 
21
  MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
22
- client = OpenAI()
 
23
  STRICT_SUFFIX = "\n\nIMPORTANT: Every line must start with 'Speaker N: ' and include at least two lines."
24
 
25
  def call_teacher(passage: str, temperature: float = 0.0, max_retries: int = 2) -> Optional[str]:
 
19
  """
20
 
21
  MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
22
+ client = OpenAI() # uses OPENAI_API_KEY
23
+
24
  STRICT_SUFFIX = "\n\nIMPORTANT: Every line must start with 'Speaker N: ' and include at least two lines."
25
 
26
  def call_teacher(passage: str, temperature: float = 0.0, max_retries: int = 2) -> Optional[str]:
validators.py CHANGED
@@ -1,4 +1,5 @@
1
  import regex as re
 
2
  SPEAKER_LINE = re.compile(r"^(Speaker\s+\d+):\s")
3
 
4
  def validate_output(text: str, min_lines: int = 2, max_speaker_index: int = 9) -> bool:
 
1
  import regex as re
2
+
3
  SPEAKER_LINE = re.compile(r"^(Speaker\s+\d+):\s")
4
 
5
  def validate_output(text: str, min_lines: int = 2, max_speaker_index: int = 9) -> bool: