lhoestq HF staff committed on
Commit
8b00326
·
1 Parent(s): 32d7d7d

final touches

Browse files
Files changed (3) hide show
  1. Dockerfile +2 -0
  2. run_job.py +15 -17
  3. start_app.py +37 -15
Dockerfile CHANGED
@@ -25,3 +25,5 @@ RUN pip install -r requirements.txt
25
 
26
  # Run app
27
  ENTRYPOINT python start_app.py
 
 
 
25
 
26
  # Run app
27
  ENTRYPOINT python start_app.py
28
+
29
+ # PS: Run with a variable SYSTEM=spaces to enable OAuth on Hugging Face Spaces
run_job.py CHANGED
@@ -32,7 +32,6 @@ DATA_CARD = "# Dataset Card for {dst}\n\nDataset prepared from {src} using\n\n``
32
  def sql(src: str, dst: str, query: str, config: str = "default", split: str = "train", private: bool = False, dry_run: bool = False):
33
  import os
34
  import duckdb
35
- from contextlib import nullcontext
36
  from huggingface_hub import CommitScheduler, DatasetCard
37
 
38
  class CommitAndCleanScheduler(CommitScheduler):
@@ -47,25 +46,24 @@ def sql(src: str, dst: str, query: str, config: str = "default", split: str = "t
47
  for path in self.last_uploaded:
48
  path.unlink(missing_ok=True)
49
 
50
- with nullcontext() if dry_run else CommitAndCleanScheduler(repo_id=dst, repo_type="dataset", folder_path="dst", path_in_repo="data", every=0.1, private=private):
51
- con = duckdb.connect(":memory:", config=CONFIG)
52
- src_kwargs = con.sql(CMD_SRC_KWARGS.format(src=src, config=config, split=split)).df().to_dict(orient="records")
53
- if not src_kwargs:
54
- raise ValueError(f'Invalid --config "{config}" for dataset "{src}", please select a valid dataset config/subset.')
55
 
56
- con.sql((CMD_SRC_DRY_RUN if dry_run else CMD_SRC).format(**src_kwargs[0]))
57
- if dry_run:
58
- print(f"Sample data from '{src}' that would be written to dataset '{dst}':\n")
59
- else:
60
- con.sql("PRAGMA enable_progress_bar;")
61
 
62
- result = con.sql((CMD_DST_DRY_RUN if dry_run else CMD_DST).format(query=query.rstrip("\n ;")))
63
- DatasetCard(DATA_CARD.format(src=src, dst=dst, query=query)).save("dst/README.md")
64
- if dry_run:
65
- print(result.df().to_markdown())
66
- else:
67
- print("done")
68
 
 
 
 
 
 
69
 
70
  if __name__ == '__main__':
71
  fire.Fire(sql)
 
32
  def sql(src: str, dst: str, query: str, config: str = "default", split: str = "train", private: bool = False, dry_run: bool = False):
33
  import os
34
  import duckdb
 
35
  from huggingface_hub import CommitScheduler, DatasetCard
36
 
37
  class CommitAndCleanScheduler(CommitScheduler):
 
46
  for path in self.last_uploaded:
47
  path.unlink(missing_ok=True)
48
 
49
+ con = duckdb.connect(":memory:", config=CONFIG)
50
+ src_kwargs = con.sql(CMD_SRC_KWARGS.format(src=src, config=config, split=split)).df().to_dict(orient="records")
51
+ if not src_kwargs:
52
+ raise ValueError(f'Invalid --config "{config}" for dataset "{src}", please select a valid dataset config/subset.')
 
53
 
54
+ con.sql((CMD_SRC_DRY_RUN if dry_run else CMD_SRC).format(**src_kwargs[0]))
 
 
 
 
55
 
56
+ if dry_run:
57
+ print(f"Sample data from '{src}' that would be written to dataset '{dst}':\n")
58
+ result = con.sql(CMD_DST_DRY_RUN.format(query=query.rstrip("\n ;")))
59
+ print(result.df().to_markdown())
60
+ return
 
61
 
62
+ with CommitAndCleanScheduler(repo_id=dst, repo_type="dataset", folder_path="dst", path_in_repo="data", every=0.1, private=private):
63
+ con.sql("PRAGMA enable_progress_bar;")
64
+ result = con.sql(CMD_DST.format(query=query.rstrip("\n ;")))
65
+ DatasetCard(DATA_CARD.format(src=src, dst=dst, query=query)).save("dst/README.md")
66
+ print("done")
67
 
68
  if __name__ == '__main__':
69
  fire.Fire(sql)
start_app.py CHANGED
@@ -1,6 +1,8 @@
 
1
  import os
2
  import re
3
  import subprocess
 
4
  import yaml
5
 
6
  import gradio as gr
@@ -24,15 +26,23 @@ except Exception:
24
 
25
  DRY_RUN = bool(HELP) and bool(m :=re.search("--dry(-|_)run", HELP)) and m.group(0)
26
 
27
- def parse_log(line: str, pbars: dict[str, float]):
28
- if (percent_match := re.search("\\d+(?:\\.\\d+)?%", line)) and any(c in line.split("%")[1][:10] for c in "|█▌"):
29
- [pbars.pop(desc) for desc, percent in pbars.items() if percent == 1.]
30
- percent = float(percent_match.group(0)[:-1]) / 100
31
- desc = line[:percent_match.start()].strip() or "Progress"
32
- pbars[desc] = percent
33
- yield ""
34
- else:
35
- yield line
 
 
 
 
 
 
 
 
36
 
37
  def dry_run(src, config, split, dst, query):
38
  if not all([src, config, split, dst, query]):
@@ -77,15 +87,26 @@ def run(src, config, split, dst, query, oauth_token: gr.OAuthToken | None, profi
77
  pbars = {"Finished with an error ❌": 1.0}
78
  else:
79
  job_id = resp.json()["metadata"]["job_id"]
 
 
80
  resp = requests.get(
81
  f"https://huggingface.co/api/jobs/{username}/{job_id}/logs-stream",
 
 
82
  )
83
- for line in iter(resp.raw.readline, b""):
84
- logs += parse_log(line.decode(), pbars=pbars)
85
  yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
86
- job_status = requests.get(
87
- f"https://huggingface.co/api/jobs/{username}/{job_id}",
88
- ).json()
 
 
 
 
 
 
 
89
  if job_status["status"]["stage"] == "COMPLETED":
90
  pbars = {"Finished ✅": 1.0}
91
  else:
@@ -119,7 +140,8 @@ with gr.Blocks() as demo:
119
  if DRY_RUN:
120
  dry_run_button = gr.Button("Dry-Run")
121
  progress_labels= gr.Label(visible=False, label="Progress")
122
- output_markdown = gr.Markdown(label="Output logs")
 
123
  run_button.click(run, inputs=[dataset_dropdown, subset_dropdown, split_dropdown, dst_dropdown, query_textarea], outputs=[progress_labels, output_markdown])
124
  if DRY_RUN:
125
  dry_run_button.click(dry_run, inputs=[dataset_dropdown, subset_dropdown, split_dropdown, dst_dropdown, query_textarea], outputs=[progress_labels, output_markdown])
 
1
+ import json
2
  import os
3
  import re
4
  import subprocess
5
+ import time
6
  import yaml
7
 
8
  import gradio as gr
 
26
 
27
  DRY_RUN = bool(HELP) and bool(m :=re.search("--dry(-|_)run", HELP)) and m.group(0)
28
 
29
+ def parse_log(line: str, pbars: dict[str, float] = None):
30
+ if line.startswith("data: {"):
31
+ data = json.loads(line[len("data: "):])
32
+ data, timestamp = data["data"], data["timestamp"]
33
+ if pbars is not None and data.startswith("===== Job started at"):
34
+ pbars.pop("Starting ⚙️", None)
35
+ pbars["Running 🏃"] = 0.0
36
+ return f"[{timestamp}] {data}\n\n"
37
+ elif pbars is not None and (percent_match := re.search("\\d+(?:\\.\\d+)?%", data)) and any(c in data.split("%")[1][:10] for c in "|█▌"):
38
+ pbars.pop("Running 🏃", None)
39
+ [pbars.pop(desc) for desc, percent in pbars.items() if percent == 1.]
40
+ percent = float(percent_match.group(0)[:-1]) / 100
41
+ desc = data[:percent_match.start()].strip() or "Progress"
42
+ pbars[desc] = percent
43
+ else:
44
+ return f"[{timestamp}] {data}\n\n"
45
+ return ""
46
 
47
  def dry_run(src, config, split, dst, query):
48
  if not all([src, config, split, dst, query]):
 
87
  pbars = {"Finished with an error ❌": 1.0}
88
  else:
89
  job_id = resp.json()["metadata"]["job_id"]
90
+ pbars = {"Starting ⚙️": 0.0}
91
+ yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
92
  resp = requests.get(
93
  f"https://huggingface.co/api/jobs/{username}/{job_id}/logs-stream",
94
+ headers={"Authorization": f"Bearer {token}"},
95
+ stream=True
96
  )
97
+ for line in resp.iter_lines():
98
+ logs += parse_log(line.decode("utf-8"), pbars=pbars)
99
  yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
100
+ job_status = {"status": {"stage": "RUNNING"}}
101
+ while True:
102
+ job_status = requests.get(
103
+ f"https://huggingface.co/api/jobs/{username}/{job_id}",
104
+ headers={"Authorization": f"Bearer {token}"}
105
+ ).json()
106
+ if job_status["status"]["stage"] == "RUNNING":
107
+ time.sleep(1)
108
+ else:
109
+ break
110
  if job_status["status"]["stage"] == "COMPLETED":
111
  pbars = {"Finished ✅": 1.0}
112
  else:
 
140
  if DRY_RUN:
141
  dry_run_button = gr.Button("Dry-Run")
142
  progress_labels= gr.Label(visible=False, label="Progress")
143
+ with gr.Accordion("Details", open=False):
144
+ output_markdown = gr.Markdown(label="Output logs")
145
  run_button.click(run, inputs=[dataset_dropdown, subset_dropdown, split_dropdown, dst_dropdown, query_textarea], outputs=[progress_labels, output_markdown])
146
  if DRY_RUN:
147
  dry_run_button.click(dry_run, inputs=[dataset_dropdown, subset_dropdown, split_dropdown, dst_dropdown, query_textarea], outputs=[progress_labels, output_markdown])