dataset-rewriter

Sleeping

App Files Files Community

lhoestq HF staff committed on Sep 18, 2024

Commit

063480a

1 Parent(s): 1fa40b1

better error message

Browse files

Files changed (1) hide show

app.py +19 -12

app.py CHANGED Viewed

@@ -35,7 +35,7 @@ MAX_NUM_ROWS_TO_REWRITE = int(os.environ.get("MAX_NUM_ROWS_TO_REWRITE") or 1000)
 assert MAX_NUM_ROWS_TO_REWRITE in PARTIAL_SUFFIX, "allowed max num rows are 100, 1000, 10000, 100000 and 1000000"
 NUM_PARALLEL_CALLS = 10
-NUM_ROWS_PER_CALL = 5
 MAX_PROGRESS_UPDATES_PER_SECOND = 4
 REWRITE_DATASET_PREVIEW = (
     "A Machine Learning practitioner is looking for a dataset similar to '{dataset}' but slightly different. "
@@ -171,19 +171,23 @@ with gr.Blocks(css=css) as demo:
         while batch := list(islice(it, n)):
             yield batch
-    def stream_reponse(messages: list[dict[str: str]], response_format=None) -> Iterator[str]:
         for _ in range(3):
             message = None
             try:
                 for message in client.chat_completion(
                     messages=messages,
-                    max_tokens=5000,
                     stream=True,
                     top_p=0.8,
                     seed=42,
                     response_format=response_format
                 ):
                     yield message.choices[0].delta.content
             except requests.exceptions.ConnectionError as e:
                 if message:
@@ -217,7 +221,7 @@ with gr.Blocks(css=css) as demo:
         response_format = {"type": "json", "value": {"properties": {"data": {"type": "array", "items": format, "minItems": len(rows), "maxItems": len(rows)}}, "required": ["data"]}}
         try:
             yield from ijson.items(StringIteratorIO(stream_reponse(messages, response_format=response_format)), "data.item", buf_size=4, use_float=True)
-        except ijson.IncompleteJSONError as e:
             print(f"{type(e).__name__}: {e}")
             print("Warning: Some rows were missing during ReWriting.")
@@ -389,14 +393,17 @@ with gr.Blocks(css=css) as demo:
         current = 0
         _last_time = time.time()
-        for step in iflatmap_unordered(run, kwargs_iterable=[{"i": i} for i in range(num_parallel_calls)]):
-            current += step
-            if _last_time + 1 / MAX_PROGRESS_UPDATES_PER_SECOND < time.time():
-                _last_time = time.time()
-                yield {
-                    full_dataset_generation_label: gr.Label({f"⚙️ ReWriting {dataset}": current / total}),
-                    pretty_full_dataset_generation_output: gr.DataFrame(pd.DataFrame([row for rows in parallel_output_rows for row in rows]))
-                }
         yield {
             full_dataset_generation_label: gr.Label({f"⚙️ ReWriting {dataset}": current / total}),
             pretty_full_dataset_generation_output: gr.DataFrame(pd.DataFrame([row for rows in parallel_output_rows for row in rows]))

 assert MAX_NUM_ROWS_TO_REWRITE in PARTIAL_SUFFIX, "allowed max num rows are 100, 1000, 10000, 100000 and 1000000"
 NUM_PARALLEL_CALLS = 10
+NUM_ROWS_PER_CALL = 3
 MAX_PROGRESS_UPDATES_PER_SECOND = 4
 REWRITE_DATASET_PREVIEW = (
     "A Machine Learning practitioner is looking for a dataset similar to '{dataset}' but slightly different. "
         while batch := list(islice(it, n)):
             yield batch
+    class ContextTooLongError(ValueError):
+        pass
+    def stream_reponse(messages: list[dict[str: str]], response_format=None, max_tokens=5000) -> Iterator[str]:
         for _ in range(3):
             message = None
             try:
                 for message in client.chat_completion(
                     messages=messages,
+                    max_tokens=max_tokens,
                     stream=True,
                     top_p=0.8,
                     seed=42,
                     response_format=response_format
                 ):
+                    if message is None or not message.choices or message.choices[0] is None or message.choices[0].delta is None or message.choices[0].delta.content is None:
+                        raise ContextTooLongError(f"messages: {sum(len(message['content']) for message in messages)} chars, max_tokens: {max_tokens}")
                     yield message.choices[0].delta.content
             except requests.exceptions.ConnectionError as e:
                 if message:
         response_format = {"type": "json", "value": {"properties": {"data": {"type": "array", "items": format, "minItems": len(rows), "maxItems": len(rows)}}, "required": ["data"]}}
         try:
             yield from ijson.items(StringIteratorIO(stream_reponse(messages, response_format=response_format)), "data.item", buf_size=4, use_float=True)
+        except (ijson.IncompleteJSONError) as e:
             print(f"{type(e).__name__}: {e}")
             print("Warning: Some rows were missing during ReWriting.")
         current = 0
         _last_time = time.time()
+        try:
+            for step in iflatmap_unordered(run, kwargs_iterable=[{"i": i} for i in range(num_parallel_calls)]):
+                current += step
+                if _last_time + 1 / MAX_PROGRESS_UPDATES_PER_SECOND < time.time():
+                    _last_time = time.time()
+                    yield {
+                        full_dataset_generation_label: gr.Label({f"⚙️ ReWriting {dataset}": current / total}),
+                        pretty_full_dataset_generation_output: gr.DataFrame(pd.DataFrame([row for rows in parallel_output_rows for row in rows]))
+                    }
+        except ContextTooLongError:
+            raise gr.Error("Input dataset has too long context for the model")
         yield {
             full_dataset_generation_label: gr.Label({f"⚙️ ReWriting {dataset}": current / total}),
             pretty_full_dataset_generation_output: gr.DataFrame(pd.DataFrame([row for rows in parallel_output_rows for row in rows]))