Spaces:

Dovakiins
/

qwerrwe

Build error

App Files Files Community

hamel committed on Feb 9

Commit

9bca7db

•

1 Parent(s): 91cf4ee

add support for https remote yamls (#1277)

Browse files

Files changed (8) hide show

.mypy.ini +3 -0
README.md +7 -0
requirements-dev.txt +1 -0
requirements.txt +1 -0
src/axolotl/cli/__init__.py +53 -2
src/axolotl/cli/preprocess.py +2 -1
src/axolotl/cli/shard.py +2 -1
src/axolotl/cli/train.py +2 -2

.mypy.ini CHANGED Viewed

@@ -32,6 +32,9 @@ ignore_missing_imports = True
 [mypy-bitsandbytes]
 ignore_missing_imports = True
 [mypy-datasets]
 ignore_missing_imports = True

 [mypy-bitsandbytes]
 ignore_missing_imports = True
+[mypy-requests]
+ignore_missing_imports = True
 [mypy-datasets]
 ignore_missing_imports = True

README.md CHANGED Viewed

@@ -121,6 +121,10 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
 # gradio
 accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
     --lora_model_dir="./lora-out" --gradio
 ```
 ## Installation
@@ -988,6 +992,9 @@ Run
 accelerate launch -m axolotl.cli.train your_config.yml
 ```
 #### Preprocess dataset
 You can optionally pre-tokenize dataset with the following before finetuning.

 # gradio
 accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
     --lora_model_dir="./lora-out" --gradio
+# remote yaml files - the yaml config can be hosted on a public URL
+# Note: the URL must link directly to the **raw** yaml file
+accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/examples/openllama-3b/lora.yml
 ```
 ## Installation
 accelerate launch -m axolotl.cli.train your_config.yml
 ```
+> [!TIP]
+> You can also reference a config file that is hosted on a public URL, for example `accelerate launch -m axolotl.cli.train https://yourdomain.com/your_config.yml`
 #### Preprocess dataset
 You can optionally pre-tokenize dataset with the following before finetuning.

requirements-dev.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 pre-commit
 black
 mypy

 pre-commit
 black
 mypy
+types-requests

requirements.txt CHANGED Viewed

@@ -9,6 +9,7 @@ deepspeed>=0.13.1
 addict
 fire
 PyYAML>=6.0
 datasets>=2.15.0
 flash-attn==2.3.3
 sentencepiece

 addict
 fire
 PyYAML>=6.0
+requests
 datasets>=2.15.0
 flash-attn==2.3.3
 sentencepiece

src/axolotl/cli/__init__.py CHANGED Viewed

@@ -1,16 +1,20 @@
 """Prepare and train a model on a dataset. Can also infer from a model or merge lora"""
 import importlib
 import logging
 import math
 import os
 import random
 import sys
 from pathlib import Path
 from threading import Thread
 from typing import Any, Dict, List, Optional, Union
 import gradio as gr
 import torch
 import yaml
@@ -59,6 +63,52 @@ def print_axolotl_text_art(suffix=None):
         print(ascii_art)
 def get_multi_line_input() -> Optional[str]:
     print("Give me an instruction (Ctrl + D to submit): ")
     instruction = ""
@@ -270,9 +320,10 @@ def check_not_in(list1: List[str], list2: Union[Dict[str, Any], List[str]]) -> b
     return not any(el in list2 for el in list1)
-def load_cfg(config: Path = Path("examples/"), **kwargs):
     if Path(config).is_dir():
-        config = choose_config(config)
     # load the config from the yaml file
     with open(config, encoding="utf-8") as file:

 """Prepare and train a model on a dataset. Can also infer from a model or merge lora"""
 import importlib
+import json
 import logging
 import math
 import os
 import random
 import sys
+import tempfile
 from pathlib import Path
 from threading import Thread
 from typing import Any, Dict, List, Optional, Union
+from urllib.parse import urlparse
 import gradio as gr
+import requests
 import torch
 import yaml
         print(ascii_art)
+def check_remote_config(config: Union[str, Path]):
+    # Check if the config is a valid HTTPS URL to a .yml or .yaml file
+    if not (isinstance(config, str) and config.startswith("https://")):
+        return config  # Return the original value if it's not a valid URL
+    filename = os.path.basename(urlparse(config).path)
+    temp_dir = tempfile.mkdtemp()
+    try:
+        response = requests.get(config, timeout=30)
+        response.raise_for_status()  # Check for HTTP errors
+        content = response.content
+        try:
+            # Try parsing as JSON first to catch cases where JSON content is mistakenly considered YAML
+            json.loads(content)
+            # Log a warning but do not raise an error; JSON is technically valid YAML - this can happen when you forget to point to a raw github link
+            LOG.warning(
+                f"Warning: The content of the file at {config} is JSON, which is technically valid YAML but might not be intended."
+            )
+        except json.JSONDecodeError:
+            # If it's not valid JSON, verify it's valid YAML
+            try:
+                yaml.safe_load(content)
+            except yaml.YAMLError as err:
+                raise ValueError(
+                    f"Failed to parse the content at {config} as YAML: {err}"
+                ) from err
+        # Write the content to a file if it's valid YAML (or JSON treated as YAML)
+        output_path = Path(temp_dir) / filename
+        with open(output_path, "wb") as file:
+            file.write(content)
+        LOG.info(
+            f"Using the following config obtained from {config}:\n\n{content.decode('utf-8')}\n"
+        )
+        return output_path
+    except requests.RequestException as err:
+        # This catches all requests-related exceptions including HTTPError
+        raise RuntimeError(f"Failed to download {config}: {err}") from err
+    except Exception as err:
+        # Catch-all for any other exceptions
+        raise err
 def get_multi_line_input() -> Optional[str]:
     print("Give me an instruction (Ctrl + D to submit): ")
     instruction = ""
     return not any(el in list2 for el in list1)
+def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs):
+    config = check_remote_config(config)
     if Path(config).is_dir():
+        config = choose_config(Path(config))
     # load the config from the yaml file
     with open(config, encoding="utf-8") as file:

src/axolotl/cli/preprocess.py CHANGED Viewed

@@ -3,6 +3,7 @@ CLI to run training on a model
 """
 import logging
 from pathlib import Path
 import fire
 import transformers
@@ -23,7 +24,7 @@ from axolotl.prompt_strategies.sharegpt import register_chatml_template
 LOG = logging.getLogger("axolotl.cli.preprocess")
-def do_cli(config: Path = Path("examples/"), **kwargs):
     # pylint: disable=duplicate-code
     print_axolotl_text_art()
     parsed_cfg = load_cfg(config, **kwargs)

 """
 import logging
 from pathlib import Path
+from typing import Union
 import fire
 import transformers
 LOG = logging.getLogger("axolotl.cli.preprocess")
+def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
     # pylint: disable=duplicate-code
     print_axolotl_text_art()
     parsed_cfg = load_cfg(config, **kwargs)

src/axolotl/cli/shard.py CHANGED Viewed

@@ -3,6 +3,7 @@ CLI to shard a trained model into 10GiB chunks
 """
 import logging
 from pathlib import Path
 import fire
 import transformers
@@ -25,7 +26,7 @@ def shard(
     model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
-def do_cli(config: Path = Path("examples/"), **kwargs):
     # pylint: disable=duplicate-code
     print_axolotl_text_art()
     parsed_cfg = load_cfg(config, **kwargs)

 """
 import logging
 from pathlib import Path
+from typing import Union
 import fire
 import transformers
     model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
+def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
     # pylint: disable=duplicate-code
     print_axolotl_text_art()
     parsed_cfg = load_cfg(config, **kwargs)

src/axolotl/cli/train.py CHANGED Viewed

@@ -3,7 +3,7 @@ CLI to run training on a model
 """
 import logging
 from pathlib import Path
-from typing import Tuple
 import fire
 from transformers.hf_argparser import HfArgumentParser
@@ -25,7 +25,7 @@ from axolotl.train import train
 LOG = logging.getLogger("axolotl.cli.train")
-def do_cli(config: Path = Path("examples/"), **kwargs):
     # pylint: disable=duplicate-code
     parsed_cfg = load_cfg(config, **kwargs)
     parser = HfArgumentParser((TrainerCliArgs))

 """
 import logging
 from pathlib import Path
+from typing import Tuple, Union
 import fire
 from transformers.hf_argparser import HfArgumentParser
 LOG = logging.getLogger("axolotl.cli.train")
+def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
     # pylint: disable=duplicate-code
     parsed_cfg = load_cfg(config, **kwargs)
     parser = HfArgumentParser((TrainerCliArgs))