giskard-evaluator / cicd /retriever.py
inoki-giskard's picture
Init cicd with commit 9bf277b
b6a7e2b
raw
history blame
1.71 kB
import argparse
import huggingface_hub
def model_has_dataset(model):
    """Return whether *model* carries at least one ``dataset:`` tag.

    Args:
        model: Object exposing a ``tags`` attribute that holds an iterable
            of tag strings, or ``None`` when no tags are available.

    Returns:
        bool: ``True`` if any tag starts with ``"dataset:"``, otherwise
        ``False`` (including when the tag list is empty or ``None``).
    """
    # A missing tag list is treated like an empty one instead of raising
    # TypeError while iterating over None.
    tags = model.tags or ()
    return any(tag.startswith("dataset:") for tag in tags)
if __name__ == "__main__":
    # Command-line entry point: list Hugging Face models of one type, keep
    # those tagged with at least one dataset, and dump their metadata to a
    # file named ``models_<model_type>.<format>`` in the working directory.
    parser = argparse.ArgumentParser(
        prog="Giskard Retriever", description="Retrieves HF models that are bound to datasets."
    )
    parser.add_argument(
        "--model_type",
        help="Hugging Face model types. default: text-classification",
        required=False,
        default="text-classification",
    )
    parser.add_argument(
        "--output_format",
        help="Format of the information retrieved. Default: parquet. Options: parquet, csv, json.",
        # Reject unknown formats up front; previously an unrecognised value
        # fetched the whole model list and then silently wrote nothing.
        choices=("parquet", "csv", "json"),
        default="parquet",
    )
    args = parser.parse_args()
    MODEL_TYPE = args.model_type

    # Imported only after the arguments are validated, so ``--help`` and
    # argument errors do not depend on pandas being importable.
    import pandas as pd

    # Lazily filter the hub listing (requested sorted by likes, descending)
    # down to the models that declare a dataset tag.
    models_with_dataset = filter(
        model_has_dataset,
        huggingface_hub.list_models(filter=MODEL_TYPE, sort="likes", direction=-1),
    )

    dataset_prefix = "dataset:"
    df = pd.DataFrame(
        [
            {
                "modelId": m.modelId,
                "modelType": MODEL_TYPE,
                "author": m.author,
                "downloads": m.downloads,
                "likes": m.likes,
                # Strip the "dataset:" prefix to keep only the dataset names.
                "datasets": [
                    t[len(dataset_prefix):] for t in m.tags if t.startswith(dataset_prefix)
                ],
            }
            for m in models_with_dataset
        ]
    )

    output_format = args.output_format
    output_path = f"models_{MODEL_TYPE}.{output_format}"
    if output_format == "parquet":
        df.to_parquet(output_path, index=False)
    elif output_format == "csv":
        df.to_csv(output_path, index=False)
    else:
        # ``index=False`` is rejected by pandas for the default "columns"
        # orient (ValueError); "records" emits one object per row and never
        # includes the index, which is what dropping the index intended.
        df.to_json(output_path, orient="records")