File size: 4,485 Bytes
604cf79
 
 
5adcd02
 
 
 
 
 
 
 
604cf79
5adcd02
 
 
 
 
 
 
604cf79
5adcd02
 
 
 
 
 
 
 
 
 
 
604cf79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5adcd02
 
 
 
 
 
 
 
9df608d
5adcd02
 
 
 
 
604cf79
5adcd02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import gc
import json
import os
import shutil

import gradio as gr
import torch
import safetensors
# hack to load safetensors.torch
from safetensors.torch import save_file
from huggingface_hub import hf_hub_download

def check_simple_file(st_weights_path, torch_weights_path):
    """Compare a safetensors checkpoint file against a torch checkpoint file.

    Args:
        st_weights_path: local path to a `.safetensors` file.
        torch_weights_path: local path to a `pytorch_model*.bin` file.

    Returns:
        A list of errors (empty list means the two files match). Always a
        list — the key-mismatch case previously returned a bare string,
        which the sharded caller concatenated character-by-character via
        `total_errors += ...`.
    """
    st_weights = safetensors.torch.load_file(st_weights_path)
    # map_location="cpu" so the comparison never requires a GPU and tensors
    # from both files live on the same device for assert_close.
    torch_weights = torch.load(torch_weights_path, map_location="cpu")

    # check if keys are the same
    if st_weights.keys() != torch_weights.keys():
        # report both directions of the difference, not just one
        unexpected_keys = st_weights.keys() - torch_weights.keys()
        missing_keys = torch_weights.keys() - st_weights.keys()
        return [
            f"keys are not the same ! Conversion failed - unexpected keys are: {unexpected_keys}, missing keys are: {missing_keys} for the file {st_weights_path}"
        ]

    total_errors = []

    # check all weights are the same; collect every mismatch instead of
    # stopping at the first one
    for key, value in st_weights.items():
        try:
            torch.testing.assert_close(torch_weights[key], value, rtol=1e-5, atol=1e-5)
        except Exception as e:
            total_errors.append(e)

    # drop the (potentially huge) state dicts before returning
    del st_weights
    del torch_weights
    gc.collect()

    return total_errors

def run(pr_number, model_id):
    """Check whether the safetensors weights in a Hub PR match the repo's torch weights.

    Args:
        pr_number: number of the PR that added the safetensors files
            (checked out via `refs/pr/{pr_number}`).
        model_id: Hub repo id, e.g. "org/model".

    Returns:
        A human-readable status string (rendered as markdown by the UI).
    """
    is_sharded = False
    try:
        st_sharded_index_file = hf_hub_download(repo_id=model_id, filename="model.safetensors.index.json", revision=f"refs/pr/{pr_number}")
        torch_sharded_index_file = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin.index.json")

        is_sharded = True
    except Exception:
        # No index files -> the checkpoint is (probably) not sharded; fall
        # back to the single-file path below. Deliberately best-effort.
        pass

    if not is_sharded:
        try:
            st_weights_path = hf_hub_download(repo_id=model_id, filename="model.safetensors", revision=f"refs/pr/{pr_number}")
            torch_weights_path = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin")
        except Exception as e:
            return f"Error: {e} | \n Maybe you specified model ids or PRs that does not exist or does not contain any `model.safetensors` or `pytorch_model.bin` files"

        total_errors = check_simple_file(st_weights_path, torch_weights_path)
        # check_simple_file may report a key mismatch as a plain string
        if isinstance(total_errors, str):
            return total_errors
    else:
        total_errors = []
        # close the index files instead of leaking the handles
        with open(st_sharded_index_file, "r") as f:
            total_st_files = set(json.load(f)["weight_map"].values())
        with open(torch_sharded_index_file, "r") as f:
            total_pt_files = set(json.load(f)["weight_map"].values())

        if len(total_st_files) != len(total_pt_files):
            return f"weights are not the same there are {len(total_st_files)} files in safetensors and {len(total_pt_files)} files in torch ! Conversion failed - {len(total_errors)} errors : {total_errors}"

        # check if the shard-name mapping is correct (pytorch_model-X.bin -> model-X.safetensors)
        if not all(pt_file.replace("pytorch_model", "model").replace(".bin", ".safetensors") in total_st_files for pt_file in total_pt_files):
            return "Conversion failed! Safetensors files are not the same as torch files - make sure you have the correct files in the PR"

        for pt_file in total_pt_files:
            st_file = pt_file.replace("pytorch_model", "model").replace(".bin", ".safetensors")

            st_weights_path = hf_hub_download(repo_id=model_id, filename=st_file, revision=f"refs/pr/{pr_number}")
            torch_weights_path = hf_hub_download(repo_id=model_id, filename=pt_file)

            file_errors = check_simple_file(st_weights_path, torch_weights_path)
            # check_simple_file may report a key mismatch as a plain string;
            # guard so we never += a string into the error list
            if isinstance(file_errors, str):
                return file_errors
            total_errors += file_errors

            # remove shard files to keep disk usage bounded. hf_hub_download
            # returns *file* paths, so os.remove — shutil.rmtree raised
            # NotADirectoryError here. Best-effort: a failed cleanup must
            # not abort the check.
            for path in (st_weights_path, torch_weights_path):
                try:
                    os.remove(path)
                except OSError:
                    pass

    if len(total_errors) > 0:
        return f"weights are not the same ! Conversion failed - {len(total_errors)} errors : {total_errors}"

    return "Safetensors and torch weights are the same! Conversion successful - you can safely merge the PR"

# UI help text shown above the input fields (rendered as markdown by Gradio).
DESCRIPTION = """
The steps are the following:
- You got tagged in a Safetensors PR? Check if it works!
- Identify the PR number that you want to check.
- Paste the model id and the PR number below
- Click "Submit"
- That's it! You'll get feedback if the user successfully converted a model in `safetensors` format or not!

This checker also supports sharded weights.
"""

# Gradio front-end: two single-line text boxes (PR number, model id) feed
# `run`, whose returned status string is rendered as markdown.
pr_number_box = gr.Text(max_lines=1, label="PR number")
model_id_box = gr.Text(max_lines=1, label="model_id")
output_markdown = gr.Markdown(label="output")

demo = gr.Interface(
    fn=run,
    inputs=[pr_number_box, model_id_box],
    outputs=[output_markdown],
    title="SafeTensors Checker",
    description=DESCRIPTION,
    article="Check out the [Safetensors repo on GitHub](https://github.com/huggingface/safetensors)",
    allow_flagging="never",
).queue()

demo.launch()