File size: 12,323 Bytes
393f86d
 
32ac110
 
ce8bd36
393f86d
 
db8cc8e
ce8bd36
db8cc8e
32ac110
 
419df8a
1f6c998
db8cc8e
32ac110
419df8a
32ac110
9ca002e
ce8bd36
32ac110
 
 
 
393f86d
 
 
 
 
ce8bd36
 
393f86d
 
 
 
 
 
 
 
 
 
 
 
ce8bd36
393f86d
 
 
 
 
 
 
419df8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393f86d
419df8a
1cf9ef7
 
 
393f86d
 
 
 
795ccdc
393f86d
 
ce8bd36
419df8a
 
 
393f86d
ce8bd36
 
795ccdc
419df8a
f924cbe
419df8a
f924cbe
1f6c998
f924cbe
eda3176
1f6c998
f924cbe
419df8a
1f6c998
419df8a
f924cbe
 
419df8a
f924cbe
 
1f6c998
f924cbe
 
 
 
 
eda3176
f924cbe
1f6c998
f924cbe
32ac110
ce8bd36
 
 
 
c0f2fe1
 
 
ce8bd36
 
1f6c998
ce8bd36
419df8a
248e2bb
 
c0f2fe1
1f6c998
ce8bd36
 
c0f2fe1
ce8bd36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393f86d
ce8bd36
 
419df8a
ce8bd36
c0f2fe1
ce8bd36
c0f2fe1
ce8bd36
 
 
 
 
419df8a
 
 
 
 
 
 
11ad112
419df8a
 
 
 
 
 
ce8bd36
 
 
 
 
a23bf4a
419df8a
ce8bd36
419df8a
ce8bd36
 
1f6c998
419df8a
 
1cf9ef7
1f6c998
 
 
ce8bd36
 
248e2bb
 
 
419df8a
248e2bb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import streamlit as st
import pandas as pd
import os, csv
from huggingface_hub import hf_hub_download, HfApi
import math

# Hub token from the environment.
# NOTE(review): HF_TOKEN appears unused below — huggingface_hub reads the env
# var itself for authenticated calls; confirm before removing.
HF_TOKEN = os.getenv('HUGGING_FACE_HUB_TOKEN')

# Download (and locally cache) the community submissions CSV at startup.
# NOTE(review): network I/O on every app start; raises if the Hub is unreachable.
CACHED_FILE_PATH = hf_hub_download(repo_id="sasha/co2_submissions", filename="co2_emissions.csv", repo_type="dataset")

# Client used to push the updated CSV back to the dataset repo in write_to_csv().
api = HfApi()

def write_to_csv(hardware, gpu_tdp, num_gpus, training_time, provider, carbon_intensity, dynamic_emissions, experimentation_time, experimental_emissions, pue, pue_emissions, embodied_type, embodied_emissions, model_info):
    """Append one submission row to the cached CSV and push it to the Hub.

    Side effects: sets the ``is_shared`` session flag (hides the share button
    on the next rerun), appends a row to the locally cached CSV, and uploads
    the whole file back to the ``sasha/co2_submissions`` dataset repo.
    All arguments are written as a single CSV row, in the order given.
    """
    st.session_state["is_shared"] = True
    # BUGFIX: pin the encoding so the shared dataset stays valid UTF-8
    # regardless of the host platform's default locale encoding.
    with open(CACHED_FILE_PATH, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([hardware, gpu_tdp, num_gpus, training_time, provider, carbon_intensity, dynamic_emissions, experimentation_time, experimental_emissions, pue, pue_emissions, embodied_type, embodied_emissions, model_info])
    # Sync the updated file back to the public dataset repository.
    api.upload_file(
        path_or_fileobj=CACHED_FILE_PATH,
        path_in_repo="co2_emissions.csv",
        repo_id="sasha/co2_submissions",
        repo_type="dataset",
    )

st.set_page_config(
    page_title="AI Carbon Calculator",
    layout="wide",
)


# --- External data sources ---

# Accelerator TDP table and cloud instance metadata from the ML CO2 Impact project.
tdp_url = "https://raw.githubusercontent.com/mlco2/impact/master/data/gpus.csv"
compute_url = "https://raw.githubusercontent.com/mlco2/impact/master/data/impact.csv"

# Yearly-averaged grid carbon intensity per region.
electricity_url = "https://raw.githubusercontent.com/mlco2/impact/master/data/2021-10-27yearly_averages.csv"

# Google Sheets (exported as CSV) with server carbon-footprint data.
server_sheet_id = "1DqYgQnEDLQVQm5acMAhLgHLD8xXCG9BIrk-_Nv6jF3k"
server_sheet_name = "Server%20Carbon%20Footprint"
server_url = f"https://docs.google.com/spreadsheets/d/{server_sheet_id}/gviz/tq?tqx=out:csv&sheet={server_sheet_name}"

# Scope 3 (manufacturing) emission ratios, from the same spreadsheet.
embodied_gpu_sheet_name = "Scope%203%20Ratios"
embodied_gpu_url = f"https://docs.google.com/spreadsheets/d/{server_sheet_id}/gviz/tq?tqx=out:csv&sheet={embodied_gpu_sheet_name}"

TDP =pd.read_csv(tdp_url)

instances = pd.read_csv(compute_url)
providers = [p.upper() for p in instances['provider'].unique().tolist()]
# NOTE(review): 'Infastructure' is misspelled, but the identical misspelled
# literal is compared against elsewhere in this file — fix all occurrences
# together or none.
providers.append('Local/Private Infastructure')

### Default values
# Placeholders so every value exists (and can be shared) even before the user
# interacts with the corresponding calculator section; the widgets below
# overwrite them on each rerun.
hardware = "N/A"
gpu_tdp = 0
num_gpus = 0
training_time = 0.0
provider = "N/A"
carbon_intensity = 0.0
dynamic_emissions = 0.0
experimentation_time = 0.0
experimental_emissions = 0.0
pue = 1.0
pue_emissions = 0.0
embodied_type = 0.0
embodied_emissions = 0.0
model_info = "N/A"

### Conversion factors
# kg CO2eq per mile driven by an average 2021-model-year US car (energy.gov,
# cited in the dynamic-emissions section below).
kg_per_mile = 0.348
# Hourly manufacturing-emissions amortisation factor (see embodied section).
embodied_conversion_factor = 0.0289

# One-shot flag: flipped to True by write_to_csv() so the share button is
# replaced by a thank-you message after submission.
if "is_shared" not in st.session_state:
    st.session_state["is_shared"] = False

# Load the remaining reference tables (grid intensity, server and GPU
# embodied-carbon data) used by the calculator sections below.
electricity = pd.read_csv(electricity_url)
servers = pd.read_csv(server_url)
embodied_gpu = pd.read_csv(embodied_gpu_url)
#st.image('images/MIT_carbon_image_narrow.png', use_column_width=True, caption = 'Image credit: ')
st.title("AI Carbon Calculator")

st.markdown('## Estimate your AI model\'s CO2 carbon footprint! 🌎🖥️🌎')
st.markdown('### Calculating the carbon footprint of AI models can be hard... this tool is here to help!')
# BUGFIX: added the missing space before the implicit string concatenation —
# it used to render "...carbon footprintand don't forget...".
st.markdown('##### Use the calculators below to calculate different aspects of your model\'s carbon footprint ' \
            'and don\'t forget to share your data to help the community better understand the carbon emissions of AI!')

st.markdown('### Dynamic Emissions 🚀')
st.markdown('##### These are the emissions produced by generating the electricity necessary for powering model training.')
# Widget creation order defines the on-screen layout and Streamlit's rerun
# behavior — do not reorder these statements.
with st.expander("Calculate the dynamic emissions of your model"):
    col1, col2, col3, col4, col5 = st.columns(5)
    with col1:
        hardware = st.selectbox('Hardware used', TDP['name'].tolist())
        # TDP (watts) of the selected accelerator; drives the energy estimate.
        gpu_tdp = TDP['tdp_watts'][TDP['name'] == hardware].tolist()[0]
        st.markdown("Different hardware has different efficiencies, which impacts how much energy you use.")
    with col2:
       # NOTE(review): free-text input — float(num_gpus) below raises ValueError
       # on non-numeric input; st.number_input would validate. Kept as-is.
       num_gpus = st.text_input('Number of GPUs/CPUs/TPUs used', value = 16)
       st.markdown('If you can\'t find your hardware in the list, select the closest similar model.')
    with col3:
       training_time = st.number_input('Total training time (in hours)', value = 0.0)
       st.markdown('You can find this number in your training logs or TensorBoards')
    with col4:
       provider = st.selectbox('Provider used', providers)
       st.markdown('If you can\'t find your provider here, select "Local/Private Infrastructure".')
    with col5:
        # Comparison deliberately uses the misspelled 'Infastructure' literal —
        # it must match the entry appended to `providers` above.
        if provider != 'Local/Private Infastructure':
            provider_instances = instances['region'][instances['provider'] == provider.lower()].unique().tolist()
            # NOTE(review): `region` is only defined on this branch; the
            # datacenter section further down reads it — the local/private path
            # can leave it undefined.
            region = st.selectbox('Region used', provider_instances)
            # Grid carbon intensity (gCO2/kWh) for the chosen provider region.
            carbon_intensity = instances['impact'][(instances['provider'] == provider.lower()) & (instances['region'] == region)].tolist()[0]
        else:
            carbon_intensity = st.number_input('Carbon intensity of your energy grid, in grams of CO2 per kWh')
            st.markdown('You can consult a resource like the [IEA](https://www.iea.org/countries) or '
                        ' [Electricity Map](https://app.electricitymaps.com/) to get this information.')
    # W × devices × hours × gCO2/kWh, divided by 1e6 to convert Wh→kWh and g→kg.
    dynamic_emissions = round(gpu_tdp * float(num_gpus)*training_time * carbon_intensity/1000000)
    st.metric(label="Dynamic emissions", value=str(dynamic_emissions)+' kilograms of CO2eq')
    st.info('This is roughly equivalent to '+ str(round(dynamic_emissions/kg_per_mile,1)) + ' miles driven in an average US car'
    ' produced in 2021. [(Source: energy.gov)](https://www.energy.gov/eere/vehicles/articles/fotw-1223-january-31-2022-average-carbon-dioxide-emissions-2021-model-year)')

st.markdown('### Experimental Emissions 👩‍🔬')
st.markdown('##### These are the emissions produced by generating the electricity necessary for powering the experiments and tests needed to pick your final model architecture '
            'and parameters.')
with st.expander("Calculate the experimental emissions of your model"):
    #st.info('Consult your training logs to figure out how many ablations, baselines and experiments were run before converging on the final model.')
    # Defaults to the training time entered above, matching the guidance that
    # experimentation roughly doubles the compute of training itself.
    experimentation_time = st.number_input(label='Number of hours of experimentation (including ablations, baselines and evaluation)', value=training_time)
    st.info('As a baseline, language models such as [OPT](https://arxiv.org/pdf/2205.01068.pdf) and [BLOOM](https://arxiv.org/abs/2211.02001)'
                ' found that experimentation roughly doubles the amount of compute used by training the model itself.')
    # BUGFIX: include the device count, mirroring the dynamic-emissions formula
    # (W × devices × hours × gCO2/kWh / 1e6 → kg). The previous formula omitted
    # num_gpus, undercounting by that factor — with the default
    # experimentation_time == training_time, the estimate should equal the
    # dynamic emissions, which only holds with this factor included.
    experimental_emissions = round(gpu_tdp * float(num_gpus) * experimentation_time * carbon_intensity/1000000)
    st.metric(label="Experimental emissions", value=str(experimental_emissions)+' kilograms of CO2eq')

st.markdown('### Datacenter (Overhead) Emissions 🌐')
# BUGFIX: added the missing space — it used to render "infrastructureused".
st.markdown('##### These are the emissions produced by generating the electricity needed to power the rest of the infrastructure '
            'used for model training -- the datacenter, network, heating/cooling, storage, etc.')
with st.expander("Calculate the datacenter emissions of your model"):
    st.info('A proxy often used to reflect idle emissions is PUE (Power Usage Effectiveness), which represents '
                ' the ratio of energy used for computing overheads like cooling, which varies depending on the data center.')
    # Provider-wide average PUE (value, source URL) used when no
    # datacenter-specific figure is available in the instances table.
    provider_avg_pue = {
        'AWS': (1.135, "https://www.cloudcarbonfootprint.org/docs/methodology/"),
        'GCP': (1.1, "https://www.google.ca/about/datacenters/efficiency/"),
        'AZURE': (1.185, "https://www.cloudcarbonfootprint.org/docs/methodology/"),
        'OVH': (1.28, "https://corporate.ovhcloud.com/en-ca/sustainability/environment/"),
        'SCALEWAY': (1.35, "https://pue.dc3.scaleway.com/en/"),
    }
    # Misspelled 'Infastructure' kept deliberately — must match the provider list.
    if provider != 'Local/Private Infastructure':
        # BUGFIX: the table lookup is only valid for cloud providers. It used to
        # run unconditionally, raising a NameError ('region' is never defined on
        # the local/private path of the dynamic-emissions section).
        pue = instances['PUE'][(instances['provider'] == provider.lower()) & (instances['region'] == region)].tolist()[0]
        source = instances['PUE source'][(instances['provider'] == provider.lower()) & (instances['region'] == region)].tolist()[0]
        if math.isnan(pue):
            st.markdown('##### The exact information isn\'t available for this datacenter! We will use your provider\'s average instead, which is:')
            if provider in provider_avg_pue:
                pue, avg_source = provider_avg_pue[provider]
                st.markdown('#### ' + str(pue) + " [(source)](" + avg_source + ")")
        else:
            st.markdown('##### The PUE of the datacenter you used is: '+ str(pue) + ' [(source)]('+source+')')
    else:
        st.markdown('##### Try to find the PUE of your local infrastructure. Otherwise, you can use the industry average, 1.58:')
        # BUGFIX: the slider label used to read 'Total number of GPU hours'
        # (copy-paste error) although it sets the PUE.
        pue = st.slider('PUE of your infrastructure', value = 1.58)
    # Apply the overhead multiplier to the electricity-based emissions above.
    pue_emissions = round((experimental_emissions + dynamic_emissions) * pue)
    st.metric(label="Dynamic and experimental emissions, considering PUE", value=str(pue_emissions)+' kilograms of CO2eq')

st.markdown('### Embodied Emissions 🖥️🔨')
st.markdown('##### These are the emissions associated with the materials and processes involved in producing'
            ' the computing equipment needed for AI models.')
with st.expander("Calculate the embodied emissions of your model"):
    st.markdown('These are the trickiest emissions to track down since a lot of the information needed is missing.')
    st.markdown('##### Based on the number of GPUs and training time you indicated above, we can estimate that your model\'s embodied emissions are approximately: ')
    hardware_type = TDP['type'][TDP['name'] == hardware].tolist()[0]
    # Manufacturing emissions per extra accelerator, from the Scope 3 ratios
    # sheet. The 'additionnal' misspelling in the GPU key matches the
    # spreadsheet row label and must NOT be corrected here.
    if hardware_type == 'cpu':
        embodied_type = embodied_gpu['Value'][embodied_gpu['Ratio']=='Manufacturing emissions per additional CPU (kgCO₂eq)'].tolist()[0]
    elif hardware_type in ('gpu', 'tpu'):
        embodied_type = embodied_gpu['Value'][embodied_gpu['Ratio']=='Manufacturing emissions per additionnal GPU Card (kgCO₂eq)'].tolist()[0]
    # Linear amortisation: per-device manufacturing emissions scaled by the
    # hourly conversion factor, device count and training hours.
    embodied_emissions = round(int(embodied_type)*embodied_conversion_factor*float(num_gpus)*training_time/1000,1)
    st.metric(label="Embodied emissions", value=str(embodied_emissions)+' kilograms of CO2eq')
    # BUGFIX: user-facing typo 'ammortised' -> 'amortised'.
    st.markdown('This is a high-level estimate based on an hourly manufacturing emissions conversion factor (linearly amortised) of 0.0289 [(source)](https://docs.google.com/spreadsheets/d/1DqYgQnEDLQVQm5acMAhLgHLD8xXCG9BIrk-_Nv6jF3k/).')

st.markdown('### Model Information ℹ️')
st.markdown('##### If you want to share the link to your model code or paper, please do so below! Otherwise, your submission will be anonymous.')
# Optional free-text link, written as the last column of the shared CSV row.
model_info = st.text_input(label= "Enter a link to your model (optional)")

# Inline CSS restyling the Streamlit share button (large, green).
m = st.markdown("""
<style>
div.stButton > button:first-child {
    background-color: rgb(80, 200, 120);
    background-image: none;
    font-size: 25px;
    height: 3em;
    width: 15em;
}
</style>""", unsafe_allow_html=True)

# Center the share button in the middle of three columns.
buttoncol1, buttoncol2, buttoncol3 = st.columns(3)
with buttoncol2:
    if not st.session_state["is_shared"]:
        # on_click defers the CSV write/upload until the button is pressed;
        # the lambda captures the current values of all calculator outputs.
        # (The button's return value was previously bound to an unused
        # 'submitted' variable — dropped.)
        st.button(label="Share my CO2 data!", on_click = lambda *args: write_to_csv(hardware, gpu_tdp, num_gpus, training_time, provider, carbon_intensity, dynamic_emissions, experimentation_time, experimental_emissions, pue, pue_emissions, embodied_type, embodied_emissions, model_info))
    else:
        # write_to_csv() set the flag on a previous rerun — acknowledge instead.
        st.info('Thank you! Your data has been shared in https://huggingface.co/datasets/sasha/co2_submissions.')

st.markdown('### Methodology')
with st.expander("More information about our Methodology"):
    st.markdown('Building on the work of the [ML CO2 Calculator](https://mlco2.github.io/impact/), this tool allows you to consider'
                ' other aspects of your model\'s carbon footprint based on the LCA methodology.')
    # BUGFIX: removed a stray ')' literal that implicit string concatenation
    # appended to the rendered markdown.
    st.markdown('We considered all of these aspects when calculating the CO2 emissions of BLOOM 🌸, a 176-billion parameter language model [(see our preprint!)](https://arxiv.org/abs/2211.02001)')
    st.image('images/LCA_CO2.png', caption='The LCA methodology - the parts in green are those we focus on.')