Spaces:

sasha
/

AI_Carbon

Runtime error

File size: 9,487 Bytes

393f86d
 
32ac110
 
ce8bd36
393f86d
 
db8cc8e
ce8bd36
db8cc8e
32ac110
 
 
db8cc8e
32ac110
 
 
9ca002e
ce8bd36
32ac110
 
 
 
393f86d
 
 
 
 
ce8bd36
 
393f86d
 
 
 
 
 
 
 
 
 
 
 
ce8bd36
393f86d
 
 
 
 
 
 
 
 
 
 
ce8bd36
393f86d
ce8bd36
393f86d
795ccdc
393f86d
 
ce8bd36
393f86d
ce8bd36
 
 
393f86d
ce8bd36
 
795ccdc
f924cbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32ac110
ce8bd36
 
 
 
 
 
 
 
 
 
 
 
248e2bb
 
795ccdc
ce8bd36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393f86d
ce8bd36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a23bf4a
ce8bd36
 
 
 
 
 
 
 
 
248e2bb

import streamlit as st
import pandas as pd
import os, csv
from huggingface_hub import hf_hub_download, HfApi
import math

# Hugging Face Hub access token taken from the environment (None when unset).
HF_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN')

# Path of the local copy of the submissions CSV, fetched from the
# "sasha/co2_submissions" dataset repo; write_to_csv appends to this file.
CACHED_FILE_PATH = hf_hub_download(
    repo_type="dataset",
    repo_id="sasha/co2_submissions",
    filename="co2_emissions.csv",
)

# Hub client that write_to_csv uses to upload the CSV after each submission.
api = HfApi()

def write_to_csv(hardware, training_time, provider, carbon_intensity, dynamic_emissions):
    """Append one submission to the cached CSV and upload the file to the Hub.

    The five arguments are written, in this order, as a single CSV row. At the
    call site in this script they are the selected GPU name, the number of GPU
    hours, the selected provider, the carbon intensity and the computed
    dynamic emissions.

    Side effects:
        Appends a row to the file at ``CACHED_FILE_PATH`` and uploads that
        file as ``co2_emissions.csv`` to the ``sasha/co2_submissions`` dataset
        repository.
    """
    submission_row = [hardware, training_time, provider, carbon_intensity, dynamic_emissions]
    # Explicit encoding so the written bytes do not depend on the platform default.
    with open(CACHED_FILE_PATH, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(submission_row)
    api.upload_file(
        path_or_fileobj=CACHED_FILE_PATH,
        path_in_repo="co2_emissions.csv",
        repo_id="sasha/co2_submissions",
        repo_type="dataset",
        # Authenticate explicitly with the token read at start-up; when it is
        # None the client falls back to its default credential lookup.
        token=HF_TOKEN,
    )

# Page-level settings: browser-tab title and full-width layout.
st.set_page_config(layout="wide", page_title="AI Carbon Calculator")



# CSV tables hosted in the mlco2/impact GitHub repository.
tdp_url = "https://raw.githubusercontent.com/mlco2/impact/master/data/gpus.csv"
compute_url = "https://raw.githubusercontent.com/mlco2/impact/master/data/impact.csv"
electricity_url = "https://raw.githubusercontent.com/mlco2/impact/master/data/2021-10-27yearly_averages.csv"

# Google Sheets document exported as CSV, one URL per worksheet.
# Sheet names are already URL-encoded (%20 for spaces).
server_sheet_id = "1DqYgQnEDLQVQm5acMAhLgHLD8xXCG9BIrk-_Nv6jF3k"
server_sheet_name = "Server%20Carbon%20Footprint"
embodied_gpu_sheet_name = "Scope%203%20Ratios"

_sheet_csv_template = "https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
server_url = _sheet_csv_template.format(sheet_id=server_sheet_id, sheet_name=server_sheet_name)
embodied_gpu_url = _sheet_csv_template.format(sheet_id=server_sheet_id, sheet_name=embodied_gpu_sheet_name)

# GPU table; the calculator reads its 'name' and 'tdp_watts' columns.
TDP = pd.read_csv(tdp_url)

# Cloud instance table; its 'provider', 'region', 'impact' and 'PUE' columns
# are used by the calculators below.
instances = pd.read_csv(compute_url)

# Provider choices for the UI: the table's provider names upper-cased, followed
# by a catch-all entry for self-hosted hardware. That last label is compared
# verbatim further down, so its spelling must stay in sync with those checks.
providers = [
    provider_name.upper() for provider_name in instances['provider'].unique().tolist()
] + ['Local/Private Infastructure']

# Divisor used to express kilograms of CO2 as miles driven in an average car.
kg_per_mile = 0.348

# Additional tables; loaded here but not referenced in the visible part of the script.
electricity = pd.read_csv(electricity_url)
servers = pd.read_csv(server_url)
embodied_gpu = pd.read_csv(embodied_gpu_url)

# Banner image is currently disabled:
# st.image('images/MIT_carbon_image_narrow.png', use_column_width=True, caption = 'Image credit: ')
st.title("AI Carbon Calculator")

# Headline, followed by a short pitch linking to the BLOOM preprint.
st.markdown("## Estimate your AI model's CO2 carbon footprint! 🌎🖥️🌎")

st.markdown(
    "##### The calculators below will help you calculate different aspects of your model's carbon footprint, as we did for"
    " BLOOM 🌸, a 176-billion parameter language model [(see our preprint!)](https://arxiv.org/abs/2211.02001)"
)
st.markdown(
    "##### Don't forget to share your data to help us get a better idea of AI model's carbon emissions!"
)

# --- Dynamic emissions: electricity consumed by the training run itself ---
st.markdown('### Dynamic Emissions 🚀')
st.markdown('##### These are the emissions produced by generating the electricity necessary for powering model training.')
with st.expander("Calculate the dynamic emissions of your model"):
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        hardware = st.selectbox('GPU used', TDP['name'].tolist())
        # TDP (watts) of the first table row whose name matches the selection.
        gpu_tdp = TDP['tdp_watts'][TDP['name'] == hardware].tolist()[0]
        st.markdown("Different GPUs have different TDP (Thermal Design Power), which impacts how much energy you use.")
    with col2:
        # min_value keeps negative durations (and thus negative emissions) out.
        training_time = st.number_input('Total number of GPU hours', min_value=0.0)
        st.markdown('This is calculated by multiplying the number of GPUs you used by the training time: '
                    'i.e. if you used 100 GPUs for 10 hours, this is equal to 100x10 = 1,000 GPU hours.')
    with col3:
        provider = st.selectbox('Provider used', providers)
        st.markdown('If you can\'t find your provider here, select "Local/Private Infrastructure".')
    with col4:
        # The comparison string must match the entry appended to `providers`
        # character for character (including its spelling).
        if provider != 'Local/Private Infastructure':
            # Provider names are stored lower-case in the instance table.
            provider_rows = instances['provider'] == provider.lower()
            provider_instances = instances['region'][provider_rows].unique().tolist()
            # This widget picks a region, so label it as such (it previously
            # repeated the 'Provider used' label of the neighbouring column).
            region = st.selectbox('Region used', provider_instances)
            carbon_intensity = instances['impact'][provider_rows & (instances['region'] == region)].tolist()[0]
        else:
            carbon_intensity = st.number_input('Carbon intensity of your energy grid, in grams of CO2 per kWh',
                                               min_value=0.0)
            st.markdown('You can consult a resource like the [IEA](https://www.iea.org/countries) or '
                        ' [Electricity Map](https://app.electricitymaps.com/) to get this information.')
    # watts * hours * (g CO2 / kWh) / 1e6 -> kilograms of CO2
    # (one factor of 1000 for W -> kW, another for g -> kg).
    dynamic_emissions = round(gpu_tdp * training_time * carbon_intensity / 1000000)
    st.metric(label="Dynamic emissions", value=str(dynamic_emissions) + ' kilograms of CO2eq')
    st.markdown('This is roughly equivalent to ' + str(round(dynamic_emissions / kg_per_mile, 1)) + ' miles driven in an average US car'
                ' produced in 2021. [(Source: energy.gov)](https://www.energy.gov/eere/vehicles/articles/fotw-1223-january-31-2022-average-carbon-dioxide-emissions-2021-model-year)')
# --- Experimental emissions: electricity consumed by ablations, baselines and tests ---
st.markdown('### Experimental Emissions 👩‍🔬')
st.markdown('##### These are the emissions produced by generating the electricity necessary for powering the experiments and tests needed to pick your final model architecture '
            'and parameters.')
with st.expander("Calculate the experimental emissions of your model"):
    st.markdown('##### Consult your training logs to figure out how many ablations, baselines and experiments were run before converging on the final model.')
    # Defaults to the training time entered for the dynamic emissions.
    experimentation_time = st.number_input(label='Number of hours of experimentation run', value=training_time)
    st.markdown('##### As a baseline, language models such as [OPT](https://arxiv.org/pdf/2205.01068.pdf) and [BLOOM](https://arxiv.org/abs/2211.02001)'
                ' found that experimentation roughly doubles the amount of compute used by training the model itself.')
    # Same conversion as for the dynamic emissions:
    # watts * hours * (g CO2 / kWh) / 1e6 -> kilograms of CO2.
    experimental_emissions = round(gpu_tdp * experimentation_time * carbon_intensity / 1000000)
    # Show the computed figure (the metric used to display a hard-coded 0.0).
    st.metric(label="Experimental emissions", value=str(experimental_emissions) + ' kilograms of CO2eq')

# --- Idle emissions: overhead of the surrounding infrastructure, modelled via PUE ---
st.markdown('### Idle Emissions 🌐')
st.markdown('##### These are the emissions produced by generating the electricity needed to power the rest of the infrastructure '
            'used for model training -- the datacenter, network, heating/cooling, storage, etc.')
with st.expander("Calculate the idle emissions of your model"):
    st.markdown('##### A proxy often used to reflect idle emissions is PUE (Power Usage Effectiveness), which represents '
                ' the ratio of energy used for computing overheads like cooling, which varies depending on the data center.')

    # Provider-wide average PUE and the page each figure is taken from; used
    # when the instance table has no PUE for the selected region.
    provider_average_pue = {
        'AWS': (1.135, 'https://www.cloudcarbonfootprint.org/docs/methodology/'),
        'GCP': (1.1, 'https://www.google.ca/about/datacenters/efficiency/'),
        'AZURE': (1.185, 'https://www.cloudcarbonfootprint.org/docs/methodology/'),
        'OVH': (1.28, 'https://corporate.ovhcloud.com/en-ca/sustainability/environment/'),
        'SCALEWAY': (1.35, 'https://pue.dc3.scaleway.com/en/'),
    }
    industry_average_pue = 1.58

    # The comparison string must match the entry appended to `providers`
    # character for character (including its spelling).
    if provider == 'Local/Private Infastructure':
        # No region is selected for local infrastructure, so there is nothing
        # to look up in the instance table: ask the user directly.
        st.markdown('##### Try to find the PUE of your local infrastructure. Otherwise, you can use the industry average, 1.58:')
        # PUE is total facility energy over IT energy, hence never below 1.
        pue = st.number_input('PUE of your infrastructure', min_value=1.0, value=industry_average_pue)
    else:
        pue_matches = instances['PUE'][(instances['provider'] == provider.lower()) & (instances['region'] == region)].tolist()
        # An empty match is treated like a missing value instead of raising IndexError.
        pue = pue_matches[0] if pue_matches else math.nan
        if not math.isnan(pue):
            st.markdown('##### The PUE of the datacenter you used is: ')
            st.markdown('#### ' + str(pue))
        elif provider in provider_average_pue:
            st.markdown('##### The exact information isn\'t available for this datacenter! We will use your provider\'s average instead, which is:')
            pue, source_url = provider_average_pue[provider]
            st.markdown('#### ' + str(pue) + " [(source)](" + source_url + ")")
        else:
            # Neither a datacenter value nor a provider average is known; fall
            # back to the industry average so the total below stays computable
            # (round() raises on NaN).
            st.markdown('##### The exact information isn\'t available for this datacenter or provider! We will use the industry average instead, which is:')
            pue = industry_average_pue
            st.markdown('#### ' + str(pue))
    pue_emissions = round((experimental_emissions + dynamic_emissions) * pue)
    st.metric(label="Emissions considering PUE", value=str(pue_emissions) + ' kilograms of CO2eq')

# --- Embodied emissions: placeholder section, no calculator yet ---
st.markdown('### Embodied Emissions 🖥️🔨')
st.markdown(
    '##### These are the emissions associated with the materials and processes involved in producing'
    ' the computing equipment needed for AI models.'
)
with st.expander("Calculate the embodied emissions of your model"):
    st.markdown('##### These are the trickiest emissions to track down since a lot of the information needed is missing!')

# CSS injected into the page: green background and larger text for a button
# that is the first child of a div.stButton element.
button_css = """
<style>
div.stButton > button:first-child {
    background-color: rgb(80, 200, 120);
    background-image: none;
    font-size: 20px;
    height: 3em;
}
</style>"""
st.markdown(button_css, unsafe_allow_html=True)


def _share_submission(*_unused):
    """Button callback: record the current inputs and dynamic emissions."""
    write_to_csv(hardware, training_time, provider, carbon_intensity, dynamic_emissions)


# Three equal columns; the button goes in the middle one.
left_col, share_col, right_col = st.columns(3)
with share_col:
    st.button(label="Anonymously share my data!", on_click=_share_submission)

# --- Methodology: background on the approach, with an illustrative figure ---
st.markdown('### Methodology')
with st.expander("More information about our Methodology"):
    st.markdown(
        "Building on the work of the [ML CO2 Calculator](https://mlco2.github.io/impact/), this tool allows you to consider"
        " other aspects of your model's carbon footprint based on the LCA methodology."
    )
    st.image(
        'images/LCA_CO2.png',
        caption='The LCA methodology - the parts in green are those we focus on.',
    )