"""Gradio dashboard comparing inference energy use and CO2 emissions of ML models.

The script loads pre-computed CSV measurements, derives a few helper columns,
builds eight Plotly scatter plots and lays them out in a Gradio ``Blocks`` app
which is launched at import time.
"""
import os

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px

datadir = 'data/emissions/complete'

# Seq2seq checkpoints fine-tuned for one task (as opposed to the multi-purpose
# 'flan' / 'bloomz' families that are detected by substring below).
seq2seq_finetuned = [
    'sshleifer/distilbart-xsum-12-6',
    'sshleifer/distilbart-cnn-12-6',
    'sshleifer/distilbart-cnn-6-6',
    'pszemraj/led-large-book-summary',
    'google/pegasus-xsum',
    'google/pegasus-large',
    'google/pegasus-multi_news',
    'facebook/bart-large-cnn',
    'ainize/bart-base-cnn',
]

# Fixed colour per value of the 'type' column so the evaluation plots agree.
color_discrete_map = {
    'Task-specific Encoder': '#636EFA',
    'Multi-purpose Seq2Seq': '#AB63FA',
    'Multi-purpose Decoder': '#00CC96',
    'Task-specific Seq2Seq': '#EF553B',
}


def multi_check(mname):
    """Return the architecture family of a multi-purpose model.

    Args:
        mname: Model identifier string.

    Returns:
        'Seq2Seq' if the name contains 'flan', 'Decoder' if it contains
        'bloomz', otherwise None.
    """
    if 'flan' in mname:
        return 'Seq2Seq'
    if 'bloomz' in mname:
        return 'Decoder'
    return None


def encoder_check(mname):
    """Return the model category label used to colour the evaluation plots.

    Args:
        mname: Model identifier string.

    Returns:
        One of the keys of ``color_discrete_map``. Names containing 'flan'
        take precedence over membership in ``seq2seq_finetuned``, which takes
        precedence over names containing 'bloomz'; everything else is
        'Task-specific Encoder'.
    """
    if 'flan' in mname:
        return 'Multi-purpose Seq2Seq'
    if mname in seq2seq_finetuned:
        return 'Task-specific Seq2Seq'
    if 'bloomz' in mname:
        return 'Multi-purpose Decoder'
    return 'Task-specific Encoder'


# Data loading
model_param_df = pd.read_csv('data/model_parameters.csv', header=0)
model_performance_df = pd.read_csv('data/performance.csv', header=0)
emissions_df = pd.read_csv('data/co2_data.csv', header=0)
modalities_df = pd.read_csv('data/modalities_data.csv', header=0)

# Rows whose task name contains 'zero' are the zero-shot runs.
is_zeroshot = emissions_df['task'].str.contains('zero')

# .copy() so the column assignments below write to independent frames instead
# of to slices of emissions_df (avoids SettingWithCopyWarning).
finetuned_df = emissions_df[~is_zeroshot].copy()
finetuned_df['task'] = finetuned_df['task'].str.replace('_', ' ')

zeroshot_df = emissions_df[is_zeroshot].copy()
zeroshot_df['task'] = zeroshot_df['task'].str.replace('_', ' ')
zeroshot_df['architecture_type'] = zeroshot_df['model'].map(multi_check)

# numeric_only=True: emissions_df also holds string columns (e.g. 'dataset'),
# which make a plain .mean() raise TypeError on pandas >= 2.0.
grouped_df = (
    emissions_df.groupby(['model', 'task'])
    .mean(numeric_only=True)
    .reset_index()
    .drop('task', axis=1)
)

performance_all = pd.merge(grouped_df, model_performance_df, on='model')
performance_all['type'] = performance_all['model'].map(encoder_check)
performance_all['log_emissions'] = np.log1p(performance_all["query emissions (g)"])

# Columns every per-task evaluation frame carries after its metric columns.
_SHARED_COLUMNS = ["query emissions (g)", 'model', 'type', 'num_params', 'log_emissions']


def _task_subset(task, metric_columns):
    """Return the rows of ``performance_all`` for ``task`` with the metric and shared columns."""
    # NOTE(review): 'task' was dropped from grouped_df above, so this column
    # presumably comes from performance.csv - confirm.
    rows = performance_all['task'] == task
    return performance_all.loc[rows, list(metric_columns) + _SHARED_COLUMNS]


SENT_METRICS = ['imdb (acc)', 'sst2 (acc)', 'tomatoes (acc)']
QA_METRICS = ['sciq (acc)', 'squad (f1)', 'squad_v2 (f1, has answer)']
SUMM_METRICS = ['samsum (rouge)', 'xsum (rouge)', 'cnn (rouge)']

sent_df = _task_subset('sentiment', SENT_METRICS)
qa_df = _task_subset('qa', QA_METRICS)
summ_df = _task_subset('summarization', SUMM_METRICS)


def _evaluation_scatter(frame, metric_columns, yaxis_title):
    """Build one 'score vs. model size' scatter plot for the evaluation row.

    Marker size encodes ``log_emissions`` and colour encodes ``type`` using the
    shared ``color_discrete_map`` so the three evaluation plots are consistent.
    """
    fig = px.scatter(
        frame,
        y=metric_columns,
        x="num_params",
        color="type",
        color_discrete_map=color_discrete_map,
        size="log_emissions",
        log_x=True,
        # A list is accepted by every Plotly version; a bare string is not.
        hover_data=["model"],
    )
    fig.update_layout(legend=dict(y=-0.4, x=0.3))
    fig.update_layout(yaxis_title=yaxis_title)
    return fig


# Figure loading
fig0 = px.scatter(emissions_df, x="num_params", y="query emissions (g)", color="model",
                  log_x=True, log_y=True)
fig0.update_layout(xaxis={'categoryorder': 'mean ascending'})
fig0.update_layout(yaxis_title='Total carbon emitted (g)', xaxis_title='Number of Parameters')

fig1 = px.scatter(finetuned_df, x="task", y="query_energy (kWh)", color="model", log_y=True)
fig1.update_layout(xaxis={'categoryorder': 'mean ascending'})
# NOTE(review): the plotted column is named '(kWh)' but the axis label says
# '(Wh)' - confirm which unit is correct before changing either.
fig1.update_layout(yaxis_title='Total energy used (Wh)', xaxis_title='Task')

fig2 = px.scatter(modalities_df, x="num_params", y="query emissions (g)", color="modality",
                  log_x=True, log_y=True, custom_data=['model', 'task'])
fig2.update_layout(xaxis_title='Model size (number of parameters)',
                   yaxis_title='Model emissions (g of CO2)')

fig3 = px.scatter(zeroshot_df, x="model", y="query emissions (g)", color="architecture_type",
                  size='num_params', log_y=True)
fig3.update_layout(xaxis={'categoryorder': 'mean ascending'})
fig3.update_layout(yaxis_title='Model emissions (g of CO2)', xaxis_title='Model')

fig4 = px.scatter(zeroshot_df, x="dataset", y="query emissions (g)", color="model",
                  size='num_params', log_y=True)
fig4.update_layout(xaxis={'categoryorder': 'mean ascending'})
# The x axis shows datasets here (it was mislabelled 'Model').
fig4.update_layout(yaxis_title='Model emissions (g of CO2)', xaxis_title='Dataset')

fig5 = _evaluation_scatter(sent_df, SENT_METRICS, 'Text Classification Accuracy')
fig6 = _evaluation_scatter(qa_df, QA_METRICS, 'QA accuracy/F1')
fig7 = _evaluation_scatter(summ_df, SUMM_METRICS, 'Summarization Rouge Score')

# Long Markdown strings use implicit concatenation rather than backslash line
# continuations, which would embed the source indentation in the text.
demo = gr.Blocks()

with demo:
    gr.Markdown("# CO2 Inference Demo 🌎 💻 ⚡")
    gr.Markdown(
        "### TL;DR - We ran a series of experiments to measure the energy efficiency "
        "and carbon emissions of different "
        "models from the HuggingFace Hub, and to see how different tasks and models compare. "
        "We found that multi-purpose, generative models are orders of magnitude more "
        "energy-intensive than task-specific systems "
        "for a variety of tasks, even for models with a similar number of parameters"
    )
    gr.Markdown("### Explore the plots below to get more insights about the different models and tasks from our study.")
    with gr.Accordion("More details about our methodology:", open=False):
        # NOTE(review): "sample from each of the models" probably means
        # "datasets" - wording left as-is pending confirmation.
        gr.Markdown(
            "We chose ten ML tasks: text classification, token classification, question answering, "
            "masked language modeling, text generation, summarization, image classification, "
            "object detection, "
            "image captioning and image generation. For each of the tasks, we chose three of the "
            "most downloaded datasets and 8 of the most "
            "downloaded models from the Hugging Face Hub. "
            "We ran each of the models ten times over a 1,000 sample from each of the models "
            "and measured the energy consumed and carbon emitted."
        )
    with gr.Row():
        with gr.Column():
            gr.Markdown("## All models from our study (carbon)")
            gr.Markdown("### Double click on the model name in the list on the right to isolate its datapoints:")
            gr.Markdown("The axes of the plot are in logarithmic scale, meaning that the difference between the least carbon-intensive and the most carbon-intensive models is over 9,000 times!")
            gr.Plot(fig0)
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Task-by-task comparison (energy)")
            gr.Markdown("### Grouping the models by task, we can see different patterns emerge:")
            gr.Markdown(
                "Image generation is by far the most energy- and carbon-intensive task "
                "from the ones studied, and text classification "
                "is the least."
            )
            gr.Plot(fig1)
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Modality comparison (carbon)")
            gr.Markdown("### Grouping the models by their modality shows different characteristics:")
            gr.Markdown(
                "We can see that tasks involving images (image-to-text, image-to-category) "
                "require more energy and emit more carbon "
                "than ones involving text."
            )
            gr.Plot(fig2)
    gr.Markdown("## Multi-task model comparison (carbon)")
    gr.Markdown("### Looking at the emissions of multi-task models, we can see that decoder-only models tend to emit more carbon compared to sequence-to-sequence ones.")
    gr.Markdown("### This pattern varies depending on the dataset and task - for summarization datasets (the 3 rightmost ones), the difference between models is less obvious.")
    with gr.Row():
        with gr.Column():
            gr.Plot(fig3)
        with gr.Column():
            gr.Plot(fig4)
    gr.Markdown("## Evaluations (accuracy vs carbon)")
    gr.Markdown("### Single-task models are, ceteris paribus, less carbon-intensive than multi-task models for all 3 tasks we looked at: ")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Sentiment Analysis")
            gr.Plot(fig5)
        with gr.Column():
            gr.Markdown("### Question Answering")
            gr.Plot(fig6)
        with gr.Column():
            gr.Markdown("### Summarization")
            gr.Plot(fig7)

demo.launch()