# Streamlit landing page for L2CEval.
#
# The script renders, top to bottom: a sidebar with a short "About" blurb,
# the project header (logo, title, authors, affiliations), an introduction to
# language-to-code (L2C) generation, a table of the evaluated tasks, a table
# of the evaluated open-source models, a list of the evaluated proprietary
# models, and a placeholder heading for the full results.
import json
import math
import random
import time

import numpy as np
import pandas as pd
import streamlit as st

# define some constants
# NOTE(review): none of the constants below except N_MODELS and N_ORGS are
# read in this script; they are kept in case other parts of the project
# depend on them -- confirm before removing.
CODE_LLM = "Codex"
DEFAULT_FIRST_EXAMPLE_IDX = 47
MAX_STAGE = 5
DEFAULT_TOP_K_EXAMPLES = 10
DATASET_NAMES = ["Spider", "WikiTQ", "GSM8k", "MBPP"]
RESULT_FILES_DICTS = {
    "Spider": "demo-spider-codex-results.jsonl",
    "WikiTQ": "demo-wikitq-codex-results.jsonl",
    "GSM8k": "demo-gsm8k-codex-results.jsonl",
    "MBPP": "demo-mbpp-codex-results.jsonl",
}

# Headline numbers interpolated into the "Models" section text.
N_MODELS = 54
N_ORGS = 13

#################### Setups must go first ####################
# Page configuration is issued before any other Streamlit command.
st.set_page_config(layout="wide")

#################### Side Bar ####################
with st.sidebar:
    st.markdown("# About")
    st.markdown("**L2CEval** is a framework for evaluating Language-to-Code generation for LLMs.")

    # st.info("**Site under construction**")
    st.warning("**Interactive visualizer (coming soon!)**")
    st.warning("**Model output explorer (coming soon!)**")

    # with st.expander(":blue[**Authors**]", expanded=False):
    #     st.markdown("**Ansong Ni$^†$, Pengcheng Yin$^♣$, Yilun Zhao$^†$, Martin Riddell$^†$, Troy Feng$^†$, Rui Shen$^†$, Stephen Yin$^†$, Ye Liu$^♢$, Semih Yavuz$^♢$, " \
    #                 "Caiming Xiong$^♢$, Shafiq Joty$^♢$, Yingbo Zhou$^♢$, Dragomir Radev$^†$, Arman Cohan$^†‡$**")
    #     st.markdown("**†: Yale University, ♣: Google DeepMind, ♢: Salesforce Research, ‡: Allen Institute for AI**")
    # st.markdown("**Authors**: Ansong Ni, Srini Iyer, Dragomir Radev, Ves Stoyanov, Wen-tau Yih, Sida I. Wang*, Xi Victoria Lin*")
    # st.markdown("**Demo made by**: [Ansong Ni](https://niansong1996.github.io/)")
    # st.markdown("**All experiment code on [GitHub](https://github.com/niansong1996/lever)**")

#################### START OF DEMO ####################

# some basic intro
# NOTE(review): `use_column_width` is reported as deprecated in recent
# Streamlit releases -- confirm the deployed Streamlit version before
# migrating to its replacement.
st.image("images/l2ceval-logo.png", use_column_width="auto")
st.markdown("### L2CEval: Evaluating Language-to-Code Generation Capabilities of Large Language Models")
st.markdown("**Ansong Ni$^†$, Pengcheng Yin$^♣$, Yilun Zhao$^†$, Martin Riddell$^†$, Troy Feng$^†$, Rui Shen$^†$, Stephen Yin$^†$**")
st.markdown(
    "**Ye Liu$^♢$, Semih Yavuz$^♢$, "
    "Caiming Xiong$^♢$, Shafiq Joty$^♢$, Yingbo Zhou$^♢$, Dragomir Radev$^†$, Arman Cohan$^†‡$**"
)
st.markdown("†: Yale University, ♣: Google DeepMind, ♢: Salesforce Research, ‡: Allen Institute for AI")

st.warning(":orange[**Site under construction 🛠️... Stay tuned!**]")
st.divider()

# st.markdown("#### Abstract")
# st.markdown("""
# Recently, large language models (LLMs), especially those that are pretrained
# on code, have demonstrated strong capabilities in generating programs from
# natural language inputs in a few-shot or even zero-shot manner. Despite
# promising results, there is a notable lack of a comprehensive evaluation of
# these models' language-to-code generation capabilities. Existing studies often
# focus on specific tasks, model architectures, or learning paradigms, leading to
# a fragmented understanding of the overall landscape. In this work, we present
# L2CEval, a systematic evaluation of the language-to-code generation
# capabilities of LLMs on 7 tasks across the domain spectrum of semantic parsing,
# math reasoning and Python programming, analyzing the factors that potentially
# affect their performance, such as model size, pretraining data, instruction
# tuning, and different prompting methods. In addition to assessing model
# performance, we measure confidence calibration for the models and conduct human
# evaluations of the output programs. This enables us to identify and analyze the
# typical failure modes across various tasks and models. L2CEval offers a
# comprehensive understanding of the capabilities and limitations of LLMs in
# language-to-code generation. We also release the evaluation framework and all
# model outputs, hoping to lay the groundwork for further future research in this
# domain.
# """)

st.markdown("#### Language-to-Code (L2C) Generation")
st.markdown(
    "Language-to-Code (L2C) generation is a type of task that maps from natural language to code. It is "
    "the cornerstone of many applications in AI, such as 1) chatbots; 2) coding assistants; "
    "3) language interfaces for databases; 4) robotic control; etc"
)

st.image("images/pipeline.png", caption="Example of L2C tasks", use_column_width="auto")

st.divider()

st.markdown("#### L2CEval - Tasks")
# Implicit string concatenation keeps source indentation out of the rendered
# text (a backslash continuation inside the literal would embed it).
st.markdown(
    "We evaluate the L2C capabilities of LLMs on 7 tasks across the domain spectrum of *semantic parsing*, "
    "*math reasoning* and *Python programming*:"
)
# Each table row must sit on its own line for the Markdown table to render.
st.markdown("""
| Domain | Dataset | Split | Size | Input | Output |
|--------------------|---------|-------|-------|-------|--------|
| Semantic Parsing | [Spider (Yu et al., 2018)](https://yale-lily.github.io/spider) | Dev | 1,000 | DB schema + NL | SQL Query |
| | [WikiTQ (Pasupat and Liang, 2015)](https://ppasupat.github.io/WikiTableQuestions) | Dev | 2,828 | Table headers + NL | SQL Query |
| Math Reasoning | [GSM8k (Cobbe et al., 2021)](https://github.com/openai/grade-school-math) | All | 1,494 | Math problem in NL | Python solution |
| | [SVAMP (Patel et al., 2021)](https://github.com/arkilpatel/SVAMP) | All | 996 | Math problem in NL | Python solution |
| Python Programming | [MBPP (Austin et al., 2021)](https://github.com/google-research/google-research/blob/master/mbpp/README.md) | Test | 500 | NL spec. + 1 test | Python function |
| | [HumanEval (Chen et al., 2021)](https://github.com/openai/human-eval) | All | 164 | NL spec. + 1-3 test | Python function |
| | [DS-1000 (Lai et al., 2022)](https://github.com/HKUNLP/DS-1000) | All | 1000 | NL spec. | Python lines |
""")

st.divider()

st.markdown("#### L2CEval - Models")
st.markdown(f"We evaluate {N_MODELS} models from {N_ORGS} organizations. Here is a summary of the *open-source* models we evaluated:")
st.markdown("""
| Organization | Model Name           | Release Time | Sizes        | # All Tokens | # Code Tokens | Ctx. Leng. | Code Specific | Inst. Tuned |
|--------------|----------------------|--------------|--------------|--------------|---------------|------------|---------------|-------------|
| Salesforce   | CodeGen-multi        | 2022-3       | 6.1/16.1B    | 505B         | 119B          | 2,048      | ✓             | ✗           |
| Salesforce   | CodeGen-mono         | 2022-3       | 6.1/16.1B    | 577B         | 191B          | 2,048      | ✓             | ✗           |
| Salesforce   | CodeGen-2.5-multi    | 2023-7       | 7B           | 1.4T         | 1.4T          | 2,048      | ✓             | ✗           |
| Salesforce   | CodeGen-2.5-mono     | 2023-7       | 7B           | -            | -             | 2,048      | ✓             | ✗           |
| Salesforce   | CodeGen-2.5-instruct | 2023-7       | 7B           | -            | -             | 2,048      | ✓             | ✓           |
| Eleuther AI  | GPT-J                | 2021-5       | 6.1B         | 402B         | 46B           | 2,048      | ✗             | ✗           |
| Eleuther AI  | GPT-NeoX             | 2022-4       | 20.6B        | 472B         | 54B           | 2,048      | ✗             | ✗           |
| Eleuther AI  | Pythia               | 2023-4       | 1.4/6.9/12B  | 300B         | 35B           | 2,048      | ✗             | ✗           |
| Databricks   | Dolly-v2             | 2023-4       | 6.9/12B      | -            | -             | 2,048      | ✗             | ✓           |
| BigCode      | SantaCoder           | 2023-1       | 1.1B         | 236B         | 236B          | 2,048      | ✓             | ✗           |
| BigCode      | StarCoder            | 2023-5       | 15.5B        | 1T           | 1T            | 8,192      | ✓             | ✗           |
| BigCode      | StarCoderPlus        | 2023-6       | 15.5B        | 1.6T         | 1T            | 8,192      | ✓             | ✗           |
| Meta AI      | InCoder              | 2022-4       | 1.3/6.7B     | 52B          | 52B           | 2,048      | ✓             | ✗           |
| Meta AI      | LLaMA                | 2023-2       | 6.7/13B      | 1T           | 45B           | 2,048      | ✗             | ✗           |
| Meta AI      | LLaMA-30B            | 2023-2       | 32.5B        | 1.4T         | 63B           | 2,048      | ✗             | ✗           |
| Meta AI      | LLaMA-2              | 2023-7       | 7/13/70B     | 2T           | -             | 4,096      | ✗             | ✗           |
| Meta AI      | CodeLLaMA            | 2023-7       | 7/13/34B     | 2.5T         | 435B          | 16,384     | ✓             | ✗           |
| Stanford     | Alpaca               | 2023-3       | 6.7/13/32.5B | -            | -             | 2,048      | ✗             | ✓           |
| LMSYS        | Vicuna               | 2023-3       | 6.7/13/32.5B | -            | -             | 2,048      | ✗             | ✗           |
| Replit       | Replit-code-v1-3b    | 2023-5       | 2.7B         | 525B         | 525B          | 2,048      | ✓             | ✗           |
| MosaicML     | MPT-7B               | 2023-5       | 7B           | 1T           | 135B          | 2,048      | ✗             | ✗           |
| MosaicML     | MPT-7B-instruct      | 2023-5       | 7B           | -            | -             | 2,048      | ✗             | ✓           |
| MosaicML     | MPT-30B              | 2023-6       | 30B          | 1T           | 135B          | 8,192      | ✗             | ✗           |
| MosaicML     | MPT-30B-instruct     | 2023-6       | 30B          | -            | -             | 8,192      | ✗             | ✓           |
""")

st.markdown("\n\n\n\n")

st.markdown("In addition, we also evaluated the following *proprietary* models:")
st.markdown("""
- OpenAI GPT-4
- OpenAI GPT-3.5-turbo
- OpenAI text-davinci-002
- OpenAI text-davinci-003
- OpenAI code-davinci-002
- OpenAI code-cushman-001
""")

st.divider()

# read results from csv
# results = pd.read_csv("data/scatter.csv")
# st.info(results.to_markdown())
# st.info(results.columns)
# st.info(results.dtypes)
# st.scatter_chart(
#     results,
#     x="Model Size",
#     y="Avg. Perf.",
#     color='Model Series',
#     # size='Avg. Perf.',
# )

# chart_data = pd.DataFrame(np.random.randn(20, 3), columns=["col1", "col2", "col3"])
# chart_data['col4'] = np.random.choice(['A','B','C'], 20)
# st.info(chart_data.to_markdown())
# st.info(chart_data.dtypes)
# st.scatter_chart(
#     chart_data,
#     x='col1',
#     y='col2',
#     color='col4',
#     size='col3',
# )

st.markdown("#### All Results (coming soon!)")
# st.image("images/all_results.png", use_column_width="auto")