Challenges in reproducing the HumanEval scores reported in the paper with BigCode's Eval Harness
Would appreciate your guidance in reproducing the HumanEval scores reported in the paper. Using BigCode's Eval Harness, we got
"humaneval": {
"pass@1": 5.000000000000004e-05,
"pass@10": 0.0005000000000000004,
"pass@100": 0.005
}
(params used: --limit=100 --max_length_generation=256 --temperature=0.2 --n_samples=200 --batch_size=200)
Here is an input from HumanEval followed by the code generated by the model. (The generated code starts immediately after the closing triple quotes of the docstring.)
from typing import List
def has_close_elements(numbers: List[float], threshold: float) -> bool:
""" Check if in given list of numbers, are any two numbers closer to each other than
given threshold.
>>> has_close_elements([1.0, 2.0, 3.0], 0.5)
False
>>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
True
"""r agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.client.impl.protocol.task.map;
import com.hazelcast.client.impl.protocol.ClientMessage;
import com.hazelcast.client.impl.protocol.codec.MapMessageCodec;
import com.hazelcast.client.impl.protocol.task.AbstractMessageTask;
import com.hazelcast.client
Additional examples:
from typing import List
def separate_paren_groups(paren_string: str) -> List[str]:
""" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
separate those group into separate strings and return the list of those.
Separate groups are balanced (each open brace is properly closed) and not nested within each other
Ignore any spaces in the input string.
>>> separate_paren_groups('( ) (( )) (( )( ))')
['()', '(())', '(()())']
"""express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.internal.serialization;
import com.hazelcast.internal.serialization.impl.AbstractSerializationService;
import com.hazelcast.internal.serialization.impl.SerializationService;
import com.hazelcast.internal.serialization.impl.SerializationServiceImpl;
import com.hazelcast.internal.serialization.impl.
def truncate_number(number: float) -> float:
""" Given a positive floating point number, it can be decomposed into
and integer part (largest integer smaller than given number) and decimals
(leftover part always smaller than 1).
Return the decimal part of the number.
>>> truncate_number(3.5)
0.5
"""iew;
import android.widget.Button;
import android.widget.EditText;
import android.widget.TextView;
import android.widget.Toast;
import com.example.android.sunshine.R;
import com.example.android.sunshine.activity.MainActivity;
import com.example.android.sunshine.activity.MainActivity.MainActivityListener;
import com.example.android.sunshine.activity.MainActivity.MainActivityListener.MainActivityListener;
import com.example.android.sunshine.activity.MainActivity.MainActivityListener.MainActivityListener.MainActivityListener.MainActivityListener.MainActivityListener.MainActivityListener.MainActivityListener.MainActivityListener.MainActivityListener.
from typing import List
def below_zero(operations: List[int]) -> bool:
""" You're given a list of deposit and withdrawal operations on a bank account that starts with
zero balance. Your task is to detect if at any point the balance of account fallls below zero, and
at that point function should return True. Otherwise it should return False.
>>> below_zero([1, 2, 3])
False
>>> below_zero([1, 2, -4, 5])
True
"""ndroid.sunshine.model.User.UserType;
import com.example.android.sunshine.model.User.UserType.UserTypeType;
public class MainActivity extends AppCompatActivity {
@Override
protected void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
setContentView(R.layout.activity_main);
Button button = (Button) findViewById(R.id.button);
button.setOnClickListener(new View.OnClickListener() {
from typing import List
def mean_absolute_deviation(numbers: List[float]) -> float:
""" For a given list of input numbers, calculate Mean Absolute Deviation
around the mean of this dataset.
Mean Absolute Deviation is the average absolute difference between each
element and a centerpoint (mean in this case):
MAD = average | x - x_mean |
>>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])
1.0
"""AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.example.android.util;
import android.content.Context;
import android.graphics.Bitmap;
import android.graphics.BitmapFactory;
import android.graphics.Canvas;
import android.graphics.Color;
import android.graphics.Paint;
import android.graphics.Rect;
import android.graphics.drawable.Drawable;
import android.graphics.drawable.
from typing import List
def intersperse(numbers: List[int], delimeter: int) -> List[int]:
""" Insert a number 'delimeter' between every two consecutive elements of input list `numbers'
>>> intersperse([], 4)
[]
>>> intersperse([1, 2, 3], 4)
[1, 4, 2, 4, 3]
"""icenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.internal.serialization;
import com.hazelcast.internal.serialization.impl.AbstractSerializationService;
import com.hazelcast.internal.serialization.impl.DefaultSerializationService;
import com.hazelcast.internal.serialization.impl.DefaultSerializationServiceBuilder;
import com.hazelcast.internal.serialization.
from typing import List
def parse_nested_parens(paren_string: str) -> List[int]:
""" Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
For each of the group, output the deepest level of nesting of parentheses.
E.g. (()()) has maximum two levels of nesting while ((())) has three.
>>> parse_nested_parens('(()()) ((())) () ((())()())')
[2, 3, 1, 3]
"""AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.internal.serialization;
import com.hazelcast.internal.serialization.impl.DefaultSerializationService;
import com.hazelcast.internal.serialization.impl.DefaultSerializationServiceFactory;
import com.hazelcast.internal.serialization.impl.DefaultSerializationServiceFactoryBuilder;
import com.hazelcast.internal.serial
from typing import List
def filter_by_substring(strings: List[str], substring: str) -> List[str]:
""" Filter an input list of strings only for ones that contain given substring
>>> filter_by_substring([], 'a')
[]
>>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')
['abc', 'bacd', 'array']
"""y applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.internal.partition;
import com.hazelcast.internal.partition.impl.PartitionServiceImpl;
import com.hazelcast.internal.partition.impl.PartitionServicePartitionImpl;
import com.hazelcast.internal.partition.impl.PartitionServicePartitionService;
import com.hazelcast.internal.partition.impl.PartitionService
Note that the results above are the first we got (no cherry-picking).
Thanks!
Hi there, please refer to our released scripts (see here) to reproduce the HumanEval results.
Hi @yuewang-sf, thanks for the link.
Did you save the model's raw completions you compared with OpenAI's ground truth?
Would appreciate your guidance in reproducing the HumanEval scores reported in the paper. Using BigCode's Eval Harness, we got
"humaneval": { "pass@1": 5.000000000000004e-05, "pass@10": 0.0005000000000000004, "pass@100": 0.005 }
(params used: --limit=100 --max_length_generation=256 --temperature=0.2 --n_samples=200 --batch_size=200)
Using the same codebase and execution parameters, Salesforce/codegen-350M-mono scored:
{
"humaneval": {
"pass@1": 0.21515,
"pass@10": 0.3201372339273634,
"pass@100": 0.3812360762120558
}
}
Hi Nadav, please refer to our response to this issue at our CodeT5 repo. We do save the model's raw completions and can release it soon.
Thanks @yuewang-sf !
Do you happen to know if Hugging Face's Inference API is compatible with your method? In my attempt below, I got the same kind of output as I described in my first message.
import requests
API_URL = "https://api-inference.huggingface.co/models/Salesforce/codet5p-220m"
headers = {"Authorization": "Bearer xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}
def query(payload):
response = requests.post(API_URL, headers=headers, json=payload)
return response.json()
humaneval_input_1 = """from typing import List
def has_close_elements(numbers: List[float], threshold: float) -> bool:
\"\"\" Check if in given list of numbers, are any two numbers closer to each other than
given threshold.
>>> has_close_elements([1.0, 2.0, 3.0], 0.5)
False
>>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
True
\"\"\""""
output = query({
"inputs": humaneval_input_1,
"parameters": {
"temperature": 0.2,
"max_length": 256,
},
"options": {
"wait_for_model": True
}
})
Then, print(output[0]["generated_text"]) prints:
/*
* Copyright (c) 2008-2021, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.internal.serialization;
import com.hazelcast.internal.serialization.impl.AbstractSerializationService;
import com.hazelcast.internal.serialization.impl.DefaultSerializationService;
import com.hazelcast.internal.serialization.impl.DefaultSerializationServiceFactory;
import com.hazelcast.internal.serialization.
Hi Nadav, please refer to our response to this issue at our CodeT5 repo. We do save the model's raw completions and can release it soon.
Our issue is with Salesforce/codet5p-220m and Salesforce/codet5p-220m-py. We'd appreciate it if you could share the raw generations. Thank you!