winglian committed on
Commit
d88615f
β€’
1 Parent(s): fd00027

elo calculations and update arena metadata

Browse files
Files changed (2) hide show
  1. README.md +8 -2
  2. calculate_elo.py +276 -0
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Rlhf Arena
3
  emoji: πŸƒ
4
  colorFrom: yellow
5
  colorTo: blue
@@ -10,4 +10,10 @@ pinned: false
10
  license: apache-2.0
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
1
  ---
2
+ title: Community ChatBot Arena
3
  emoji: πŸƒ
4
  colorFrom: yellow
5
  colorTo: blue
 
10
  license: apache-2.0
11
  ---
12
 
13
+ # OpenAccess AI Collective Community ChatBot Arena
14
+
15
+ - Arena: https://huggingface.co/spaces/openaccess-ai-collective/rlhf-arena
16
+ - GitHub: https://github.com/OpenAccess-AI-Collective/rlhf-arena
17
+ - Built using Runpod Serverless. See our writeup here: https://medium.com/@winglian/inference-any-llm-with-serverless-in-15-minutes-69eeb548a41d
18
+ - Want to have your language model added to the Arena? [Create an Issue](https://github.com/OpenAccess-AI-Collective/rlhf-arena/issues) or reach out on [Discord](https://discord.gg/PugNNHAF5r)
19
+ - [💵 Consider Donating on our Patreon](http://patreon.com/OpenAccessAICollective)
calculate_elo.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from datetime import datetime
4
+ from decimal import Decimal
5
+
6
+ import boto3
7
+ from boto3.dynamodb.conditions import Attr
8
+
9
# Logging verbosity is taken from the LOG_LEVEL environment variable and
# falls back to INFO when it is unset.
logging.basicConfig(level=os.environ.get("LOG_LEVEL", "INFO"))

# Shared boto3 DynamoDB service resource (not a low-level client), pinned to
# us-east-1. The table helpers below all go through this object.
dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
13
+
14
+
15
def _create_arena_table():
    """Create the ``oaaic_chatbot_arena`` DynamoDB table.

    The table uses ``arena_battle_id`` (string) as its hash key and defines a
    ``TimestampIndex`` global secondary index keyed on
    ``(arena_battle_id HASH, timestamp RANGE)`` that projects all attributes.
    Table and index are each provisioned with 5 read / 5 write capacity units.

    Setup helper only: ``main()`` does not call it.
    """
    # The returned Table handle is not needed here, so it is not bound to a
    # local (the original kept it in an unused variable).
    dynamodb.create_table(
        TableName='oaaic_chatbot_arena',
        KeySchema=[
            {'AttributeName': 'arena_battle_id', 'KeyType': 'HASH'},
        ],
        # Every attribute used in a key schema (table or index) must be
        # declared here.
        AttributeDefinitions=[
            {'AttributeName': 'arena_battle_id', 'AttributeType': 'S'},
            {'AttributeName': 'timestamp', 'AttributeType': 'S'},
        ],
        ProvisionedThroughput={
            'ReadCapacityUnits': 5,
            'WriteCapacityUnits': 5,
        },
        GlobalSecondaryIndexes=[
            {
                'IndexName': 'TimestampIndex',
                'KeySchema': [
                    {'AttributeName': 'arena_battle_id', 'KeyType': 'HASH'},
                    {'AttributeName': 'timestamp', 'KeyType': 'RANGE'},
                ],
                'Projection': {
                    'ProjectionType': 'ALL',
                },
                'ProvisionedThroughput': {
                    'ReadCapacityUnits': 5,
                    'WriteCapacityUnits': 5,
                },
            },
        ],
    )
61
+
62
def _create_elo_scores_table():
    """Create the ``elo_scores`` DynamoDB table.

    One item per chatbot, keyed by the string attribute ``chatbot_name``.
    Provisioned with 5 read / 5 write capacity units.
    """
    # Partition key: one row per chatbot.
    key_schema = [{'AttributeName': 'chatbot_name', 'KeyType': 'HASH'}]
    attribute_definitions = [{'AttributeName': 'chatbot_name', 'AttributeType': 'S'}]
    throughput = {'ReadCapacityUnits': 5, 'WriteCapacityUnits': 5}

    dynamodb.create_table(
        TableName='elo_scores',
        KeySchema=key_schema,
        AttributeDefinitions=attribute_definitions,
        ProvisionedThroughput=throughput,
    )
82
+
83
+
84
def _create_elo_logs_table():
    """Create the ``elo_logs`` DynamoDB table.

    Items are keyed by ``arena_battle_id`` (string hash key). A
    ``BattleTimestampIndex`` global secondary index, hashed on
    ``battle_timestamp`` and projecting all attributes, is created alongside.
    Table and index are each provisioned with 10 read / 10 write capacity
    units.
    """
    # Partition key of the base table.
    table_keys = [{'AttributeName': 'arena_battle_id', 'KeyType': 'HASH'}]

    # Both key attributes (table and index) are strings.
    attribute_definitions = [
        {'AttributeName': 'arena_battle_id', 'AttributeType': 'S'},
        {'AttributeName': 'battle_timestamp', 'AttributeType': 'S'},
    ]

    battle_timestamp_index = {
        'IndexName': 'BattleTimestampIndex',
        # Partition key for the GSI.
        'KeySchema': [{'AttributeName': 'battle_timestamp', 'KeyType': 'HASH'}],
        'Projection': {'ProjectionType': 'ALL'},
        'ProvisionedThroughput': {'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10},
    }

    dynamodb.create_table(
        TableName='elo_logs',
        KeySchema=table_keys,
        AttributeDefinitions=attribute_definitions,
        ProvisionedThroughput={'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10},
        GlobalSecondaryIndexes=[battle_timestamp_index],
    )
126
+
127
+
128
def get_unprocessed_battles(last_processed_timestamp):
    """Return arena battles newer than *last_processed_timestamp*, oldest first.

    Args:
        last_processed_timestamp: Timestamp string; only battles whose
            ``timestamp`` attribute compares greater than it are returned.

    Returns:
        A list of battle items (dicts) from the ``oaaic_chatbot_arena`` table,
        sorted ascending by their ``timestamp`` attribute.
    """
    table = dynamodb.Table('oaaic_chatbot_arena')

    scan_kwargs = {
        'FilterExpression': Attr('timestamp').gt(last_processed_timestamp),
    }

    # A single Scan call returns at most one page of results, and the filter
    # is applied after the page is read, so keep following LastEvaluatedKey
    # until the whole table has been walked.
    battles = []
    while True:
        response = table.scan(**scan_kwargs)
        battles.extend(response.get('Items', []))
        last_key = response.get('LastEvaluatedKey')
        if not last_key:
            break
        scan_kwargs['ExclusiveStartKey'] = last_key

    # Scan gives no ordering guarantee and Elo updates depend on the order in
    # which battles are applied, so sort explicitly. Every item that passed
    # the filter has a 'timestamp' attribute.
    # NOTE(review): assumes timestamps are strings that sort chronologically
    # (e.g. ISO-8601) -- confirm against the writer of this table.
    battles.sort(key=lambda battle: battle['timestamp'])
    return battles
139
+
140
+
141
def calculate_elo(rating1, rating2, result, K=32):
    """Apply one Elo update for a single match between two players.

    Args:
        rating1: Current rating of player 1 (anything ``float()`` accepts).
        rating2: Current rating of player 2 (anything ``float()`` accepts).
        result: Score of player 1: 1 for a win, 0.5 for a draw, 0 for a loss.
        K: K-factor, i.e. the maximum number of points a match can move a
            rating. Defaults to 32.

    Returns:
        ``(new_rating1, new_rating2)`` as ``Decimal`` values quantized to two
        decimal places.
    """
    two_places = Decimal('0.00')

    # Work in floats; the inputs may be ints or Decimals.
    r1 = float(rating1)
    r2 = float(rating2)

    # Expected score of each player under the logistic Elo model
    # (400-point scale); the two expectations sum to 1.
    expected1 = 1.0 / (1.0 + 10.0 ** ((r2 - r1) / 400.0))
    expected2 = 1.0 - expected1

    # Move each rating by K times (actual score - expected score).
    updated1 = r1 + K * (result - expected1)
    updated2 = r2 + K * ((1.0 - result) - expected2)

    return Decimal(updated1).quantize(two_places), Decimal(updated2).quantize(two_places)
155
+
156
+
157
def get_last_processed_timestamp():
    """Return the newest ``battle_timestamp`` recorded in ``elo_logs``.

    Returns:
        The greatest ``battle_timestamp`` value found in the table, or
        ``'1970-01-01T00:00:00'`` when the table holds no logged battles.
    """
    table = dynamodb.Table('elo_logs')

    # Scan cannot return items in sorted order (ScanIndexForward is a
    # Query-only parameter) and Limit=1 would just yield an arbitrary item,
    # so walk every page and keep the maximum. Only the one attribute we
    # need is fetched.
    scan_kwargs = {'ProjectionExpression': 'battle_timestamp'}

    latest = None
    while True:
        response = table.scan(**scan_kwargs)
        for item in response.get('Items', []):
            battle_timestamp = item.get('battle_timestamp')
            if battle_timestamp is None:
                continue
            # NOTE(review): assumes timestamps are strings that sort
            # chronologically (e.g. ISO-8601) -- confirm against the writer.
            if latest is None or battle_timestamp > latest:
                latest = battle_timestamp
        last_key = response.get('LastEvaluatedKey')
        if not last_key:
            break
        scan_kwargs['ExclusiveStartKey'] = last_key

    # If nothing has been logged yet, return a default timestamp.
    if latest is None:
        return '1970-01-01T00:00:00'

    return latest
197
+
198
+
199
def log_elo_update(arena_battle_id, battle_timestamp, new_rating1, new_rating2):
    """Write one processed-battle record to the ``elo_logs`` table.

    Args:
        arena_battle_id: Id of the battle; stored under ``arena_battle_id``.
        battle_timestamp: Timestamp of the battle itself.
        new_rating1: Rating of the first chatbot after the battle.
        new_rating2: Rating of the second chatbot after the battle.
    """
    elo_logs = dynamodb.Table('elo_logs')

    entry = {
        'arena_battle_id': arena_battle_id,
        # When the battle happened.
        'battle_timestamp': battle_timestamp,
        # When this log row was written (naive local time, ISO format).
        'log_timestamp': datetime.now().isoformat(),
        'new_rating1': new_rating1,
        'new_rating2': new_rating2,
    }

    # put_item replaces any existing row with the same arena_battle_id.
    elo_logs.put_item(Item=entry)
213
+
214
+
215
def get_elo_score(chatbot_name, elo_scores):
    """Look up a chatbot's current Elo rating.

    Args:
        chatbot_name: Name of the chatbot to look up.
        elo_scores: In-memory mapping of chatbot name to rating; consulted
            first so known chatbots never hit DynamoDB.

    Returns:
        The cached rating if present; otherwise the ``elo_score`` stored in
        the ``elo_scores`` table; otherwise the default rating 1500.
    """
    if chatbot_name not in elo_scores:
        # Cache miss: fall back to the persisted score.
        scores_table = dynamodb.Table('elo_scores')
        response = scores_table.get_item(Key={'chatbot_name': chatbot_name})

        if 'Item' in response:
            return response['Item']['elo_score']

        # Chatbot has never been rated: start from the default.
        return 1500

    return elo_scores[chatbot_name]
227
+
228
+
229
def update_elo_score(chatbot_name, new_elo_score):
    """Persist a chatbot's rating in the ``elo_scores`` table.

    Args:
        chatbot_name: Name of the chatbot; the table's key.
        new_elo_score: Rating to store. It is converted through ``str`` to a
            ``Decimal`` before being written.
    """
    record = {
        'chatbot_name': chatbot_name,
        'elo_score': Decimal(str(new_elo_score)),
    }

    # put_item creates the row when it does not exist yet and overwrites it
    # otherwise.
    dynamodb.Table('elo_scores').put_item(Item=record)
239
+
240
+
241
def main():
    """Replay recorded arena battles and persist the resulting Elo ratings.

    For every battle whose ``label`` is 0, 1 or 2, the ratings of the two
    chatbots are updated with ``calculate_elo``, the update is written to the
    log table via ``log_elo_update`` and both new ratings are stored via
    ``update_elo_score``. Battles with any other label are skipped.
    """
    # NOTE(review): the cursor is pinned to the epoch rather than read from
    # get_last_processed_timestamp(), so every run replays all battles, and
    # it does so starting from whatever ratings are already stored in
    # elo_scores. Confirm whether re-running is meant to be cumulative.
    last_processed_timestamp = '1970-01-01T00:00:00'
    battles = get_unprocessed_battles(last_processed_timestamp)

    # Battle label -> Elo result from choice1's point of view:
    #   label 0 -> 0.5 (draw), label 1 -> 1 (choice1 won), label 2 -> 0 (choice1 lost).
    elo_results = {0: 0.5, 1: 1, 2: 0}

    # Per-run cache of current ratings, so each chatbot is read from DynamoDB
    # at most once.
    elo_scores = {}

    for battle in battles:
        label = battle['label']
        if label not in elo_results:
            continue

        name1 = battle['choice1_name']
        name2 = battle['choice2_name']
        for chatbot_name in (name1, name2):
            if chatbot_name not in elo_scores:
                elo_scores[chatbot_name] = get_elo_score(chatbot_name, elo_scores)

        # Capture the pre-battle ratings before the cache is overwritten so
        # the log line below really shows old -> new.
        old_rating1 = elo_scores[name1]
        old_rating2 = elo_scores[name2]
        new_rating1, new_rating2 = calculate_elo(old_rating1, old_rating2, elo_results[label])

        log_elo_update(battle['arena_battle_id'], battle['timestamp'], new_rating1, new_rating2)
        logging.info(
            "%s: %s -> %s | %s: %s -> %s",
            name1, old_rating1, new_rating1,
            name2, old_rating2, new_rating2,
        )
        update_elo_score(name1, new_rating1)
        update_elo_score(name2, new_rating2)
        elo_scores[name1] = new_rating1
        elo_scores[name2] = new_rating2


if __name__ == "__main__":
    main()