Spaces:
Sleeping
Sleeping
openreviewer
committed on
Commit
•
38a86d9
1
Parent(s):
a5df6bb
Upload folder using huggingface_hub
Browse files- app.py +23 -20
- aws_utils.py +91 -30
app.py
CHANGED
@@ -30,7 +30,7 @@ use_real_api = False
|
|
30 |
|
31 |
# Function to generate a paper_id using SHA-512 hash
|
32 |
def generate_paper_id(paper_content):
|
33 |
-
return hashlib.sha512(paper_content).hexdigest()
|
34 |
|
35 |
# Function to get user IP address
|
36 |
def get_user_ip():
|
@@ -165,6 +165,9 @@ def setup_interface():
|
|
165 |
}
|
166 |
"""
|
167 |
with gr.Blocks(css=css) as demo:
|
|
|
|
|
|
|
168 |
with gr.Tabs():
|
169 |
with gr.TabItem("Reviewer Arena"):
|
170 |
gr.Markdown("## Reviewer Arena")
|
@@ -188,13 +191,13 @@ def setup_interface():
|
|
188 |
|
189 |
model_identity_message = gr.HTML("", visible=False)
|
190 |
|
191 |
-
def handle_vote_interface(vote,
|
192 |
-
return handle_vote(vote,
|
193 |
|
194 |
submit_button.click(fn=review_papers, inputs=[file_input],
|
195 |
-
outputs=[review1, review2, vote, vote_button,
|
196 |
|
197 |
-
vote_button.click(fn=handle_vote_interface, inputs=[vote,
|
198 |
outputs=[vote_message, vote, vote_button, another_paper_button])
|
199 |
|
200 |
another_paper_button.click(fn=lambda: None, inputs=None, outputs=None, js="() => { location.reload(); }")
|
@@ -204,7 +207,7 @@ def setup_interface():
|
|
204 |
|
205 |
# Fetch the leaderboard data from the database
|
206 |
leaderboard_data = get_leaderboard()
|
207 |
-
print(leaderboard_data)
|
208 |
|
209 |
# Create the leaderboard HTML dynamically
|
210 |
leaderboard_html = """
|
@@ -224,19 +227,19 @@ def setup_interface():
|
|
224 |
<tbody>
|
225 |
"""
|
226 |
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
|
241 |
leaderboard_html += """
|
242 |
</tbody>
|
@@ -253,4 +256,4 @@ def setup_interface():
|
|
253 |
if __name__ == "__main__":
|
254 |
logging.basicConfig(level=logging.INFO)
|
255 |
demo = setup_interface()
|
256 |
-
demo.launch()
|
|
|
30 |
|
31 |
# Function to generate a paper_id using SHA-512 hash
|
32 |
def generate_paper_id(paper_content):
    """Return the SHA-512 hex digest that identifies a paper.

    Args:
        paper_content: The paper's content, either as ``str`` (encoded to
            UTF-8 before hashing) or as ``bytes``/``bytearray`` (hashed as-is).

    Returns:
        A 128-character lowercase hexadecimal string.
    """
    # An earlier revision of this function hashed raw bytes; accepting both
    # text and bytes keeps either kind of caller working instead of failing
    # with AttributeError on ``bytes.encode``.
    if isinstance(paper_content, str):
        paper_content = paper_content.encode('utf-8')
    return hashlib.sha512(paper_content).hexdigest()
|
34 |
|
35 |
# Function to get user IP address
|
36 |
def get_user_ip():
|
|
|
165 |
}
|
166 |
"""
|
167 |
with gr.Blocks(css=css) as demo:
|
168 |
+
paper_content_state = gr.State()
|
169 |
+
model_a_state = gr.State()
|
170 |
+
model_b_state = gr.State()
|
171 |
with gr.Tabs():
|
172 |
with gr.TabItem("Reviewer Arena"):
|
173 |
gr.Markdown("## Reviewer Arena")
|
|
|
191 |
|
192 |
model_identity_message = gr.HTML("", visible=False)
|
193 |
|
194 |
+
def handle_vote_interface(vote, model_a, model_b, paper_content):
|
195 |
+
return handle_vote(vote, model_a, model_b, paper_content)
|
196 |
|
197 |
submit_button.click(fn=review_papers, inputs=[file_input],
|
198 |
+
outputs=[review1, review2, vote, vote_button, model_a_state, model_b_state, paper_content_state])
|
199 |
|
200 |
+
vote_button.click(fn=handle_vote_interface, inputs=[vote, model_a_state, model_b_state, paper_content_state],
|
201 |
outputs=[vote_message, vote, vote_button, another_paper_button])
|
202 |
|
203 |
another_paper_button.click(fn=lambda: None, inputs=None, outputs=None, js="() => { location.reload(); }")
|
|
|
207 |
|
208 |
# Fetch the leaderboard data from the database
|
209 |
leaderboard_data = get_leaderboard()
|
210 |
+
# print(leaderboard_data)
|
211 |
|
212 |
# Create the leaderboard HTML dynamically
|
213 |
leaderboard_html = """
|
|
|
227 |
<tbody>
|
228 |
"""
|
229 |
|
230 |
+
for rank, model in enumerate(leaderboard_data, start=1):
|
231 |
+
leaderboard_html += f"""
|
232 |
+
<tr style="border: 1px solid #444; padding: 12px;">
|
233 |
+
<td style="border: 1px solid #444; padding: 12px; color: #ddd;">{rank}</td>
|
234 |
+
<td style="border: 1px solid #444; padding: 12px; color: #ddd;">{model['ModelID']}</td>
|
235 |
+
<td style="border: 1px solid #444; padding: 12px; color: #ddd;">{model['EloScore']}</td>
|
236 |
+
<td style="border: 1px solid #444; padding: 12px; color: #ddd;">{model['CI_Lower']} - {model['CI_Upper']}</td>
|
237 |
+
<td style="border: 1px solid #444; padding: 12px; color: #ddd;">{model['Votes']}</td>
|
238 |
+
<td style="border: 1px solid #444; padding: 12px; color: #ddd;">Organization</td>
|
239 |
+
<td style="border: 1px solid #444; padding: 12px; color: #ddd;">License</td>
|
240 |
+
<td style="border: 1px solid #444; padding: 12px; color: #ddd;">Knowledge Cutoff</td>
|
241 |
+
</tr>
|
242 |
+
"""
|
243 |
|
244 |
leaderboard_html += """
|
245 |
</tbody>
|
|
|
256 |
if __name__ == "__main__":
|
257 |
logging.basicConfig(level=logging.INFO)
|
258 |
demo = setup_interface()
|
259 |
+
demo.launch()
|
aws_utils.py
CHANGED
@@ -2,6 +2,7 @@ import boto3
|
|
2 |
import uuid
|
3 |
import datetime
|
4 |
import os
|
|
|
5 |
from dotenv import load_dotenv
|
6 |
|
7 |
try:
|
@@ -27,7 +28,7 @@ leaderboards_table = dynamodb.Table('reviewer_arena_leaderboard')
|
|
27 |
# Function to write a request to the Requests table
|
28 |
def write_request(user_id, paper_id, model_a, model_b, vote):
|
29 |
request_id = str(uuid.uuid4())
|
30 |
-
timestamp = datetime.datetime.now().
|
31 |
|
32 |
response = requests_table.put_item(
|
33 |
Item={
|
@@ -44,55 +45,115 @@ def write_request(user_id, paper_id, model_a, model_b, vote):
|
|
44 |
|
45 |
# Function to update leaderboard after a vote
|
46 |
def update_leaderboard(model_a, model_b, vote):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
# Retrieve current stats for ModelA and ModelB
|
48 |
model_a_stats = leaderboards_table.get_item(Key={'ModelID': model_a}).get('Item', {})
|
49 |
model_b_stats = leaderboards_table.get_item(Key={'ModelID': model_b}).get('Item', {})
|
50 |
|
51 |
# Initialize stats if they don't exist
|
52 |
if not model_a_stats:
|
53 |
-
model_a_stats = {'ModelID': model_a, 'Wins': 0, 'Losses': 0, 'Ties': 0, 'EloScore': 1200, 'Votes': 0}
|
|
|
54 |
if not model_b_stats:
|
55 |
-
model_b_stats = {'ModelID': model_b, 'Wins': 0, 'Losses': 0, 'Ties': 0, 'EloScore': 1200, 'Votes': 0}
|
|
|
56 |
|
57 |
# Update stats based on the vote
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
69 |
|
70 |
-
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
|
73 |
-
#
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
# Function to calculate new Elo scores
|
78 |
def calculate_elo(elo_a, elo_b, vote, k=32):
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
81 |
|
82 |
if vote == "A is better":
|
83 |
-
actual_a = 1
|
84 |
-
actual_b = 0
|
85 |
elif vote == "B is better":
|
86 |
-
actual_a = 0
|
87 |
-
actual_b = 1
|
88 |
else: # Tie
|
89 |
-
actual_a = 0.5
|
90 |
-
actual_b = 0.5
|
91 |
|
92 |
-
new_elo_a = elo_a + k * (actual_a - expected_a)
|
93 |
-
new_elo_b = elo_b + k * (actual_b - expected_b)
|
94 |
|
95 |
-
return round(new_elo_a), round(new_elo_b)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
# Function to query leaderboard
|
98 |
def get_leaderboard():
|
|
|
2 |
import uuid
|
3 |
import datetime
|
4 |
import os
|
5 |
+
from decimal import Decimal, getcontext
|
6 |
from dotenv import load_dotenv
|
7 |
|
8 |
try:
|
|
|
28 |
# Function to write a request to the Requests table
|
29 |
def write_request(user_id, paper_id, model_a, model_b, vote):
|
30 |
request_id = str(uuid.uuid4())
|
31 |
+
timestamp = str(Decimal(datetime.datetime.now().timestamp()))
|
32 |
|
33 |
response = requests_table.put_item(
|
34 |
Item={
|
|
|
45 |
|
46 |
# Function to update leaderboard after a vote
|
47 |
def update_leaderboard(model_a, model_b, vote):
|
48 |
+
# Map vote options to simpler keys
|
49 |
+
vote_mapping = {
|
50 |
+
"π A is better": "A is better",
|
51 |
+
"π B is better": "B is better",
|
52 |
+
"π Tie": "Tie",
|
53 |
+
"π Both are bad": "Tie" # Assuming "Both are bad" is treated as a tie
|
54 |
+
}
|
55 |
+
vote = vote_mapping.get(vote, "Tie") # Default to "Tie" if vote is not found
|
56 |
+
|
57 |
# Retrieve current stats for ModelA and ModelB
|
58 |
model_a_stats = leaderboards_table.get_item(Key={'ModelID': model_a}).get('Item', {})
|
59 |
model_b_stats = leaderboards_table.get_item(Key={'ModelID': model_b}).get('Item', {})
|
60 |
|
61 |
# Initialize stats if they don't exist
|
62 |
if not model_a_stats:
|
63 |
+
model_a_stats = {'ModelID': model_a, 'Wins': 0, 'Losses': 0, 'Ties': 0, 'EloScore': Decimal(1200), 'Votes': 0}
|
64 |
+
leaderboards_table.put_item(Item=model_a_stats)
|
65 |
if not model_b_stats:
|
66 |
+
model_b_stats = {'ModelID': model_b, 'Wins': 0, 'Losses': 0, 'Ties': 0, 'EloScore': Decimal(1200), 'Votes': 0}
|
67 |
+
leaderboards_table.put_item(Item=model_b_stats)
|
68 |
|
69 |
# Update stats based on the vote
|
70 |
+
update_expressions = {
|
71 |
+
"A is better": {
|
72 |
+
"model_a": "SET Wins = Wins + :inc, Votes = Votes + :inc",
|
73 |
+
"model_b": "SET Losses = Losses + :inc, Votes = Votes + :inc"
|
74 |
+
},
|
75 |
+
"B is better": {
|
76 |
+
"model_a": "SET Losses = Losses + :inc, Votes = Votes + :inc",
|
77 |
+
"model_b": "SET Wins = Wins + :inc, Votes = Votes + :inc"
|
78 |
+
},
|
79 |
+
"Tie": {
|
80 |
+
"model_a": "SET Ties = Ties + :inc, Votes = Votes + :inc",
|
81 |
+
"model_b": "SET Ties = Ties + :inc, Votes = Votes + :inc"
|
82 |
+
}
|
83 |
+
}
|
84 |
|
85 |
+
expression_a = update_expressions[vote]["model_a"]
|
86 |
+
expression_b = update_expressions[vote]["model_b"]
|
87 |
+
|
88 |
+
# Update ModelA stats
|
89 |
+
leaderboards_table.update_item(
|
90 |
+
Key={'ModelID': model_a},
|
91 |
+
UpdateExpression=expression_a,
|
92 |
+
ExpressionAttributeValues={':inc': 1}
|
93 |
+
)
|
94 |
+
|
95 |
+
# Update ModelB stats
|
96 |
+
leaderboards_table.update_item(
|
97 |
+
Key={'ModelID': model_b},
|
98 |
+
UpdateExpression=expression_b,
|
99 |
+
ExpressionAttributeValues={':inc': 1}
|
100 |
+
)
|
101 |
|
102 |
+
# Calculate new Elo scores (simple Elo calculation for illustration)
|
103 |
+
new_elo_a, new_elo_b = calculate_elo(model_a_stats['EloScore'], model_b_stats['EloScore'], vote)
|
104 |
+
|
105 |
+
# Calculate 95% CI for new Elo scores
|
106 |
+
ci_a_lower, ci_a_upper = calculate_95_ci(new_elo_a, model_a_stats['Votes'] + 1)
|
107 |
+
ci_b_lower, ci_b_upper = calculate_95_ci(new_elo_b, model_b_stats['Votes'] + 1)
|
108 |
+
|
109 |
+
# Update Elo scores and 95% CI
|
110 |
+
leaderboards_table.update_item(
|
111 |
+
Key={'ModelID': model_a},
|
112 |
+
UpdateExpression="SET EloScore = :new_elo, CI_Lower = :ci_lower, CI_Upper = :ci_upper",
|
113 |
+
ExpressionAttributeValues={':new_elo': Decimal(new_elo_a), ':ci_lower': Decimal(ci_a_lower), ':ci_upper': Decimal(ci_a_upper)}
|
114 |
+
)
|
115 |
+
|
116 |
+
leaderboards_table.update_item(
|
117 |
+
Key={'ModelID': model_b},
|
118 |
+
UpdateExpression="SET EloScore = :new_elo, CI_Lower = :ci_lower, CI_Upper = :ci_upper",
|
119 |
+
ExpressionAttributeValues={':new_elo': Decimal(new_elo_b), ':ci_lower': Decimal(ci_b_lower), ':ci_upper': Decimal(ci_b_upper)}
|
120 |
+
)
|
121 |
+
|
122 |
+
# Set the precision for Decimal
|
123 |
+
getcontext().prec = 28
|
124 |
|
125 |
# Function to calculate new Elo scores
|
126 |
def calculate_elo(elo_a, elo_b, vote, k=32):
    """Compute both models' Elo ratings after a single vote.

    Args:
        elo_a: Current rating of model A (int, float, str or Decimal).
        elo_b: Current rating of model B (int, float, str or Decimal).
        vote: "A is better" or "B is better"; any other value is scored
            as a tie (half a point each).
        k: K-factor that scales how far one result moves a rating.

    Returns:
        A ``(new_elo_a, new_elo_b)`` tuple of Decimals rounded to two
        decimal places.
    """
    # Go through str() so float inputs keep their printed value instead of
    # dragging their full binary expansion into the Decimal arithmetic.
    elo_a = Decimal(str(elo_a))
    elo_b = Decimal(str(elo_b))
    k_factor = Decimal(str(k))

    # Logistic expectation on the 400-point scale. The two expectations sum
    # to one, so B's is taken as the complement rather than recomputed.
    expected_a = 1 / (1 + Decimal(10) ** ((elo_b - elo_a) / Decimal(400)))
    expected_b = 1 - expected_a

    if vote == "A is better":
        actual_a = Decimal(1)
    elif vote == "B is better":
        actual_a = Decimal(0)
    else:  # Tie
        actual_a = Decimal("0.5")
    actual_b = 1 - actual_a

    new_elo_a = elo_a + k_factor * (actual_a - expected_a)
    new_elo_b = elo_b + k_factor * (actual_b - expected_b)

    return round(new_elo_a, 2), round(new_elo_b, 2)
|
148 |
+
|
149 |
+
# Function to calculate 95% CI for Elo scores
|
150 |
+
def calculate_95_ci(elo, votes, z=1.96):
    """Return a ``(lower, upper)`` interval around an Elo score.

    The half-width is ``z * 400 / sqrt(votes)``, i.e. the standard error is
    approximated as 400 divided by the square root of the vote count.

    Args:
        elo: The Elo score at the centre of the interval.
        votes: Number of votes behind the score.
        z: Multiplier applied to the standard error (1.96 by default).

    Returns:
        A tuple of two Decimals rounded to two decimal places, or
        ``(Decimal(0), Decimal(0))`` when there are no votes.
    """
    # A non-positive count has no usable square root (Decimal.sqrt raises on
    # negatives), so it is handled the same way as "no votes yet".
    if votes <= 0:
        return Decimal(0), Decimal(0)

    # str() keeps float arguments such as the default z=1.96 at their printed
    # value instead of their binary expansion.
    elo = Decimal(str(elo))
    std_error = Decimal(400) / Decimal(str(votes)).sqrt()
    margin = Decimal(str(z)) * std_error
    return round(elo - margin, 2), round(elo + margin, 2)
|
157 |
|
158 |
# Function to query leaderboard
|
159 |
def get_leaderboard():
|