Lisa Dunlap committed
Commit df2a130
Parent(s): a2fadac

moved buttons back to tab
app.py
CHANGED
@@ -31,42 +31,44 @@ We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
     return leaderboard_md
 
 
+def make_arena_leaderboard_md(arena_df, arena_subset_df=None, name="Overall"):
+    total_votes = sum(arena_df["num_battles"]) // 2
+    total_models = len(arena_df)
+    space = "&nbsp;&nbsp;&nbsp;"
+    if arena_subset_df is not None:
+        total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
+        total_subset_models = len(arena_subset_df)
+        vote_str = f"{space} {name} #models: **{total_subset_models}**.{space} {name} #votes: **{'{:,}'.format(total_subset_votes)}**."
+    else:
+        vote_str = ""
+    leaderboard_md = f"""
+Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{vote_str}{space} Last updated: March 29, 2024.
+
+Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! You can find code to recreate these tables and plots in this [notebook]({notebook_url}).
+
+**NEW!** Click the buttons below to view the ELO leaderboard and stats for different input categories. You are currently viewing **{name}** inputs.
+"""
+    return leaderboard_md
+
 # def make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df):
+#     # Calculate totals for each arena
 #     total_votes = sum(arena_df["num_battles"]) // 2
-#
-#     total_code_votes = sum(arena_chinese_df["num_battles"]) // 2
-#     total_code_models = len(arena_chinese_df)
+#     total_chinese_votes = sum(arena_chinese_df["num_battles"]) // 2
 #     total_long_votes = sum(arena_long_df["num_battles"]) // 2
-#     total_long_models = len(arena_long_df)
 #     total_english_votes = sum(arena_english_df["num_battles"]) // 2
-#     total_english_models = len(arena_english_df)
 
+#     # Constructing the markdown table
 #     leaderboard_md = f"""
-#
+# Last updated: March 29, 2024.
+# | | **Total** | English | Chinese | Long Context |
+# | :-------------- | :-----------------------: | :-----------------------: | :-----------------------: | :-----------------------: |
+# | # Votes | **{"{:,}".format(total_votes)}** | {"{:,}".format(total_english_votes)} | {"{:,}".format(total_chinese_votes)} | {"{:,}".format(total_long_votes)} |
+# | # Models | **{len(arena_df)}** | {len(arena_english_df)}| {len(arena_chinese_df)} | {len(arena_long_df)} |
 
-# Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
+# Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! You can find code to recreate these tables and plots in this [notebook]({notebook_url}).
 # """
-#     return leaderboard_md
-
-def make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df):
-    # Calculate totals for each arena
-    total_votes = sum(arena_df["num_battles"]) // 2
-    total_chinese_votes = sum(arena_chinese_df["num_battles"]) // 2
-    total_long_votes = sum(arena_long_df["num_battles"]) // 2
-    total_english_votes = sum(arena_english_df["num_battles"]) // 2
-
-    # Constructing the markdown table
-    leaderboard_md = f"""
-Last updated: March 29, 2024.
-| | **Total** | English | Chinese | Long Context |
-| :-------------- | :-----------------------: | :-----------------------: | :-----------------------: | :-----------------------: |
-| # Votes | **{"{:,}".format(total_votes)}** | {"{:,}".format(total_english_votes)} | {"{:,}".format(total_chinese_votes)} | {"{:,}".format(total_long_votes)} |
-| # Models | **{len(arena_df)}** | {len(arena_english_df)}| {len(arena_chinese_df)} | {len(arena_long_df)} |
 
-
-"""
-
-    return leaderboard_md
+# return leaderboard_md
 
 
 
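Reviewer note on the vote arithmetic the new function inherits: `num_battles` counts every battle once per participating model, so summing the column double-counts and the `// 2` halves it back. A minimal sketch with invented numbers (only the `num_battles` column name comes from the diff):

```python
import pandas as pd

# Toy arena table (data invented): each battle increments num_battles
# for BOTH models involved, so the column sum counts every battle twice.
arena_df = pd.DataFrame({
    "model": ["model-a", "model-b", "model-c"],
    "num_battles": [120, 100, 80],
})

total_votes = sum(arena_df["num_battles"]) // 2
print(total_votes)  # 150 battles, i.e. 150 votes
```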
@@ -231,9 +233,9 @@ def get_full_table(arena_df, model_table_df):
 
 def create_ranking_str(ranking, ranking_difference):
     if ranking_difference > 0:
-        return f"{int(ranking)} (\u2191
+        return f"{int(ranking)} (\u2191{int(ranking_difference)})"
     elif ranking_difference < 0:
-        return f"{int(ranking)} (\u2193
+        return f"{int(ranking)} (\u2193{int(-ranking_difference)})"
     else:
         return f"{int(ranking)}"
 
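For reference, `\u2191` and `\u2193` are the up and down arrows, so the repaired branches render a rank plus its movement since the last update. A quick check of the committed function with invented values:

```python
def create_ranking_str(ranking, ranking_difference):
    # Same logic as the committed function: show the rank, plus the
    # movement versus the previous leaderboard when it changed.
    if ranking_difference > 0:
        return f"{int(ranking)} (\u2191{int(ranking_difference)})"
    elif ranking_difference < 0:
        return f"{int(ranking)} (\u2193{int(-ranking_difference)})"
    else:
        return f"{int(ranking)}"

print(create_ranking_str(3, 2))   # 3 (↑2)  climbed two places
print(create_ranking_str(5, -1))  # 5 (↓1)  dropped one place
print(create_ranking_str(7, 0))   # 7       unchanged
```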
@@ -293,12 +295,17 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
             print(f"{model_key} - {e}")
     return values
 
-def
+def update_leaderboard_and_plots(button, arena_df, model_table_df, arena_subset_df, elo_subset_results):
+    arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df)
     p1 = elo_subset_results["win_fraction_heatmap"]
     p2 = elo_subset_results["battle_count_heatmap"]
     p3 = elo_subset_results["bootstrap_elo_rating"]
     p4 = elo_subset_results["average_win_rate_bar"]
-
+    more_stats_md = f"""## More Statistics for Chatbot Arena ({button})\n
+You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
+"""
+    leaderboard_md = make_arena_leaderboard_md(arena_df, arena_subset_df, name=button)
+    return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
 
 
 def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
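The contract worth checking here: Gradio assigns a callback's returned values to its `outputs=` components positionally, so this seven-tuple must stay in lockstep with the `outputs=[...]` lists wired near the end of this diff. A plain-Python illustration of that mapping (component names from the diff, the pairing loop is mine):

```python
outputs = ["elo_display_df", "plot_1", "plot_2", "plot_3", "plot_4",
           "more_stats_md", "leaderboard_markdown"]
returns = ["arena_values", "p1", "p2", "p3", "p4",
           "more_stats_md", "leaderboard_md"]

# Position i of the returned tuple updates component i of outputs=[...].
for component, value in zip(outputs, returns):
    print(f"{component:22} <- {value}")
```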
@@ -325,6 +332,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
     default_md = make_default_md(arena_df, elo_results)
 
     md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
+    # md = make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df)
     if leaderboard_table_file:
         data = load_leaderboard_table_csv(leaderboard_table_file)
         model_table_df = pd.DataFrame(data)
@@ -333,17 +341,21 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
         # arena table
         arena_table_vals = get_arena_table(arena_df, model_table_df)
         with gr.Tab("Arena Elo", id=0):
-            md = make_arena_leaderboard_md(arena_df
-            gr.Markdown(md, elem_id="leaderboard_markdown")
+            md = make_arena_leaderboard_md(arena_df)
+            leaderboard_markdown = gr.Markdown(md, elem_id="leaderboard_markdown")
             with gr.Row():
                 overall_rating = gr.Button("Overall")
-                update_overall_rating_df = lambda _: get_arena_table(arena_df, model_table_df)
+                # update_overall_rating_df = lambda _: get_arena_table(arena_df, model_table_df)
+                update_overall_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, None, elo_results)
                 english_rating = gr.Button("English")
-                update_english_rating_df = lambda
+                update_english_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, arena_english_df, elo_english_results)
+                # update_english_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_english_df)
                 chinese_rating = gr.Button("Chinese")
-                update_chinese_rating_df = lambda
+                update_chinese_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, arena_chinese_df, elo_chinese_results)
+                # update_chinese_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_chinese_df)
                 long_context_rating = gr.Button("Long Context")
-                update_long_context_rating_df = lambda
+                update_long_context_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, arena_long_df, elo_long_results)
+                # update_long_context_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_long_df)
             elo_display_df = gr.Dataframe(
                 headers=[
                     "Rank",
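Two things make this lambda wiring work: a `gr.Button` passed as `inputs` hands its label string to the callback (that string becomes the `button` argument, and ultimately `name=button`), and the lambdas close over the module-level dataframes rather than receiving them as inputs. A minimal self-contained sketch of the same pattern (handler body invented):

```python
import gradio as gr

def show_category(label):
    # `label` arrives as the clicked button's text, e.g. "English".
    return f"You are currently viewing **{label}** inputs."

with gr.Blocks() as demo:
    md = gr.Markdown()
    with gr.Row():
        for name in ["Overall", "English", "Chinese", "Long Context"]:
            btn = gr.Button(name)
            # Passing the button itself as `inputs` supplies its label.
            btn.click(fn=show_category, inputs=btn, outputs=md)

if __name__ == "__main__":
    demo.launch()
```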
@@ -368,14 +380,14 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
                 value=arena_table_vals,
                 elem_id="arena_leaderboard_dataframe",
                 height=700,
-                column_widths=[70, 190,
+                column_widths=[70, 190, 110, 100, 90, 160, 150, 140],
                 wrap=True,
             )
             # Setup the button click action
-            overall_rating.click(fn=update_overall_rating_df, inputs=overall_rating, outputs=elo_display_df)
-            english_rating.click(fn=update_english_rating_df, inputs=english_rating, outputs=elo_display_df)
-            chinese_rating.click(fn=update_chinese_rating_df, inputs=chinese_rating ,outputs=elo_display_df)
-            long_context_rating.click(fn=update_long_context_rating_df, inputs=long_context_rating, outputs=elo_display_df)
+            # overall_rating.click(fn=update_overall_rating_df, inputs=overall_rating, outputs=elo_display_df)
+            # english_rating.click(fn=update_english_rating_df, inputs=english_rating, outputs=elo_display_df)
+            # chinese_rating.click(fn=update_chinese_rating_df, inputs=chinese_rating ,outputs=elo_display_df)
+            # long_context_rating.click(fn=update_long_context_rating_df, inputs=long_context_rating, outputs=elo_display_df)
 
         with gr.Tab("Full Leaderboard", id=1):
             md = make_full_leaderboard_md(elo_results)
@@ -418,22 +430,12 @@ See Figure 3 below for visualization of the confidence intervals.
     leader_component_values[:] = [default_md, p1, p2, p3, p4]
 
     if show_plot:
-        gr.Markdown(
+        more_stats_md = gr.Markdown(
             f"""## More Statistics for Chatbot Arena\n
-
-You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
+You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
             """,
             elem_id="leaderboard_markdown"
         )
-        with gr.Row():
-            overall_plots = gr.Button("Overall")
-            update_overall_plots = lambda _: get_plots(elo_results)
-            english_plots = gr.Button("English")
-            update_english_plot = lambda _: get_plots(elo_english_results)
-            chinese_plots = gr.Button("Chinese")
-            update_chinese_plot = lambda _: get_plots(elo_chinese_results)
-            long_context_plots = gr.Button("Long Context")
-            update_long_context_plot = lambda _: get_plots(elo_long_results)
         with gr.Row():
             with gr.Column():
                 gr.Markdown(
@@ -457,10 +459,10 @@ You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
             )
             plot_4 = gr.Plot(p4, show_label=False)
 
-
-
-
-
+        overall_rating.click(fn=update_overall_rating_df, inputs=overall_rating, outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
+        english_rating.click(fn=update_english_rating_df, inputs=english_rating, outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
+        chinese_rating.click(fn=update_chinese_rating_df, inputs=chinese_rating ,outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
+        long_context_rating.click(fn=update_long_context_rating_df, inputs=long_context_rating, outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
 
     gr.Markdown(acknowledgment_md)
 
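Note that the `.click` registrations now live after the plot components are created: an event's `outputs=` must reference already-constructed components, but the wiring itself may happen after the layout blocks have closed, as long as the Python variables are still in scope. A small sketch of that ordering (components invented):

```python
import gradio as gr

with gr.Blocks() as demo:
    with gr.Tab("Leaderboard"):
        btn = gr.Button("Refresh")
        table = gr.Dataframe(headers=["Rank", "Model"])
    with gr.Tab("Stats"):
        stats = gr.Markdown()

    # Wired after both tabs closed; one click updates components in both.
    btn.click(fn=lambda: ([[1, "model-a"]], "## Stats refreshed"),
              outputs=[table, stats])

if __name__ == "__main__":
    demo.launch()
```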
@@ -494,7 +496,7 @@ block_css = """
 
 #arena_leaderboard_dataframe td {
     line-height: 0.15em;
-    font-size:
+    font-size: 18px;
 }
 #arena_leaderboard_dataframe th {
     font-size: 20px;
@@ -503,7 +505,7 @@ block_css = """
 
 #full_leaderboard_dataframe td {
     line-height: 0.15em;
-    font-size:
+    font-size: 18px;
 }
 #full_leaderboard_dataframe th {
     font-size: 20px;
@@ -538,12 +540,10 @@ We thank [Kaggle](https://www.kaggle.com/), [MBZUAI](https://mbzuai.ac.ae/), [a16z](https://www.a16z.com/)
 
 def build_demo(elo_results_file, leaderboard_table_file):
     text_size = gr.themes.sizes.text_lg
-    print("text_size", text_size)
 
     with gr.Blocks(
         title="Chatbot Arena Leaderboard",
-        # theme=gr.themes.
-        # theme='reilnuud/polite',
+        # theme=gr.themes.Base(text_size=text_size),
         theme = gr.themes.Base.load("theme.json"),
         css=block_css,
     ) as demo:
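The theme change replaces the inline experiments with a theme serialized to `theme.json`. Gradio themes can be dumped to and reloaded from JSON; a hedged sketch of that round trip, assuming the `dump(filename)` helper on theme objects (the `text_size` tweak is illustrative, only the `load` call appears in this commit):

```python
import gradio as gr

# Build a variant of the base theme and serialize it to JSON.
theme = gr.themes.Base(text_size=gr.themes.sizes.text_lg)
theme.dump("theme.json")

# Reload it the way app.py now does.
theme = gr.themes.Base.load("theme.json")

with gr.Blocks(title="Chatbot Arena Leaderboard", theme=theme) as demo:
    gr.Markdown("Themed demo")

if __name__ == "__main__":
    demo.launch()
```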