Commit e121d4e · Lisa Dunlap committed
Parent(s): df2a130

moved around text for aesthetic purposes
app.py
CHANGED
@@ -26,7 +26,7 @@ def make_default_md(arena_df, elo_results):
 [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
 LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
-We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
+We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system. Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
 """
     return leaderboard_md
 
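The blurb above credits the Elo ranking system for turning pairwise preference votes into a ranking. As a hedged illustration only (the leaderboard fits its ratings offline with bootstrapping, per Figure 3; this `elo_update` helper and its numbers are hypothetical, not code from app.py), one classic online Elo update looks like this:

```python
def elo_update(r_a: float, r_b: float, a_wins: bool, k: float = 32.0):
    """One online Elo update after a single A-vs-B preference vote."""
    expected_a = 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))  # P(A beats B)
    score_a = 1.0 if a_wins else 0.0
    delta = k * (score_a - expected_a)
    # Winner gains exactly what the loser sheds; upsets move ratings most.
    return r_a + delta, r_b - delta

# A 1000-rated model upsetting a 1200-rated one swings ~24 points each way.
print(elo_update(1200.0, 1000.0, a_wins=False))  # (~1175.7, ~1024.3)
```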
@@ -44,34 +44,10 @@ def make_arena_leaderboard_md(arena_df, arena_subset_df=None, name="Overall"):
     leaderboard_md = f"""
 Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{vote_str}{space} Last updated: March 29, 2024.
 
-Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! You can find code to recreate these tables and plots in this [notebook]({notebook_url}).
-
 **NEW!** Click the buttons below to view the ELO leaderboard and stats for different input categories. You are currently viewing **{name}** inputs.
 """
     return leaderboard_md
 
-# def make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df):
-#     # Calculate totals for each arena
-#     total_votes = sum(arena_df["num_battles"]) // 2
-#     total_chinese_votes = sum(arena_chinese_df["num_battles"]) // 2
-#     total_long_votes = sum(arena_long_df["num_battles"]) // 2
-#     total_english_votes = sum(arena_english_df["num_battles"]) // 2
-
-#     # Constructing the markdown table
-#     leaderboard_md = f"""
-# Last updated: March 29, 2024.
-# | | **Total** | English | Chinese | Long Context |
-# | :-------------- | :-----------------------: | :-----------------------: | :-----------------------: | :-----------------------: |
-# | # Votes | **{"{:,}".format(total_votes)}** | {"{:,}".format(total_english_votes)} | {"{:,}".format(total_chinese_votes)} | {"{:,}".format(total_long_votes)} |
-# | # Models | **{len(arena_df)}** | {len(arena_english_df)} | {len(arena_chinese_df)} | {len(arena_long_df)} |
-
-# Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! You can find code to recreate these tables and plots in this [notebook]({notebook_url}).
-# """
-
-#     return leaderboard_md
-
-
-
 def make_full_leaderboard_md(elo_results):
     leaderboard_md = f"""
 Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
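The deleted helper divides the summed `num_battles` column by two. The reason is worth spelling out: every battle increments the counter of both participating models, so a per-model sum double-counts battles. A toy sketch (made-up frame and numbers, not app.py data):

```python
import pandas as pd

# Each of the 5 battles below touches two models, so the column sums to 10.
arena_df = pd.DataFrame(
    {"model": ["model-a", "model-b", "model-c"], "num_battles": [4, 3, 3]}
)
total_votes = sum(arena_df["num_battles"]) // 2  # 10 // 2 = 5 battles
print(total_votes)  # 5
```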
@@ -301,8 +277,7 @@ def update_leaderboard_and_plots(button, arena_df, model_table_df, arena_subset_
         p2 = elo_subset_results["battle_count_heatmap"]
         p3 = elo_subset_results["bootstrap_elo_rating"]
         p4 = elo_subset_results["average_win_rate_bar"]
-        more_stats_md = f"""## More Statistics for Chatbot Arena ({button})
-You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
+        more_stats_md = f"""## More Statistics for Chatbot Arena ({button})
 """
         leaderboard_md = make_arena_leaderboard_md(arena_df, arena_subset_df, name=button)
         return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
@@ -383,11 +358,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
                 column_widths=[70, 190, 110, 100, 90, 160, 150, 140],
                 wrap=True,
             )
-            # Setup the button click action
-            # overall_rating.click(fn=update_overall_rating_df, inputs=overall_rating, outputs=elo_display_df)
-            # english_rating.click(fn=update_english_rating_df, inputs=english_rating, outputs=elo_display_df)
-            # chinese_rating.click(fn=update_chinese_rating_df, inputs=chinese_rating, outputs=elo_display_df)
-            # long_context_rating.click(fn=update_long_context_rating_df, inputs=long_context_rating, outputs=elo_display_df)
 
         with gr.Tab("Full Leaderboard", id=1):
             md = make_full_leaderboard_md(elo_results)
@@ -422,7 +392,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
         gr.Markdown(
             f"""Note: we take the 95% confidence interval into account when determining a model's ranking.
 A model is ranked higher only if its lower bound of model score is higher than the upper bound of the other model's score.
-See Figure 3 below for visualization of the confidence intervals.
+See Figure 3 below for visualization of the confidence intervals. Code to recreate these tables and plots in this [notebook]({notebook_url}) and more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
 """,
             elem_id="leaderboard_markdown"
         )
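The note in this hunk states the ranking rule precisely: model A is placed above model B only when A's lower confidence bound clears B's upper bound; overlapping intervals yield no strict ordering. A worked sketch of that rule, with invented interval endpoints (`strictly_higher` is illustrative, not an app.py function):

```python
def strictly_higher(ci_a: tuple[float, float], ci_b: tuple[float, float]) -> bool:
    """True if interval A sits entirely above interval B, per the note above."""
    return ci_a[0] > ci_b[1]  # A's lower bound must beat B's upper bound

print(strictly_higher((1240, 1260), (1210, 1235)))  # True: 1240 > 1235
print(strictly_higher((1240, 1260), (1225, 1245)))  # False: overlap, no strict rank
```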
@@ -430,32 +400,31 @@ See Figure 3 below for visualization of the confidence intervals.
         leader_component_values[:] = [default_md, p1, p2, p3, p4]
 
     if show_plot:
-        more_stats_md = gr.Markdown(
-            f"""## More Statistics for Chatbot Arena (Overall)
-You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
-""",
-            elem_id="leaderboard_markdown"
-        )
+        # more_stats_md = gr.Markdown(
+        #     f"""## More Statistics for Chatbot Arena (Overall)""",
+        #     elem_id="leaderboard_markdown"
+        # )
+        more_stats_md = gr.Button("More Statistics for Chatbot Arena (Overall)", elem_id="non-interactive-button")
         with gr.Row():
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles"
+                    "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles", elem_id="plot-title", variant="panel"
                 )
-                plot_1 = gr.Plot(p1, show_label=False)
+                plot_1 = gr.Plot(p1, show_label=False, elem_id="plot-container")
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 2: Battle Count for Each Combination of Models (without Ties)"
+                    "#### Figure 2: Battle Count for Each Combination of Models (without Ties)", elem_id="plot-title"
                 )
                 plot_2 = gr.Plot(p2, show_label=False)
         with gr.Row():
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)"
+                    "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)", elem_id="plot-title"
                 )
                 plot_3 = gr.Plot(p3, show_label=False)
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
+                    "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)", elem_id="plot-title"
                 )
                 plot_4 = gr.Plot(p4, show_label=False)
 
@@ -494,21 +463,23 @@ block_css = """
     line-height: 0.1em;
 }
 
-#
-
-
-}
-#arena_leaderboard_dataframe th {
-    font-size: 20px;
+#plot-title {
+    text-align: center;
+    display:block;
 }
 
-
-
-
-
-
-
-
+#non-interactive-button {
+    display: inline-block;
+    padding: 10px 10px;
+    background-color: #f7f7f7; /* Super light grey background */
+    color: #000000; /* Black text */
+    text-align: center;
+    font-size: 26px; /* Larger text */
+    border-radius: 0; /* Straight edges, no border radius */
+    border: 0px solid #dcdcdc; /* A light grey border to match the background */
+    font-weight: bold;
+    user-select: none; /* The text inside the button is not selectable */
+    pointer-events: none; /* The button is non-interactive */
 }
 
 footer {
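The `#non-interactive-button` rule supports the trick a few hunks up: a `gr.Button` doubles as a bold section heading, with `pointer-events: none` swallowing clicks and `user-select: none` keeping the label from highlighting. A minimal self-contained sketch of the same pattern, assuming only what the diff shows:

```python
import gradio as gr

# CSS trimmed to the two properties that make the button inert.
css = """
#non-interactive-button {
    font-weight: bold;
    user-select: none;    /* label text cannot be selected */
    pointer-events: none; /* clicks never reach the button */
}
"""

with gr.Blocks(css=css) as demo:
    # Renders like a heading; no .click() handler could ever fire.
    gr.Button("More Statistics for Chatbot Arena (Overall)",
              elem_id="non-interactive-button")

# demo.launch()  # uncomment to serve locally
```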
@@ -540,11 +511,12 @@ We thank [Kaggle](https://www.kaggle.com/), [MBZUAI](https://mbzuai.ac.ae/), [a1
 
 def build_demo(elo_results_file, leaderboard_table_file):
     text_size = gr.themes.sizes.text_lg
-
+    theme = gr.themes.Base(text_size=text_size)
+    theme.set(button_secondary_background_fill_hover="*primary_300", button_secondary_background_fill_hover_dark="*primary_700")
     with gr.Blocks(
         title="Chatbot Arena Leaderboard",
-
-        theme = gr.themes.Base.load("theme.json"),
+        theme=theme,
+        # theme = gr.themes.Base.load("theme.json"),
         css=block_css,
     ) as demo:
         leader_components = build_leaderboard_tab(
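This last hunk swaps the `theme.json` file load for a theme built in code, which keeps the hover styling for the new category buttons versioned alongside the app. A minimal sketch of the same pattern in isolation (the `gr.Markdown` placeholder is hypothetical, not from app.py):

```python
import gradio as gr

# Build the theme programmatically: Base theme, larger text, and custom
# hover fills for secondary buttons (light and dark variants).
theme = gr.themes.Base(text_size=gr.themes.sizes.text_lg)
theme.set(
    button_secondary_background_fill_hover="*primary_300",
    button_secondary_background_fill_hover_dark="*primary_700",
)

with gr.Blocks(title="Chatbot Arena Leaderboard", theme=theme) as demo:
    gr.Markdown("Leaderboard goes here.")  # placeholder content

# demo.launch()
```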
|