Commit 0b8a8d2
Parent(s): bf17f55
Add speech tab

- app.py +47 -7
- src/about.py +8 -0
- src/display/css_html_js.py +15 -0
- src/display/utils.py +16 -1
- src/leaderboard/read_evals.py +19 -9
- src/populate.py +3 -3
app.py
CHANGED
@@ -15,10 +15,13 @@ from src.about import (
 from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
+    SPEECH_BENCHMARK_COLS,
     COLS,
+    COLS_SPEECH,
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
+    AutoEvalColumnSpeech,
     ModelType,
     fields,
     WeightType,
@@ -58,20 +61,22 @@ except Exception:
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-def init_leaderboard(dataframe):
+def init_leaderboard(dataframe,result_type='text'):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+    column_class = AutoEvalColumn if result_type == "text" else AutoEvalColumnSpeech
+
     return Leaderboard(
         value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
+        datatype=[c.type for c in fields(column_class)],
         select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            default_selection=[c.name for c in fields(column_class) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(column_class) if c.never_hidden],
             label="Select Columns to Display:",
         ),
         # search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        search_columns=[AutoEvalColumn.model.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        search_columns=[column_class.model.name],
+        hide_columns=[c.name for c in fields(column_class) if c.hidden],
         filter_columns=[
             # ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
             # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
@@ -111,6 +116,19 @@ leaderboard_dataframes = {
         COLS,
         BENCHMARK_COLS,
         region if region != "All" else None,
+        result_type="text"
+    )
+    for region in REGIONS
+}
+
+leaderboard_dataframes_speech = {
+    region: get_leaderboard_df(
+        EVAL_RESULTS_PATH,
+        EVAL_REQUESTS_PATH,
+        COLS_SPEECH,
+        SPEECH_BENCHMARK_COLS,
+        region if region != "All" else None,
+        result_type="speech"
     )
     for region in REGIONS
 }
@@ -165,10 +183,32 @@ with demo:
                     elem_id=f"leaderboard-{region_key}",
                     elem_classes=["visible"] if region_key == "All" else []
                 ):
-                    init_leaderboard(leaderboard_dataframes[region_key])
+                    init_leaderboard(leaderboard_dataframes[region_key], result_type="text")
 
             # JS hook to toggle visible leaderboard
             region_dropdown.change(None, js=js_switch_code, inputs=[region_dropdown])
+
+        with gr.TabItem("🗣️ mSTEB Speech Benchmark", elem_id="speech-benchmark-tab-table", id=1):
+            with gr.Row():
+                speech_region_dropdown = gr.Dropdown(
+                    choices=list(REGION_MAP.keys()),
+                    label="Select Region",
+                    value="All",
+                    interactive=True,
+                )
+
+            for display_name, region_key in REGION_MAP.items():
+                with gr.Column(
+                    elem_id=f"speech-leaderboard-{region_key}",
+                    elem_classes=["visible"] if region_key == "All" else []
+                ):
+                    init_leaderboard(leaderboard_dataframes_speech[region_key],result_type='speech')
+
+            speech_region_dropdown.change(
+                None,
+                js=js_switch_code.replace("leaderboard-", "speech-leaderboard-"),
+                inputs=[speech_region_dropdown]
+            )
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
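
For orientation, a minimal smoke test of the new speech path outside of Gradio. This is a sketch only: it assumes EVAL_RESULTS_PATH and EVAL_REQUESTS_PATH are exported from src.envs as in the stock leaderboard template, and that speech result files are already present locally.

# Sketch: build the aggregate ("All") speech leaderboard dataframe directly,
# mirroring what app.py does when it fills leaderboard_dataframes_speech.
from src.display.utils import COLS_SPEECH, SPEECH_BENCHMARK_COLS
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH  # assumed location of these constants
from src.populate import get_leaderboard_df

speech_df = get_leaderboard_df(
    EVAL_RESULTS_PATH,
    EVAL_REQUESTS_PATH,
    COLS_SPEECH,
    SPEECH_BENCHMARK_COLS,
    None,                 # region=None, as app.py passes for "All"
    result_type="speech",
)
print(speech_df.head())
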
src/about.py
CHANGED
@@ -19,6 +19,14 @@ class Tasks(Enum):
     task4 = Task("machine_translation_xx_eng", "chrf", "MT (xx-en)")
     task5 = Task("machine_translation_eng_xx", "chrf", "MT (en-xx)")
 
+class SpeechTasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("lid", "acc", "LID")
+    task1 = Task("topic_classification", "acc", "TC")
+    task2 = Task("rc_qa", "acc", "RC-QA")
+    task3 = Task("asr", "cer", "ASR")
+    task4 = Task("s2tt", "chrf", "S2TT")
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
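
The new enum reuses the existing Task container, so downstream code reads the same three fields from each member; a short sketch of the accessors this commit relies on elsewhere (task.value.benchmark / .metric / .col_name):

# Sketch: the fields read_evals.py and utils.py read from each SpeechTasks member.
from src.about import SpeechTasks

for task in SpeechTasks:
    t = task.value
    print(t.benchmark, t.metric, t.col_name)  # e.g. "asr", "cer", "ASR"
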
src/display/css_html_js.py
CHANGED
@@ -109,6 +109,21 @@ custom_css = """
     display: block;
 }
 
+
+[id^="speech-leaderboard-"] {
+    display: none;
+}
+#speech-leaderboard-All.visible,
+#speech-leaderboard-Africa.visible,
+#speech-leaderboard-Americas_Oceania.visible,
+#speech-leaderboard-Asia_S.visible,
+#speech-leaderboard-Asia_SE.visible,
+#speech-leaderboard-Asia_W_C.visible,
+#speech-leaderboard-Asia_E.visible,
+#speech-leaderboard-Europe_W_N_S.visible,
+#speech-leaderboard-Europe_E.visible {
+    display: block;
+}
 """
 
 get_window_url_params = """
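
These selectors are coupled to the elem_ids app.py builds from REGION_MAP ("speech-leaderboard-<region_key>"). A hypothetical consistency check, with the key list read off the selectors above rather than from REGION_MAP itself:

# Sketch: every speech region column needs a matching ".visible" rule above,
# otherwise that region's table stays hidden when selected.
from src.display.css_html_js import custom_css

region_keys = ["All", "Africa", "Americas_Oceania", "Asia_S", "Asia_SE",
               "Asia_W_C", "Asia_E", "Europe_W_N_S", "Europe_E"]
for key in region_keys:
    assert f"#speech-leaderboard-{key}.visible" in custom_css, key
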
src/display/utils.py
CHANGED
@@ -4,6 +4,7 @@ from enum import Enum
 import pandas as pd
 
 from src.about import Tasks
+from src.about import SpeechTasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -29,6 +30,18 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️ (Class. Tasks)", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+
+### Speech leaderboard columns
+auto_eval_column_dict_speech = []
+# Init
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("Model Type", "str", True, never_hidden=True)])
+auto_eval_column_dict_speech.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+#Scores
+auto_eval_column_dict_speech.append(["average", ColumnContent, ColumnContent("Average ⬆️ (Class. Tasks)", "number", True)])
+for task in SpeechTasks:
+    auto_eval_column_dict_speech.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+
+
 # Model information
 # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -42,6 +55,7 @@ for task in Tasks:
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+AutoEvalColumnSpeech = make_dataclass("AutoEvalColumnSpeech", auto_eval_column_dict_speech, frozen=True)
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -102,9 +116,10 @@ class Precision(Enum):
 
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+COLS_SPEECH = [c.name for c in fields(AutoEvalColumnSpeech) if not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
+SPEECH_BENCHMARK_COLS = [t.value.col_name for t in SpeechTasks]
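
AutoEvalColumnSpeech is built the same way as AutoEvalColumn, so the attributes app.py reads on each column (type, displayed_by_default, hidden, never_hidden) are available for the speech table as well; a quick inspection sketch:

# Sketch: inspect the dynamically built speech column class the way
# init_leaderboard consumes it.
from src.display.utils import AutoEvalColumnSpeech, COLS_SPEECH, fields

for c in fields(AutoEvalColumnSpeech):
    print(c.name, c.type, c.displayed_by_default, c.hidden, c.never_hidden)

print(COLS_SPEECH)  # the non-hidden speech column names passed to get_leaderboard_df
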
src/leaderboard/read_evals.py
CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, SpeechTasks
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -34,7 +34,7 @@ class EvalResult:
     regions: dict = None
 
     @classmethod
-    def init_from_json_file(self, json_filepath):
+    def init_from_json_file(self, json_filepath, result_type='speech'):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -70,7 +70,10 @@ class EvalResult:
 
         # Extract results available in this file (some results are split in several files)
         results = {}
-        for task in Tasks:
+
+        task_enum = Tasks if result_type == "text" else SpeechTasks
+
+        for task in task_enum:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
@@ -84,7 +87,7 @@ class EvalResult:
         regions_processed_results = {}
         for region, region_results in regions.items():
             processed = {}
-            for task in Tasks:
+            for task in task_enum:
                 task = task.value
 
                 # We average all scores of a given metric (not all metrics are present in all files)
@@ -124,13 +127,15 @@ class EvalResult:
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
-    def to_dict(self, region=None):
+    def to_dict(self, region=None, result_type='text'):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         # print(self.results)
+        task_enum = Tasks if result_type == "text" else SpeechTasks
+
         results = self.results if region is None else self.regions.get(region, {})
         acc_values = [
             results[task.value.benchmark]
-            for task in Tasks
+            for task in task_enum
             if task.value.metric == "acc" and task.value.benchmark in results
         ]
         # print(acc_values)
@@ -154,7 +159,7 @@ class EvalResult:
             # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
-        for task in Tasks:
+        for task in task_enum:
             if task.value.benchmark in results:
                 data_dict[task.value.col_name] = results[task.value.benchmark]
             else:
@@ -185,12 +190,17 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file
 
 
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str, requests_path: str, result_type: str = "text") -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
+    # result type
    model_result_filepaths = []
 
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
+        if result_type == "text" and "msteb_text_results" not in root:
+            continue
+        if result_type == "speech" and "msteb_speech_results" not in root:
+            continue
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
 
@@ -206,7 +216,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result = EvalResult.init_from_json_file(model_result_filepath,result_type)
         # print('testing this one')
         # print(eval_result)
         eval_result.update_with_request_file(requests_path)
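
Two notes on the result_type plumbing: init_from_json_file defaults to 'speech' while get_raw_eval_results defaults to "text", which is harmless only because every call site passes result_type explicitly; and the os.walk filter implies the results tree is split by modality into paths containing msteb_text_results and msteb_speech_results. A sketch of that filter in isolation (the exact nesting under results_path is an assumption; only the path fragments come from this commit):

# Sketch: which result folders survive the new result_type filter, mirroring the
# substring checks in get_raw_eval_results.
import os

def matching_result_dirs(results_path: str, result_type: str) -> list[str]:
    keep = "msteb_text_results" if result_type == "text" else "msteb_speech_results"
    return [
        root
        for root, _, files in os.walk(results_path)
        if keep in root and files and all(f.endswith(".json") for f in files)
    ]
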
src/populate.py
CHANGED
@@ -8,12 +8,12 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, region=None) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, region=None, result_type="text") -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
+    raw_data = get_raw_eval_results(results_path, requests_path, result_type=result_type)
     # this here if region is none gets main results. I have to pass region value here to get region based results
     # and they should come.
-    all_data_json = [v.to_dict(region) for v in raw_data]
+    all_data_json = [v.to_dict(region, result_type) for v in raw_data]
     # print('all_data_json', all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
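
One coupling worth flagging: the final sort uses AutoEvalColumn.average.name for both result types, which only works because the speech column list reuses the same display label for its average column. A one-line check of that assumption:

# Sketch: the shared "Average ⬆️ (Class. Tasks)" label that lets the text sort key
# also work for speech dataframes.
from src.display.utils import AutoEvalColumn, AutoEvalColumnSpeech

assert AutoEvalColumn.average.name == AutoEvalColumnSpeech.average.name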