luelhagos committed
Commit cf9ec51 · 1 Parent(s): 62aeff2

Compare overall tasks by regions

app.py CHANGED
@@ -28,7 +28,7 @@ from src.display.utils import (
     Precision, REGION_MAP
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_regional_comparison_df
 from src.submission.submit import handle_csv_submission
 
 text_sample_path = "src/submission_samples/model_name_text.csv"
@@ -123,6 +123,24 @@ leaderboard_dataframes_speech = {
     )
     for region in REGION_MAP.values()
 }
+
+# Pre-generate comparison dataframes
+default_comparison_regions = ["All", "Africa", "Asia (S)"]
+text_comparison_df = get_regional_comparison_df(
+    EVAL_RESULTS_PATH,
+    EVAL_REQUESTS_PATH,
+    default_comparison_regions,
+    BENCHMARK_COLS,
+    result_type="text"
+)
+
+speech_comparison_df = get_regional_comparison_df(
+    EVAL_RESULTS_PATH,
+    EVAL_REQUESTS_PATH,
+    default_comparison_regions,
+    SPEECH_BENCHMARK_COLS,
+    result_type="speech"
+)
 # Preload leaderboard blocks
 js_switch_code = """
 (displayRegion) => {
@@ -142,9 +160,9 @@ js_switch_code = """
     const target = document.getElementById("leaderboard-" + region);
     if (target) {
         target.classList.add("visible");
-        // 🧠 Trigger reflow to fix row cutoff
-        void target.offsetHeight; // Trigger reflow
-        target.style.display = "none"; // Hide momentarily
+        // Trigger reflow to fix row cutoff
+        void target.offsetHeight;
+        target.style.display = "none";
         requestAnimationFrame(() => {
             target.style.display = "";
         });
@@ -180,6 +198,44 @@ with demo:
                 js=js_switch_code.replace("leaderboard-", "speech-leaderboard-"),
                 inputs=[speech_region_dropdown]
             )
+
+            # Regional comparison section for speech
+            with gr.Row():
+                gr.Markdown("### 🔍 Compare Regions (Average All Tasks)", elem_classes="markdown-text")
+
+            with gr.Row():
+                speech_compare_regions = gr.CheckboxGroup(
+                    choices=list(REGION_MAP.keys()),
+                    label="Select regions to compare",
+                    value=default_comparison_regions,
+                    interactive=True,
+                )
+
+            # Use Dataframe for dynamic columns
+            speech_comparison_output = gr.Dataframe(
+                value=speech_comparison_df,
+                interactive=False,
+                wrap=True,
+                elem_id="speech-comparison-table"
+            )
+
+            def update_speech_comparison(selected_regions):
+                if not selected_regions:
+                    return pd.DataFrame()
+                df = get_regional_comparison_df(
+                    EVAL_RESULTS_PATH,
+                    EVAL_REQUESTS_PATH,
+                    selected_regions,
+                    SPEECH_BENCHMARK_COLS,
+                    result_type="speech"
+                )
+                return df
+
+            speech_compare_regions.change(
+                update_speech_comparison,
+                inputs=[speech_compare_regions],
+                outputs=[speech_comparison_output]
+            )
         with gr.TabItem("🏅 mSTEB Text Benchmark", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Row():
                 region_dropdown = gr.Dropdown(
@@ -200,6 +256,44 @@ with demo:
             # JS hook to toggle visible leaderboard
             region_dropdown.change(None, js=js_switch_code, inputs=[region_dropdown])
 
+            # Regional comparison section for text
+            with gr.Row():
+                gr.Markdown("### 🔍 Compare Regions (Average All Tasks)", elem_classes="markdown-text")
+
+            with gr.Row():
+                text_compare_regions = gr.CheckboxGroup(
+                    choices=list(REGION_MAP.keys()),
+                    label="Select regions to compare",
+                    value=default_comparison_regions,
+                    interactive=True,
+                )
+
+            # Use Dataframe for dynamic columns
+            text_comparison_output = gr.Dataframe(
+                value=text_comparison_df,
+                interactive=False,
+                wrap=True,
+                elem_id="text-comparison-table"
+            )
+
+            def update_text_comparison(selected_regions):
+                if not selected_regions:
+                    return pd.DataFrame()
+                df = get_regional_comparison_df(
+                    EVAL_RESULTS_PATH,
+                    EVAL_REQUESTS_PATH,
+                    selected_regions,
+                    BENCHMARK_COLS,
+                    result_type="text"
+                )
+                return df
+
+            text_compare_regions.change(
+                update_text_comparison,
+                inputs=[text_compare_regions],
+                outputs=[text_comparison_output]
+            )
+
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
src/about.py CHANGED
@@ -11,22 +11,22 @@ class Task:
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("lid", "acc", "LID")
-    task1 = Task("topic_classification", "acc", "TC")
-    task2 = Task("rc_qa", "acc", "RC-QA")
-    task3 = Task("nli", "acc", "NLI")
-    task4 = Task("machine_translation_xx_eng", "chrf", "MT (xx-en)")
-    task5 = Task("machine_translation_eng_xx", "chrf", "MT (en-xx)")
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("lid", "acc", "LID (ACC)")
+    task1 = Task("topic_classification", "acc", "TC (ACC)")
+    task2 = Task("rc_qa", "acc", "RC-QA (ACC)")
+    task3 = Task("nli", "acc", "NLI (ACC)")
+    task4 = Task("machine_translation_xx_eng", "chrf", "MT (xx-en) (CHRF++)")
+    task5 = Task("machine_translation_eng_xx", "chrf", "MT (en-xx) (CHRF++)")
 
 class SpeechTasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("lid", "acc", "LID")
-    task1 = Task("topic_classification", "acc", "TC")
-    task2 = Task("rc_qa", "acc", "RC-QA")
-    task3 = Task("asr", "cer", "ASR")
-    task4 = Task("s2tt_xx_eng", "chrf", "S2TT (xx-en)")
-    #task5 = Task("s2tt_eng_xx", "chrf", "S2TT (en-xx)")
+    task0 = Task("lid", "acc", "LID (ACC)")
+    task1 = Task("topic_classification", "acc", "TC (ACC)")
+    task2 = Task("rc_qa", "acc", "RC-QA (ACC)")
+    task3 = Task("asr", "cer", "ASR (100-CER)")
+    task4 = Task("s2tt_xx_eng", "chrf", "S2TT (xx-en) (CHRF++)")
+    #task5 = Task("s2tt_eng_xx", "chrf", "S2TT (en-xx) (CHRF++)")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
src/display/css_html_js.py CHANGED
@@ -124,6 +124,26 @@ custom_css = """
124
  #speech-leaderboard-Europe_E.visible {
125
  display: block;
126
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  """
128
 
129
  get_window_url_params = """
 
124
  #speech-leaderboard-Europe_E.visible {
125
  display: block;
126
  }
127
+
128
+ /* Target Gradio Dataframe headers with correct selectors */
129
+ #speech-comparison-table table thead th,
130
+ #text-comparison-table table thead th {
131
+ font-weight: 400 !important;
132
+ font-size: 14px !important;
133
+ padding: 6px !important;
134
+ }
135
+
136
+ /* Target with wrapper class */
137
+ #speech-comparison-table .table-wrap table thead th,
138
+ #text-comparison-table .table-wrap table thead th {
139
+ font-weight: 400 !important;
140
+ font-size: 14px !important;
141
+ }
142
+
143
+ /* Global table header override */
144
+ table thead th {
145
+ font-weight: 400 !important;
146
+ }
147
  """
148
 
149
  get_window_url_params = """
src/display/utils.py CHANGED
@@ -1,4 +1,4 @@
-from dataclasses import dataclass, make_dataclass
+from dataclasses import dataclass, make_dataclass, field
 from enum import Enum
 
 import pandas as pd
@@ -22,24 +22,36 @@ class ColumnContent:
     never_hidden: bool = False
 
 ## Leaderboard columns
+# Store column content instances separately
+auto_eval_column_content = {}
 auto_eval_column_dict = []
 # Init
-# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("Model Type", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+# auto_eval_column_content["model_type_symbol"] = ColumnContent("Model Type", "str", True, never_hidden=True)
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent])
+auto_eval_column_content["model"] = ColumnContent("Model", "markdown", True, never_hidden=True)
+auto_eval_column_dict.append(["model", ColumnContent])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️ (Class. Tasks)", "number", True)])
+auto_eval_column_content["average"] = ColumnContent("Average ⬆️", "number", True)
+auto_eval_column_dict.append(["average", ColumnContent])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_content[task.name] = ColumnContent(task.value.col_name, "number", True)
+    auto_eval_column_dict.append([task.name, ColumnContent])
 
 ### Speech leaderboard columns
+# Store column content instances separately
+auto_eval_column_content_speech = {}
 auto_eval_column_dict_speech = []
 # Init
-# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("Model Type", "str", True, never_hidden=True)])
-auto_eval_column_dict_speech.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+# auto_eval_column_content_speech["model_type_symbol"] = ColumnContent("Model Type", "str", True, never_hidden=True)
+# auto_eval_column_dict_speech.append(["model_type_symbol", ColumnContent])
+auto_eval_column_content_speech["model"] = ColumnContent("Model", "markdown", True, never_hidden=True)
+auto_eval_column_dict_speech.append(["model", ColumnContent])
 #Scores
-auto_eval_column_dict_speech.append(["average", ColumnContent, ColumnContent("Average ⬆️ (Class. Tasks)", "number", True)])
+auto_eval_column_content_speech["average"] = ColumnContent("Average ⬆️", "number", True)
+auto_eval_column_dict_speech.append(["average", ColumnContent])
 for task in SpeechTasks:
-    auto_eval_column_dict_speech.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_content_speech[task.name] = ColumnContent(task.value.col_name, "number", True)
+    auto_eval_column_dict_speech.append([task.name, ColumnContent])
 
 
 # Model information
@@ -55,10 +67,16 @@ for task in SpeechTasks:
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+# Set class attributes with the ColumnContent instances
+for col_name, col_content in auto_eval_column_content.items():
+    setattr(AutoEvalColumn, col_name, col_content)
+
 AutoEvalColumnSpeech = make_dataclass("AutoEvalColumnSpeech", auto_eval_column_dict_speech, frozen=True)
+# Set class attributes with the ColumnContent instances
+for col_name, col_content in auto_eval_column_content_speech.items():
+    setattr(AutoEvalColumnSpeech, col_name, col_content)
 
 ## For the queue columns in the submission tab
-@dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
     model = ColumnContent("model", "markdown", True)
     revision = ColumnContent("revision", "str", True)
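
The make_dataclass / setattr split above can look odd at first glance, so here is a minimal, self-contained sketch of the pattern (the names Col and DemoColumns are hypothetical stand-ins, not part of this commit): the field list passed to make_dataclass now carries only (name, type) pairs, and the follow-up loop attaches the real ColumnContent instances as class attributes so lookups such as AutoEvalColumn.average.name still resolve.

# Standalone sketch of the make_dataclass + setattr pattern (illustrative only).
from dataclasses import dataclass, make_dataclass

@dataclass
class Col:  # stand-in for ColumnContent
    name: str
    type: str
    displayed_by_default: bool

content = {
    "model": Col("Model", "markdown", True),
    "average": Col("Average ⬆️", "number", True),
}
fields = [(key, Col) for key in content]      # (name, type) pairs only, no defaults
DemoColumns = make_dataclass("DemoColumns", fields, frozen=True)
for key, col in content.items():
    # frozen=True only blocks mutation on instances, so class attributes
    # can still be assigned after the class is created
    setattr(DemoColumns, key, col)

print(DemoColumns.average.name)  # -> "Average ⬆️"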
src/leaderboard/read_evals.py CHANGED
@@ -129,20 +129,36 @@ class EvalResult:
 
     def to_dict(self, region=None, result_type='text'):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        # print(self.results)
         task_enum = Tasks if result_type == "text" else SpeechTasks
 
         results = self.results if region is None else self.regions.get(region, {})
-        acc_values = [
-            results[task.value.benchmark]
-            for task in task_enum
-            if task.value.metric == "acc" and task.value.benchmark in results
-        ]
-        # print(acc_values)
 
-        average = sum(acc_values) / len(acc_values) if acc_values else None
+        # Calculate average for ALL tasks (main)
+        all_scores = []
+        for task in task_enum:
+            if task.value.benchmark in results:
+                result_dict = results[task.value.benchmark]
+                # Handle nested dict structure (e.g., {'acc': 0.812})
+                if isinstance(result_dict, dict) and task.value.metric in result_dict:
+                    score = result_dict[task.value.metric]
+                    # Convert to percentage if needed
+                    if score <= 1:
+                        score = score * 100
+                    # Convert CER to 100-CER for average calculation
+                    if task.value.metric == "cer":
+                        score = 100 - score
+                    all_scores.append(score)
+                elif isinstance(result_dict, (int, float)):
+                    score = result_dict
+                    if score <= 1:
+                        score = score * 100
+                    # Convert CER to 100-CER for average calculation
+                    if task.value.metric == "cer":
+                        score = 100 - score
+                    all_scores.append(score)
+
+        average = sum(all_scores) / len(all_scores) if all_scores else None
 
-        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
            "eval_name": self.eval_name, # not a column, just a save name,
            # AutoEvalColumn.precision.name: self.precision.value.name,
@@ -161,9 +177,31 @@
 
         for task in task_enum:
             if task.value.benchmark in results:
-                data_dict[task.value.col_name] = results[task.value.benchmark]
+                result_dict = results[task.value.benchmark]
+                # Handle nested dict structure
+                if isinstance(result_dict, dict) and task.value.metric in result_dict:
+                    score = result_dict[task.value.metric]
+                    # Convert to percentage if needed
+                    if task.value.metric in ["acc", "chrf"] and score <= 1:
+                        score = score * 100
+                    elif task.value.metric == "cer":
+                        if score <= 1:
+                            score = score * 100
+                        # Convert CER to 100-CER
+                        score = 100 - score
+                    data_dict[task.value.col_name] = score
+                elif isinstance(result_dict, (int, float)):
+                    score = result_dict
+                    # Convert CER to 100-CER
+                    if task.value.metric == "cer":
+                        if score <= 1:
+                            score = score * 100
+                        score = 100 - score
+                    data_dict[task.value.col_name] = score
+                else:
+                    data_dict[task.value.col_name] = None
             else:
-                data_dict[task.value.col_name] = None # or np.nan if preferred
+                data_dict[task.value.col_name] = None
 
         return data_dict
 
@@ -217,8 +255,6 @@ def get_raw_eval_results(results_path: str, requests_path: str, result_type: str
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath,result_type)
-        # print('testing this one')
-        # print(eval_result)
         eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
@@ -231,11 +267,9 @@
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict()
             results.append(v)
-        except KeyError: # not all eval values present
+        except KeyError:
             continue
-        # print('results')
-        # print(results)
 
     return results
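
The score conversion added above is written out three times in this commit (twice in to_dict, once more in get_regional_comparison_df). As a hedged illustration only, not part of the commit, the per-score step could be factored into a single helper; the name normalize_score is hypothetical.

# Illustrative only: one place for the conversion the diff repeats inline.
# Fractions (<= 1) are scaled to percentages, and CER is flipped to 100 - CER
# so that every column reads as "higher is better".
def normalize_score(raw, metric: str):
    if isinstance(raw, dict):                  # nested form, e.g. {"acc": 0.812}
        raw = raw.get(metric)
    if not isinstance(raw, (int, float)):
        return None
    score = raw * 100 if raw <= 1 else raw     # treat small values as proportions
    if metric == "cer":                        # error rate -> accuracy-like score
        score = 100 - score
    return score

assert normalize_score({"acc": 0.5}, "acc") == 50.0
assert normalize_score(20.0, "cer") == 80.0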
src/populate.py CHANGED
@@ -4,17 +4,14 @@ import os
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model, has_at_least_one_benchmark
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, REGION_MAP
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, region=None, result_type="text") -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path, result_type=result_type)
-    # this here if region is none gets main results. I have to pass region value here to get region based results
-    # and they should come.
     all_data_json = [v.to_dict(region, result_type) for v in raw_data]
-    # print('all_data_json', all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
@@ -57,3 +54,91 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
     return df_finished[cols], df_running[cols], df_pending[cols]
+
+
+def get_regional_comparison_df(results_path: str, requests_path: str, selected_regions: list, benchmark_cols: list, result_type="text") -> pd.DataFrame:
+    """Creates a comparison dataframe"""
+    if not selected_regions or len(selected_regions) < 1:
+        return pd.DataFrame()
+
+    raw_data = get_raw_eval_results(results_path, requests_path, result_type=result_type)
+    from src.about import Tasks, SpeechTasks
+
+    task_enum = Tasks if result_type == "text" else SpeechTasks
+
+    region_short_names = {
+        "All": "All",
+        "Africa": "Africa",
+        "Americas/Oceania": "Americas/Oceania",
+        "Asia (S)": "Asia (S)",
+        "Asia (SE)": "Asia (SE)",
+        "Asia (W, C)": "Asia (W,C)",
+        "Asia (E)": "Asia (E)",
+        "Europe (W, N, S)": "Europe (W,N,S)",
+        "Europe (E)": "Europe (E)",
+    }
+
+    # Build comparison data
+    comparison_data = []
+    for eval_result in raw_data:
+        row = {
+            "Model": eval_result.full_model,
+        }
+
+        # Add average scores for each selected region
+        for region in selected_regions:
+            region_key = REGION_MAP.get(region, region)
+
+            # Get region results
+            if region_key == "All":
+                results = eval_result.results
+            else:
+                results = eval_result.regions.get(region_key, {}) if eval_result.regions else {}
+
+            # Calculate average across ALL tasks for comparison table
+            all_scores = []
+            for task in task_enum:
+                if task.value.benchmark in results:
+                    result_dict = results[task.value.benchmark]
+                    # Handle nested dict structure
+                    if isinstance(result_dict, dict) and task.value.metric in result_dict:
+                        score = result_dict[task.value.metric]
+                    elif isinstance(result_dict, (int, float)):
+                        score = result_dict
+                    else:
+                        continue
+
+                    if score is not None:
+                        # Convert to percentage if needed
+                        if task.value.metric in ["acc", "chrf"] and score <= 1:
+                            score = score * 100
+                        elif task.value.metric == "cer":
+                            if score <= 1:
+                                score = score * 100
+                            score = 100 - score  # Convert CER to accuracy-like metric
+                        all_scores.append(score)
+
+            # Calculate average of all tasks
+            avg_all_tasks = sum(all_scores) / len(all_scores) if all_scores else None
+
+            # Use shortened region name
+            short_name = region_short_names.get(region, region)
+            row[short_name] = avg_all_tasks
+
+        comparison_data.append(row)
+
+    if not comparison_data:
+        return pd.DataFrame()
+
+    df = pd.DataFrame(comparison_data)
+
+    # Sort by the first region's average score
+    first_region_short = region_short_names.get(selected_regions[0], selected_regions[0])
+    if first_region_short in df.columns:
+        df = df.sort_values(by=[first_region_short], ascending=False)
+
+    # Round numerical columns
+    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
+    df[numeric_cols] = df[numeric_cols].round(decimals=2)
+
+    return df
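
For a quick local sanity check of the new helper, a call mirroring what app.py does above might look like the sketch below. The env constants come from the repo's existing src.envs module; note that, as the function body above shows, benchmark_cols is accepted but not currently referenced inside the helper.

# Rough usage sketch mirroring app.py; assumes the repo's existing env constants.
from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH
from src.populate import get_regional_comparison_df

comparison_df = get_regional_comparison_df(
    EVAL_RESULTS_PATH,
    EVAL_REQUESTS_PATH,
    ["All", "Africa", "Asia (S)"],   # the commit's default selection
    benchmark_cols=[],               # present in the signature, unused in the body
    result_type="speech",
)
print(comparison_df.head())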
src/submission/submit.py CHANGED
@@ -41,13 +41,11 @@ def handle_csv_submission(
         path_or_fileobj=csv_save_path,
         path_in_repo=remote_path,
         repo_id=QUEUE_REPO,
-        repo_type="dataset", # or "model" if you made the repo that way
+        repo_type="dataset",
         commit_message=f"Add {result_type} request for {model_name} at {current_time}",
     )
 
-    # Remove the local file
     os.remove(csv_save_path)
-    # this converts dataframe to json and uploads it to results
 
 
     try:
@@ -83,7 +81,6 @@ def convert_csv_to_json_and_upload(df: pd.DataFrame, model_name: str, result_typ
         raise ValueError(f"Region '{region}' not found in REGION_MAP keys.")
 
     # --- Build JSON ---
-    # I go over the regions in the CSV and create a JSON object.
     model_json = {
        "config": {"model_name": model_name},
        "results": {},
@@ -100,7 +97,6 @@ def convert_csv_to_json_and_upload(df: pd.DataFrame, model_name: str, result_typ
            continue
        task = find_task_by_col_name(col, task_enum)
        if val is not None and not pd.isna(val) and isinstance(val, (int, float)):
-            print(f" value {val}")
            at_least_one_number = True
            model_json["results"][task.value.benchmark] = {task.value.metric: val/100}
        else:
@@ -112,8 +108,6 @@ def convert_csv_to_json_and_upload(df: pd.DataFrame, model_name: str, result_typ
        if val is not None and not pd.isna(val) and isinstance(val, (int, float)):
            model_json["regions"][REGION_MAP[region_display]][task.value.benchmark] = {task.value.metric: val/100}
 
-    # Check if at least one number is present in the results
-    print(at_least_one_number)
     if at_least_one_number is False:
         raise ValueError("No valid numeric results found in the CSV. Please check your input.")
 