# DOoM-lb / src/columns.py
# (commit 0d67035: "Refactor project structure and update project version")
"""
Column definitions for DeathMath Leaderboard table.
"""
import logging
from dataclasses import dataclass, make_dataclass
from enum import Enum
# Configure the root logger at import time so all messages carry a timestamp and level.
# NOTE(review): basicConfig in a library module affects every importer's logging;
# consider moving this call to the application entry point — confirm no other
# module relies on it happening here.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
def fields(raw_class):
    """Return the values of all non-dunder attributes declared on *raw_class*.

    Iterates the class's own ``__dict__`` (inherited attributes are not
    included) and skips any name that starts or ends with ``__``.
    """
    values = []
    for attr_name, attr_value in raw_class.__dict__.items():
        # Skip dunder machinery such as __module__, __doc__, __dict__, ...
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        values.append(attr_value)
    return values
@dataclass
class Task:
    """Represents a benchmark task with its metrics."""
    benchmark: str  # benchmark identifier, e.g. "RussianMath"
    metric: str     # metric key used for scoring (every Tasks member uses "score")
    col_name: str   # column name under which this task's score appears in the table
class Tasks(Enum):
    """Available benchmark tasks for DeathMath leaderboard."""
    # Each member wraps a Task(benchmark, metric, col_name) triple.
    math = Task("RussianMath", "score", "math_score")
    physics = Task("RussianPhysics", "score", "physics_score")
    # "Combined" maps to the plain "score" column — presumably an aggregate of
    # math and physics; confirm against the scoring code.
    combined = Task("Combined", "score", "score")
@dataclass(frozen=True)
class ColumnContent:
    """Configuration for a leaderboard table column."""
    # NOTE(review): flag semantics below are inferred from the names and the
    # usages visible in this file — confirm against the table-rendering code.
    name: str                   # column header / key shown in the table
    type: str                   # display type hint, e.g. "markdown", "number", "str"
    displayed_by_default: bool  # whether the column is visible without user opt-in
    hidden: bool = False        # presumably: column exists but is hidden from the UI
    never_hidden: bool = False  # presumably: user cannot hide it (used for "model")
    dummy: bool = False         # presumably: placeholder column not backed by data
# Build the (attribute name, type, default) triples that make_dataclass consumes.
# Order matters: it determines the left-to-right column order of the table.
auto_eval_column_dict = [
    # Model name column: always visible and cannot be hidden.
    ["model", ColumnContent, ColumnContent("model", "markdown", True, never_hidden=True)],
    # Overall combined score, shown by default.
    ["score", ColumnContent, ColumnContent("score", "number", True)],
]

# One visible numeric column per individual task; the combined score already
# has its own dedicated "score" column above.
for task in Tasks:
    if task is Tasks.combined:
        continue
    auto_eval_column_dict.append(
        [task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]
    )

# Auxiliary columns, hidden by default.
for extra_name, extra_type in (
    ("total_tokens", "number"),
    ("evaluation_time", "number"),
    ("system_prompt", "str"),
):
    auto_eval_column_dict.append(
        [extra_name, ColumnContent, ColumnContent(extra_name, extra_type, False)]
    )

# Frozen dataclass whose class attributes describe every leaderboard column.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)