File size: 1,870 Bytes
0d67035
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
"""
Column definitions for DeathMath Leaderboard table.
"""

import logging
from dataclasses import dataclass, make_dataclass
from enum import Enum

# Import-time side effect: configure the root logger at INFO level with
# "timestamp - level - message" formatted records.
# NOTE(review): basicConfig does nothing when the root logger already has
# handlers, so the first module to call it wins — confirm this module is the
# intended place for logging setup.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


def fields(raw_class):
    """Collect the values of a class's own non-dunder attributes.

    Args:
        raw_class: Class whose ``__dict__`` is scanned; attributes inherited
            from base classes are not part of ``__dict__`` and are not returned.

    Returns:
        A list of attribute values in ``__dict__`` order. Any attribute whose
        name starts with or ends with a double underscore is skipped.
    """
    collected = []
    for attr_name, attr_value in raw_class.__dict__.items():
        # Skip names with a leading or a trailing "__" (either one disqualifies).
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        collected.append(attr_value)
    return collected


@dataclass
class Task:
    """Represents a benchmark task with its metrics.

    Instances are used as the member values of the ``Tasks`` enum in this module.
    """

    # Benchmark identifier, e.g. "RussianMath".
    benchmark: str
    # Metric key for the benchmark; every task declared in this module uses "score".
    metric: str
    # Name given to the task's leaderboard column, e.g. "math_score".
    col_name: str


class Tasks(Enum):
    """Available benchmark tasks for DeathMath leaderboard.

    Each member's value is a ``Task``; iterating the enum yields the members
    in declaration order.
    """

    # Per-subject tasks: each one gets its own numeric column when the column
    # specs are generated later in this module.
    math = Task("RussianMath", "score", "math_score")
    physics = Task("RussianPhysics", "score", "physics_score")
    # Skipped by the per-task column loop in this module; its col_name "score"
    # matches the standalone "score" column that is added explicitly.
    combined = Task("Combined", "score", "score")


@dataclass(frozen=True)
class ColumnContent:
    """Configuration for a leaderboard table column.

    The dataclass is frozen, so instances are immutable and hashable; this
    module relies on that when it passes instances as field defaults to
    ``make_dataclass``.
    """

    # Column name, e.g. "model" or "math_score".
    name: str
    # Value kind of the column; this module uses "markdown", "number" and "str".
    type: str
    # NOTE(review): the display flags below are only set, never read, in this
    # module — presumably the table/UI code consumes them; confirm there.
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False


# Field specs for AutoEvalColumn. Each entry is
# [attribute name, annotation, default ColumnContent instance].
auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("model", "markdown", True, never_hidden=True)],
    ["score", ColumnContent, ColumnContent("score", "number", True)],
]

# One numeric column per task; the combined task is already represented by the
# "score" entry above, so it is skipped here.
for task in Tasks:
    if task is Tasks.combined:
        continue
    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])

# Auxiliary columns that are not displayed by default.
auto_eval_column_dict += [
    ["total_tokens", ColumnContent, ColumnContent("total_tokens", "number", False)],
    ["evaluation_time", ColumnContent, ColumnContent("evaluation_time", "number", False)],
    ["system_prompt", ColumnContent, ColumnContent("system_prompt", "str", False)],
]

# Frozen dataclass whose class attributes are the ColumnContent defaults above.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)