Commit 0a42e99 · committed by Joschka Strueber
[Add] add bbh and gpqa benchmarks again with correct answer_index selection
Parent(s): 4077e51
Files changed:
- app.py +1 -1
- src/app_util.py +2 -2
- src/dataloading.py +46 -16
- src/utils.py +14 -1
app.py CHANGED

@@ -21,7 +21,7 @@ metric_init = "CAPA"
 with gr.Blocks(title="LLM Similarity Analyzer", css=app_util.custom_css) as demo:
     gr.Markdown("# Model Similarity Comparison Tool")
     gr.Markdown(links_markdown)
-    gr.Markdown('This is
+    gr.Markdown('This is an interactive demo for the recent publication "[Great Models Think Alike and this Undermines AI Oversight](https://huggingface.co/papers/2502.04313)." You can compare the functional similarity of hundreds of Language Models on the Open LLM Leaderboard v2 benchmark datasets.')
 
     with gr.Row():
         dataset_dropdown = gr.Dropdown(
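For orientation, the context lines above come from the demo's Gradio layout. A minimal, self-contained sketch of that layout (the description string and dropdown choices below are placeholders, not the app's actual configuration):

```python
import gradio as gr

# Minimal sketch of the layout around the edited Markdown line.
# The description string and dropdown choices are placeholders,
# not the app's real configuration.
with gr.Blocks(title="LLM Similarity Analyzer") as demo:
    gr.Markdown("# Model Similarity Comparison Tool")
    gr.Markdown("Interactive demo description goes here.")

    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=["bbh_navigate", "gpqa_main", "mmlu_pro"],  # placeholder choices
            value="mmlu_pro",
            label="Benchmark dataset",
        )

if __name__ == "__main__":
    demo.launch()
```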
src/app_util.py CHANGED

@@ -90,9 +90,9 @@ def update_datasets_based_on_models(selected_models, current_dataset):
 
 custom_css = """
 .image-container img {
-    width:
+    width: 80% !important; /* Make it 80% of the parent container */
     height: auto !important; /* Maintain aspect ratio */
-    max-width:
+    max-width: 800px; /* Optional: Set a max limit */
     display: block;
     margin: auto; /* Center the image */
 }
src/dataloading.py CHANGED

@@ -4,6 +4,7 @@ from huggingface_hub import HfApi
 
 from functools import lru_cache
 
+from utils import opt_in_pars_to_index, get_test_target
 
 def get_leaderboard_models_reload():
     api = HfApi()
@@ -69,7 +70,7 @@ def get_leaderboard_models_cached():
 
 def get_leaderboard_datasets(model_ids):
     if model_ids is None:
-        return ['bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding', 'bbh_disambiguation_qa', 'bbh_formal_fallacies', 'bbh_geometric_shapes', 'bbh_hyperbaton', 'bbh_logical_deduction_five_objects', 'bbh_logical_deduction_seven_objects', 'bbh_logical_deduction_three_objects', 'bbh_movie_recommendation', 'bbh_navigate', 'bbh_object_counting', 'bbh_penguins_in_a_table', 'bbh_reasoning_about_colored_objects', 'bbh_ruin_names', 'bbh_salient_translation_error_detection', 'bbh_snarks', 'bbh_sports_understanding', 'bbh_temporal_sequences', 'bbh_tracking_shuffled_objects_five_objects', 'bbh_tracking_shuffled_objects_seven_objects', 'bbh_tracking_shuffled_objects_three_objects', 'bbh_web_of_lies', 'gpqa_diamond', 'gpqa_extended', 'gpqa_main', '
+        return ['bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding', 'bbh_disambiguation_qa', 'bbh_formal_fallacies', 'bbh_geometric_shapes', 'bbh_hyperbaton', 'bbh_logical_deduction_five_objects', 'bbh_logical_deduction_seven_objects', 'bbh_logical_deduction_three_objects', 'bbh_movie_recommendation', 'bbh_navigate', 'bbh_object_counting', 'bbh_penguins_in_a_table', 'bbh_reasoning_about_colored_objects', 'bbh_ruin_names', 'bbh_salient_translation_error_detection', 'bbh_snarks', 'bbh_sports_understanding', 'bbh_temporal_sequences', 'bbh_tracking_shuffled_objects_five_objects', 'bbh_tracking_shuffled_objects_seven_objects', 'bbh_tracking_shuffled_objects_three_objects', 'bbh_web_of_lies', 'gpqa_diamond', 'gpqa_extended', 'gpqa_main', 'mmlu_pro', 'musr_murder_mysteries', 'musr_object_placements', 'musr_team_allocation']
 
     # Map each model to its corresponding leaderboard version
     leaderboard_model_ids = [f"open-llm-leaderboard/{model_id.replace('/', '__')}-details" for model_id in model_ids]
@@ -87,7 +88,7 @@ def get_leaderboard_datasets(model_ids):
     common_datasets = set.intersection(*model_datasets.values())
 
     # Filter datasets that are not MCQ or currently do not work
-    ignore = ["
+    ignore = ["math_", "ifeval"]
     discard = []
     for dataset in common_datasets:
         for ignore_data in ignore:
@@ -98,22 +99,56 @@ def get_leaderboard_datasets(model_ids):
     return sorted(common_datasets)
 
 
-def filter_labels(doc):
+def filter_labels(dataset_name, doc):
     labels = []
+    test_target, target_key = get_test_target(doc[0])
     if "answer_index" in doc[0].keys():
         for d in doc:
             labels.append(d["answer_index"])
-        for d in doc:
-            ...
-            labels.append(0)
-            ...
-            labels.append(1)
+    elif test_target.startswith("("):
+        for d in doc:
+            labels.append(opt_in_pars_to_index(d[target_key]))
+    elif dataset_name in ["bbh_boolean_expressions"]:
+        for d in doc:
+            if d[target_key] == "True":
+                labels.append(1)
+            elif d[target_key] == "False":
+                labels.append(0)
+    elif dataset_name in ["bbh_causal_judgement", "bbh_navigate", "bbh_web_of_lies"]:
+        for d in doc:
+            if d[target_key] == "Yes":
+                labels.append(0)
+            elif d[target_key] == "No":
+                labels.append(1)
+    elif dataset_name in ["bbh_formal_fallacies"]:
+        for d in doc:
+            if d[target_key] == "valid":
+                labels.append(0)
+            elif d[target_key] == "invalid":
+                labels.append(1)
+    elif dataset_name in ["bbh_sports_understanding"]:
+        for d in doc:
+            if d[target_key] == "yes":
+                labels.append(0)
+            elif d[target_key] == "no":
+                labels.append(1)
+    elif test_target.isdigit():
+        for d in doc:
+            labels.append(int(d[target_key]))
+
     return labels
 
 
+def filter_responses(data):
+    # Get log probabilities for each response
+    log_probs = []
+
+    for resp in data["filtered_resps"]:
+        log_prob = np.array([float(option[0]) for option in resp])
+        log_probs.append(log_prob)
+
+    return log_probs
+
 
 def load_run_data(model_name, dataset_name):
     try:
@@ -125,14 +160,9 @@ def load_run_data(model_name, dataset_name):
         data = data.sort("doc_id")
         data = data.to_dict()
 
-        # Get log probabilities for each response
-        log_probs = []
-        for resp in data["filtered_resps"]:
-            log_prob = np.array([float(option[0]) for option in resp])
-            log_probs.append(log_prob)
-
-        # Get ground truth labels
-        labels = filter_labels(data["doc"])
+        # Get ground truth labels and logits
+        log_probs = filter_responses(data)
+        labels = filter_labels(dataset_name, data["doc"])
 
     except Exception as e:
         print(e)
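The net effect of the new `filter_labels`/`filter_responses` pair is that every benchmark row yields a ground-truth option index plus a vector of per-option log-probabilities, whether the row stores `answer_index`, a parenthesized option letter, or a yes/no style target. A toy sketch of how those outputs line up downstream (the rows and log-probabilities below are invented, not real Open LLM Leaderboard records):

```python
import numpy as np

# Invented example rows, mirroring the target formats handled in filter_labels.
docs = [
    {"answer_index": 2},   # MMLU-Pro / GPQA style: index stored directly
    {"target": "(B)"},     # BBH multiple choice: "(B)" -> index 1 via opt_in_pars_to_index
    {"target": "No"},      # BBH yes/no tasks (e.g. bbh_navigate): "No" -> 1
]
labels = [2, 1, 1]         # what filter_labels would produce for these rows

# Invented per-option log-probabilities, shaped like filter_responses output.
log_probs = [
    np.array([-2.3, -0.4, -0.1, -1.7]),
    np.array([-1.2, -0.3]),
    np.array([-0.9, -0.5]),
]

# Downstream, a model's choice per question is the argmax over options,
# which can then be compared against the ground-truth labels.
preds = [int(np.argmax(lp)) for lp in log_probs]
print(preds)                               # [2, 1, 1]
print(np.mean(np.array(preds) == labels))  # 1.0
```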
src/utils.py CHANGED

@@ -8,4 +8,17 @@ def softmax(logits: np.ndarray) -> np.ndarray:
 def one_hot(probs: np.array) -> np.array:
     one_hot = np.zeros_like(probs)
     one_hot[np.argmax(probs)] = 1
-    return one_hot
+    return one_hot
+
+def opt_in_pars_to_index(s):
+    if s.startswith("(") and s.endswith(")"):
+        letter = s[1]  # Extract the letter inside the parentheses
+        return ord(letter) - ord("A")  # Convert to zero-based index
+    else:
+        raise ValueError("Invalid format")
+
+def get_test_target(doc):
+    if "target" in doc:
+        return doc["target"], "target"
+    elif "answer" in doc:
+        return doc["answer"], "answer"