Commit 0a42e99 · committed by Joschka Strueber
[Add] add bbh and gpqa benchmarks again with correct answer_index selection
Parent(s): 4077e51
Files changed:
- app.py +1 -1
- src/app_util.py +2 -2
- src/dataloading.py +46 -16
- src/utils.py +14 -1
app.py CHANGED

@@ -21,7 +21,7 @@ metric_init = "CAPA"
 with gr.Blocks(title="LLM Similarity Analyzer", css=app_util.custom_css) as demo:
     gr.Markdown("# Model Similarity Comparison Tool")
     gr.Markdown(links_markdown)
-    gr.Markdown('This is
+    gr.Markdown('This is an interactive demo for the recent publication "[Great Models Think Alike and this Undermines AI Oversight](https://huggingface.co/papers/2502.04313)." You can compare the functional similarity of hundreds of Language Models on the Open LLM Leaderboard v2 benchmark datasets.')
 
     with gr.Row():
         dataset_dropdown = gr.Dropdown(
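For orientation, the context lines above come from the demo's Gradio layout. A minimal, self-contained sketch of that layout (the description string and dropdown choices below are placeholders, not the app's actual configuration):

```python
import gradio as gr

# Minimal sketch of the layout around the edited Markdown line.
# The description string and dropdown choices are placeholders,
# not the app's real configuration.
with gr.Blocks(title="LLM Similarity Analyzer") as demo:
    gr.Markdown("# Model Similarity Comparison Tool")
    gr.Markdown("Interactive demo description goes here.")

    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=["bbh_navigate", "gpqa_main", "mmlu_pro"],  # placeholder choices
            value="mmlu_pro",
            label="Benchmark dataset",
        )

if __name__ == "__main__":
    demo.launch()
```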
src/app_util.py CHANGED

@@ -90,9 +90,9 @@ def update_datasets_based_on_models(selected_models, current_dataset):
 
 custom_css = """
 .image-container img {
-    width:
+    width: 80% !important; /* Make it 80% of the parent container */
     height: auto !important; /* Maintain aspect ratio */
-    max-width:
+    max-width: 800px; /* Optional: Set a max limit */
     display: block;
     margin: auto; /* Center the image */
 }
src/dataloading.py CHANGED

@@ -4,6 +4,7 @@ from huggingface_hub import HfApi
 
 from functools import lru_cache
 
+from utils import opt_in_pars_to_index, get_test_target
 
 def get_leaderboard_models_reload():
     api = HfApi()
@@ -69,7 +70,7 @@ def get_leaderboard_models_cached():
 
 def get_leaderboard_datasets(model_ids):
     if model_ids is None:
-        return ['bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding', 'bbh_disambiguation_qa', 'bbh_formal_fallacies', 'bbh_geometric_shapes', 'bbh_hyperbaton', 'bbh_logical_deduction_five_objects', 'bbh_logical_deduction_seven_objects', 'bbh_logical_deduction_three_objects', 'bbh_movie_recommendation', 'bbh_navigate', 'bbh_object_counting', 'bbh_penguins_in_a_table', 'bbh_reasoning_about_colored_objects', 'bbh_ruin_names', 'bbh_salient_translation_error_detection', 'bbh_snarks', 'bbh_sports_understanding', 'bbh_temporal_sequences', 'bbh_tracking_shuffled_objects_five_objects', 'bbh_tracking_shuffled_objects_seven_objects', 'bbh_tracking_shuffled_objects_three_objects', 'bbh_web_of_lies', 'gpqa_diamond', 'gpqa_extended', 'gpqa_main', '
+        return ['bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding', 'bbh_disambiguation_qa', 'bbh_formal_fallacies', 'bbh_geometric_shapes', 'bbh_hyperbaton', 'bbh_logical_deduction_five_objects', 'bbh_logical_deduction_seven_objects', 'bbh_logical_deduction_three_objects', 'bbh_movie_recommendation', 'bbh_navigate', 'bbh_object_counting', 'bbh_penguins_in_a_table', 'bbh_reasoning_about_colored_objects', 'bbh_ruin_names', 'bbh_salient_translation_error_detection', 'bbh_snarks', 'bbh_sports_understanding', 'bbh_temporal_sequences', 'bbh_tracking_shuffled_objects_five_objects', 'bbh_tracking_shuffled_objects_seven_objects', 'bbh_tracking_shuffled_objects_three_objects', 'bbh_web_of_lies', 'gpqa_diamond', 'gpqa_extended', 'gpqa_main', 'mmlu_pro', 'musr_murder_mysteries', 'musr_object_placements', 'musr_team_allocation']
 
     # Map each model to its corresponding leaderboard version
     leaderboard_model_ids = [f"open-llm-leaderboard/{model_id.replace('/', '__')}-details" for model_id in model_ids]
@@ -87,7 +88,7 @@ def get_leaderboard_datasets(model_ids):
     common_datasets = set.intersection(*model_datasets.values())
 
     # Filter datasets that are not MCQ or currently do not work
-    ignore = ["
+    ignore = ["math_", "ifeval"]
     discard = []
     for dataset in common_datasets:
         for ignore_data in ignore:
@@ -98,22 +99,56 @@ def get_leaderboard_datasets(model_ids):
     return sorted(common_datasets)
 
 
-def filter_labels(doc):
+def filter_labels(dataset_name, doc):
     labels = []
+    test_target, target_key = get_test_target(doc[0])
     if "answer_index" in doc[0].keys():
         for d in doc:
             labels.append(d["answer_index"])
-        for d in doc:
-            ...
-            labels.append(0)
-            ...
-            labels.append(1)
+    elif test_target.startswith("("):
+        for d in doc:
+            labels.append(opt_in_pars_to_index(d[target_key]))
+    elif dataset_name in ["bbh_boolean_expressions"]:
+        for d in doc:
+            if d[target_key] == "True":
+                labels.append(1)
+            elif d[target_key] == "False":
+                labels.append(0)
+    elif dataset_name in ["bbh_causal_judgement", "bbh_navigate", "bbh_web_of_lies"]:
+        for d in doc:
+            if d[target_key] == "Yes":
+                labels.append(0)
+            elif d[target_key] == "No":
+                labels.append(1)
+    elif dataset_name in ["bbh_formal_fallacies"]:
+        for d in doc:
+            if d[target_key] == "valid":
+                labels.append(0)
+            elif d[target_key] == "invalid":
+                labels.append(1)
+    elif dataset_name in ["bbh_sports_understanding"]:
+        for d in doc:
+            if d[target_key] == "yes":
+                labels.append(0)
+            elif d[target_key] == "no":
+                labels.append(1)
+    elif test_target.isdigit():
+        for d in doc:
+            labels.append(int(d[target_key]))
+
     return labels
 
 
+def filter_responses(data):
+    # Get log probabilities for each response
+    log_probs = []
+
+    for resp in data["filtered_resps"]:
+        log_prob = np.array([float(option[0]) for option in resp])
+        log_probs.append(log_prob)
+
+    return log_probs
+
 
 def load_run_data(model_name, dataset_name):
     try:
@@ -125,14 +160,9 @@ def load_run_data(model_name, dataset_name):
         data = data.sort("doc_id")
         data = data.to_dict()
 
-        # Get log probabilities for each response
-        log_probs = []
-        for resp in data["filtered_resps"]:
-            log_prob = np.array([float(option[0]) for option in resp])
-            log_probs.append(log_prob)
-
-        # Get ground truth labels
-        labels = filter_labels(data["doc"])
+        # Get ground truth labels and logits
+        log_probs = filter_responses(data)
+        labels = filter_labels(dataset_name, data["doc"])
 
     except Exception as e:
         print(e)
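The net effect of the new `filter_labels`/`filter_responses` pair is that every benchmark row yields a ground-truth option index plus a vector of per-option log-probabilities, whether the row stores `answer_index`, a parenthesized option letter, or a yes/no style target. A toy sketch of how those outputs line up downstream (the rows and log-probabilities below are invented, not real Open LLM Leaderboard records):

```python
import numpy as np

# Invented example rows, mirroring the target formats handled in filter_labels.
docs = [
    {"answer_index": 2},   # MMLU-Pro / GPQA style: index stored directly
    {"target": "(B)"},     # BBH multiple choice: "(B)" -> index 1 via opt_in_pars_to_index
    {"target": "No"},      # BBH yes/no tasks (e.g. bbh_navigate): "No" -> 1
]
labels = [2, 1, 1]         # what filter_labels would produce for these rows

# Invented per-option log-probabilities, shaped like filter_responses output.
log_probs = [
    np.array([-2.3, -0.4, -0.1, -1.7]),
    np.array([-1.2, -0.3]),
    np.array([-0.9, -0.5]),
]

# Downstream, a model's choice per question is the argmax over options,
# which can then be compared against the ground-truth labels.
preds = [int(np.argmax(lp)) for lp in log_probs]
print(preds)                               # [2, 1, 1]
print(np.mean(np.array(preds) == labels))  # 1.0
```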
src/utils.py CHANGED

@@ -8,4 +8,17 @@ def softmax(logits: np.ndarray) -> np.ndarray:
 def one_hot(probs: np.array) -> np.array:
     one_hot = np.zeros_like(probs)
     one_hot[np.argmax(probs)] = 1
-    return one_hot
+    return one_hot
+
+def opt_in_pars_to_index(s):
+    if s.startswith("(") and s.endswith(")"):
+        letter = s[1]  # Extract the letter inside the parentheses
+        return ord(letter) - ord("A")  # Convert to zero-based index
+    else:
+        raise ValueError("Invalid format")
+
+def get_test_target(doc):
+    if "target" in doc:
+        return doc["target"], "target"
+    elif "answer" in doc:
+        return doc["answer"], "answer"