Merge branch 'per/benchbench' into 'ibm/benchbench'
app.py
CHANGED

@@ -400,7 +400,7 @@ st.dataframe(
     column_order=cols_used,
     hide_index=True,
     use_container_width=True,
-    height=
+    height=500,
     column_config={col: {"alignment": "center"} for col in cols_used},
 )
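For context, a minimal standalone sketch of the call this hunk touches. The DataFrame, its sample rows, and the variable names other than those shown in the diff are illustrative assumptions, not the app's real data; the point is that a fixed height makes a long leaderboard scroll inside the widget instead of stretching the page.

import pandas as pd
import streamlit as st

# Illustrative data; the real app builds this table from its benchmark caches.
df = pd.DataFrame({
    "model": ["gpt_4o_2024_05_13", "mistral_v0_1_7b", "olmo_7b"],
    "score": [0.977, 0.624, 0.062],
})
cols_used = ["model", "score"]

st.dataframe(
    df,
    column_order=cols_used,
    hide_index=True,
    use_container_width=True,
    height=500,  # fixed pixel height introduced by this commit
    column_config={col: {"alignment": "center"} for col in cols_used},
)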
cache/aggregate_scoress_cache_5e66a88dab42480065db47711c55c458.csv
CHANGED

@@ -1,122 +1,138 @@
 model,score
-gpt_4o_2024_05_13,0.
+gpt_4o_2024_05_13,0.9767482517482518
+chatgpt_4o_latest,0.9754079254079254
-gpt_4o_2024_08_06,0.
+gpt_4o_2024_08_06,0.9652680652680652
+claude_3_5_sonnet_20240620,0.9572649572649573
+gemini_1_5_pro_exp_0801,0.9545454545454546
+llama3_1_70b_instruct,0.9343074620852398
+gpt_4_turbo_2024_04_09,0.9055819180819181
+claude_3_opus_20240229,0.8824397824397824
+yi_large_preview,0.8714202464202464
+llama3_1_405b_instruct,0.8598484848484849
+gpt_4_0125_preview,0.8492118992118992
+hermes_3_llama3_1_70b,0.8451178451178452
+zephyr_orpo_141b_a35b_v0_1,0.8414055080721747
+mistral_large_2407,0.8375291375291375
+gpt_4o_mini_2024_07_18,0.8348776223776224
+claude_2_0,0.8333333333333334
+smaug_qwen2_72b_instruct,0.8331088664421997
+gemini_1_5_pro_api_0514,0.8294871794871794
+llama3_70b_instruct,0.8172801478357034
+llama3_70b,0.8129154795821463
+gemma_2_9b_it_dpo,0.8100649350649352
+llama3_instruct_8b_simpo,0.7992424242424242
+yi_large,0.7889194139194139
+gemma_2_27b_it,0.776345259678593
+qwen2_72b_instruct,0.7701936951936953
-qwen1_5_32b,0.
+qwen1_5_32b,0.7678062678062678
+gpt_4_0613,0.7641802641802643
+phi_3_5_moe_instruct,0.7600448933782267
+qwen1_5_110b_chat,0.7419770353103686
+mixtral_8x22b_v0_1,0.7382154882154882
+gemma_2_9b_it_simpo,0.7328042328042329
+gemini_pro,0.7298951048951049
+llama_2_70b,0.7293447293447294
+gemini_1_5_flash_api_0514,0.7263403263403263
+yi_34b,0.7188983855650521
+deepseek_coder_v2,0.713053613053613
+nous_hermes_2_mixtral_8x7b_dpo,0.7094017094017094
+gpt_3_5_turbo_0613,0.6851851851851851
+claude_2_1,0.6693861693861693
+yi_1_5_34b_chat,0.6669566544566544
+mistral_medium,0.657051282051282
+phi_3_small_128k_instruct,0.6561167227833894
+infinity_instruct_3m_0625_llama3_8b,0.6537598204264872
+claude_instant_1_2,0.6486013986013985
 mistral_v0_1_7b,0.6239316239316239
+command_r_plus,0.6183108558108558
+phi_3_5_mini_instruct,0.6103254769921437
+llama3_1_8b_instruct,0.6080822469711359
+gemma_2_9b_it,0.6048877048877048
+yi_1_5_9b_chat,0.6041446208112875
+claude_3_sonnet_20240229,0.5985236985236985
+mixtral_8x22b_instruct_v0_1,0.585565052231719
+qwen1_5_14b,0.5797720797720798
-llama_65b,0.
+llama_65b,0.5759734093067427
+deepseek_llm_67b_chat,0.5734841290396846
+qwen1_5_32b_chat,0.571383349161127
+wizardlm_70b,0.5620629370629371
+yi_34b_chat,0.5558361391694725
+qwen1_5_72b_chat,0.5463669663669664
-dbrx_instructruct,0.5344129554655871
+dbrx_instructruct,0.5379867046533713
 jurassic_2_jumbo_178b,0.532051282051282
+mixtral_8x7b_v0_1,0.5310044893378227
+openchat_3_5,0.5270655270655271
+mistral_large_2402,0.5105672105672105
+solar_10_7b_instruct_v1_0,0.5030864197530864
+qwen2_7b_instruct,0.4970445192667415
+phi_3_medium_4k_instruct,0.48541540763762986
+dolphin_2_2_1_mistral_7b,0.4810606060606061
+mistral_small_2402,0.47785547785547783
+glm_4_9b_chat,0.4769547325102881
+dbrx_instruct,0.4724025974025974
+qwen1_5_14b_chat,0.45340153673487005
+claude_3_haiku_20240307,0.44965034965034967
+gemma_7b,0.4477682811016144
+llama3_8b_instruct,0.4449662477440255
+llama3_8b,0.4368471035137702
+wizardlm_13b,0.42773892773892774
+starling_lm_7b_alpha,0.42734323289878845
-jurassic_2_grande_17b,0.
+jurassic_2_grande_17b,0.4230769230769231
+mistral_7b_v0_3,0.4228395061728395
+llama_2_13b,0.4146881924659702
+llama_2_70b_chat,0.412732329398996
+phi_3_mini_4k_instruct,0.4048663270885493
+openhermes_2_5_mistral_7b,0.40103708020374684
+llama_2_13b_chat,0.38675213675213677
+guanaco_33b,0.38374125874125875
+phi_3_mini_128k_instruct,0.3778468445135112
-mistral_7b_v0_2,0.
+mistral_7b_v0_2,0.3773849607182941
+internlm2_chat_20b,0.37196969696969695
+starling_lm_7b_beta,0.3611888111888112
+gpt_3_5_turbo_0125,0.3591242091242091
+tulu_2_dpo_70b,0.3585164835164835
+qwen1_5_7b,0.35185185185185186
+falcon_40b,0.3502690724912947
+yi_1_5_6b_chat,0.33974132863021755
+zephyr_7b_alpha,0.33875830959164294
+command_r,0.3296911421911422
+luminous_supreme_70b,0.32905982905982906
+yi_6b,0.295346628679962
+zephyr_7b_beta,0.28937667271000606
+mixtral_8x7b_instruct_v0_1,0.284326167659501
+qwen_14b_chat,0.2837995337995338
+gemma_2_2b_it,0.28113553113553114
+phi_3_small_8k_instruct,0.27051282051282055
+gemma_1_1_7b_it,0.263927019482575
+llama_2_7b,0.25466919911364355
+mistral_7b_instruct_v0_2,0.250669392336059
+mistral_7b_instruct_v0_3,0.24534231200897869
+qwen1_5_7b_chat,0.24214088380755047
+alpaca_7b,0.23484848484848483
 luminous_extended_30b,0.2329059829059829
+llama_13b,0.2222222222222222
+phi_2,0.19812080923192033
+qwen2_1_5b_instruct,0.1968574635241302
+yi_6b_chat,0.19393939393939394
+vicuna_7b,0.1885198135198135
+gemma_7b_it,0.18790982679871568
+olmo_7b_instruct,0.15669515669515668
+vicuna_7b_v1_5,0.15454545454545454
+vicuna_13b,0.14714452214452214
-gpt_neox_20b,0.
+gpt_neox_20b,0.1419753086419753
+falcon_40b_instruct,0.13187429854096522
+qwen1_5_4b_chat,0.12542806987251431
+falcon_7b,0.11380183602405824
+llama_2_7b_chat,0.1122679789346456
-gpt_j_6b,0.
+gpt_j_6b,0.09876543209876543
 luminous_base_13b,0.08333333333333333
+gemma_2b_it,0.08119658119658119
-gemma_1_1_2b_it,0.
+gemma_1_1_2b_it,0.07454890788224121
-olmo_7b,0.
+olmo_7b,0.06220322886989553
+qwen1_5_1_8b_chat,0.05544332210998878
+qwen2_0_5b_instruct,0.055218855218855216
+pythia_12b,0.05246913580246913
+chatglm2_6b,0.029137529137529136
-pythia_6_9b,0.
+pythia_6_9b,0.018518518518518517
+qwen1_5_0_5b_chat,0.012345679012345678
+falcon_7b_instruct,0.011363636363636364
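The regenerated cache is a plain two-column CSV (model, score), so it can be inspected directly. A small sketch of reading it back with pandas follows; the reading code is an assumption for inspection only, not necessarily how the app consumes its caches.

import pandas as pd

# Path taken from this commit's diff.
scores = pd.read_csv("cache/aggregate_scoress_cache_5e66a88dab42480065db47711c55c458.csv")

# The file is already written in descending score order; sorting just makes that explicit.
print(scores.sort_values("score", ascending=False).head(10).to_string(index=False))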
cache/agreements_cache_5e66a88dab42480065db47711c55c458.csv
CHANGED

The diff for this file is too large to render. See raw diff.

cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv
CHANGED

The diff for this file is too large to render. See raw diff.

requirements.txt
CHANGED

@@ -1,4 +1,4 @@
-git+https://github.com/ibm/benchbench@
+git+https://github.com/ibm/benchbench@08c7757323d565b70d024d82b193861b406ddf9d

 altair==5.4.1
 attrs==24.2.0

@@ -51,4 +51,5 @@ toml==0.10.2
 tornado==6.4.1
 typing_extensions==4.12.2
 tzdata==2024.1
-urllib3==2.2.2
+urllib3==2.2.2
+tqdm