Spaces:
Running
Running
File size: 5,290 Bytes
48c27bb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
from models import classifier, judge
from dotenv import load_dotenv
import logfire
# Load API keys (python-dotenv reads environment variables from a .env file)
# NOTE: this and the Logfire setup below run at import time, i.e. as soon as
# pytest collects this module.
load_dotenv()
# Setup Logfire
# We need send_to_logfire=True to capture traces under Pytest
# https://logfire.pydantic.dev/docs/reference/advanced/testing/
logfire.configure(send_to_logfire=True)
def classifier_logic(i):
    """
    Classify one fixed revision pair with both prompt styles and report agreement.

    The "heuristic" and "few-shot" classifiers are each run once on the same
    hard-coded old/new revision text, and their "noteworthy" values are
    reduced to four scenario flags.

    Args:
        i: Current iteration (for logging)

    Returns:
        Tuple of four booleans, in this order:
        (only_heuristic_true, only_few_shot_true, both_true, both_false)
    """
    old_revision = """Henry Purcell (/ˈpɜːrsəl/, rare: /pərˈsɛl/;[n 1] c. 10 September 1659[n 2] – 21 November 1695) was an English composer of Baroque music. He composed more than 100 songs, a tragic opera Dido and Aeneas, and wrote incidental music to a version of Shakespeare's A Midsummer Night's Dream called The Fairy Queen."""
    new_revision = """Henry Purcell (/ˈpɜːrsəl/, rare: /pərˈsɛl/;[n 1] c. 10 September 1659[n 2] – 21 November 1695) was an English composer and organist of the middle Baroque era. He composed more than 100 songs, a tragic opera Dido and Aeneas, and wrote incidental music to a version of Shakespeare's A Midsummer Night's Dream called The Fairy Queen."""

    with logfire.span("classifier_logic {i}", i=i):
        # Run both classifiers first (heuristic, then few-shot) ...
        responses = [
            classifier(old_revision, new_revision, prompt_style)
            for prompt_style in ("heuristic", "few-shot")
        ]
        # ... then reduce each response to a strict boolean: only a literal
        # True counts as "noteworthy".
        heuristic_flag, few_shot_flag = (
            response["noteworthy"] is True for response in responses
        )

        return (
            heuristic_flag and not few_shot_flag,
            few_shot_flag and not heuristic_flag,
            heuristic_flag and few_shot_flag,
            not (heuristic_flag or few_shot_flag),
        )
def judge_logic(i):
    """
    Run both aligned judges on one fixed revision pair.

    The two classifiers are run first; their rationales are then handed to
    the judge once in "aligned-fewshot" mode and once in "aligned-heuristic"
    mode.

    Args:
        i: Current iteration (for logging)

    Returns:
        The test condition: truthy only when both judges report
        "noteworthy" equal to False.
    """
    old_revision = """Kaman-Kalehöyük Archaeological Museum (Turkish: Kaman-Kalehöyük Arkeoloji Müzesi) is an archaeological museum in Kaman District of Kırşehir Province in Turkey. It exhibits artifacts of seven civilizations excavated in the nearby multi-period mound Kaman-Kalehöyük. It was opened in 2010. A Japanese garden is next to the museum building.[1][2]"""
    new_revision = """The Kaman-Kalehöyük Archaeological Museum (Turkish: Kaman-Kalehöyük Arkeoloji Müzesi) is an archaeological museum in Çağırkan, Kaman District, Kırşehir Province, Turkey. It exhibits artifacts of seven civilizations excavated in the nearby multi-period mound Kaman-Kalehöyük. It opened in 2010. A Japanese garden is next to the museum building.[1][2]"""

    with logfire.span("judge_logic {i}", i=i):
        # Classifier rationales are the judge's inputs (heuristic first,
        # few-shot second).
        heuristic = classifier(old_revision, new_revision, "heuristic")
        few_shot = classifier(old_revision, new_revision, "few-shot")
        rationales = (heuristic["rationale"], few_shot["rationale"])

        # Same inputs for both judges; only the alignment mode differs.
        judge_few_shot, judge_heuristic = [
            judge(old_revision, new_revision, *rationales, mode=judge_mode)
            for judge_mode in ("aligned-fewshot", "aligned-heuristic")
        ]

        # Test condition is True if aligned judges both give False
        return (
            judge_few_shot["noteworthy"] == False  # noqa: E712
            and judge_heuristic["noteworthy"] == False  # noqa: E712
        )
# pytest -vv test_models.py::test_classifier
def test_classifier():
    """
    Run classifier logic 5 times and compare outcomes.

    Passes only when, across the runs, the few-shot classifier returned True
    more often than the heuristic classifier AND the two classifiers
    disagreed more often than they agreed.
    """
    tries = 5
    with logfire.span("test_classifier"):
        outcomes = list(map(classifier_logic, range(tries)))

        # Transpose the per-run flag tuples and count each scenario.
        only_heuristic, only_few_shot, both_true, both_false = (
            sum(column) for column in zip(*outcomes)
        )

        few_shot_more_often = (only_few_shot + both_true) > (only_heuristic + both_true)
        disagree_more_than_agree = (only_heuristic + only_few_shot) > (both_true + both_false)

        # Explain every unmet condition before asserting.
        checks = (
            (
                few_shot_more_often,
                "Few-shot classifier did not return True more often than the heuristic classifier.",
            ),
            (
                disagree_more_than_agree,
                "Classifiers did not disagree more often than they agreed.",
            ),
        )
        for passed, message in checks:
            if not passed:
                print(message)

        assert few_shot_more_often and disagree_more_than_agree
# pytest -vv test_models.py::test_judge
def test_judge():
    """
    Run judge logic up to 5 times

    Stops at the first try whose result is True; the test passes if any try
    succeeded and fails if all of them were exhausted.
    """
    max_trys = 5
    with logfire.span("test_judge"):
        # judge_logic receives the 0-based index; messages use the 1-based try number.
        for try_number in range(1, max_trys + 1):
            result = judge_logic(try_number - 1)
            if result is True:
                print(f"Try {try_number} succeeded")
                break
            print(f"Try {try_number} failed")

        # The assert for pytest
        assert result is True
|