File size: 5,290 Bytes
48c27bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from models import classifier, judge
from dotenv import load_dotenv
import logfire


# Load API keys (via python-dotenv) before any model code runs
load_dotenv()
# Set up Logfire.
# send_to_logfire=True is needed to capture traces when running under pytest:
# https://logfire.pydantic.dev/docs/reference/advanced/testing/
logfire.configure(send_to_logfire=True)


def classifier_logic(i):
    """
    Classify one fixed revision pair with both classifiers and report how they compare.

    The heuristic and few-shot classifiers are each run once on the same
    old/new text. A classifier counts as "true" only when its "noteworthy"
    value is exactly True.

    Args:
        i: Current iteration (for logging)

    Returns:
        A 4-tuple of booleans:
        (only heuristic true, only few-shot true, both true, both false).
        Exactly one element is True.

    """

    before = """Henry Purcell (/ˈpɜːrsəl/, rare: /pərˈsɛl/;[n 1] c. 10 September 1659[n 2] – 21 November 1695) was an English composer of Baroque music.  He composed more than 100 songs, a tragic opera Dido and Aeneas, and wrote incidental music to a version of Shakespeare's A Midsummer Night's Dream called The Fairy Queen."""

    after = """Henry Purcell (/ˈpɜːrsəl/, rare: /pərˈsɛl/;[n 1] c. 10 September 1659[n 2] – 21 November 1695) was an English composer and organist of the middle Baroque era.  He composed more than 100 songs, a tragic opera Dido and Aeneas, and wrote incidental music to a version of Shakespeare's A Midsummer Night's Dream called The Fairy Queen."""

    with logfire.span("classifier_logic {i}", i=i):
        # Both model calls happen inside the span so they are traced together
        heuristic_result = classifier(before, after, "heuristic")
        few_shot_result = classifier(before, after, "few-shot")
        heuristic_yes = heuristic_result["noteworthy"] is True
        few_shot_yes = few_shot_result["noteworthy"] is True

    # The four mutually exclusive scenarios, in the order callers index them
    return (
        heuristic_yes and not few_shot_yes,
        few_shot_yes and not heuristic_yes,
        heuristic_yes and few_shot_yes,
        not (heuristic_yes or few_shot_yes),
    )


def judge_logic(i):
    """
    Return whether both aligned judges rate a fixed revision pair as not noteworthy.

    The heuristic and few-shot classifiers are run on the same old/new text,
    and their rationales are then passed to the judge twice: once in
    "aligned-fewshot" mode and once in "aligned-heuristic" mode.

    Args:
        i: Current iteration (for logging)

    Returns:
        True if the "noteworthy" value of both judge outputs is exactly
        False, otherwise False.

    """

    old_revision = """Kaman-Kalehöyük Archaeological Museum (Turkish: Kaman-Kalehöyük Arkeoloji Müzesi) is an archaeological museum in Kaman District of Kırşehir Province in Turkey. It exhibits artifacts of seven civilizations excavated in the nearby multi-period mound Kaman-Kalehöyük. It was opened in 2010. A Japanese garden is next to the museum building.[1][2]"""

    new_revision = """The Kaman-Kalehöyük Archaeological Museum (Turkish: Kaman-Kalehöyük Arkeoloji Müzesi) is an archaeological museum in Çağırkan, Kaman District, Kırşehir Province, Turkey. It exhibits artifacts of seven civilizations excavated in the nearby multi-period mound Kaman-Kalehöyük. It opened in 2010. A Japanese garden is next to the museum building.[1][2]"""

    with logfire.span("judge_logic {i}", i=i):
        # The classifiers supply the rationales that the judge compares
        heuristic = classifier(old_revision, new_revision, "heuristic")
        few_shot = classifier(old_revision, new_revision, "few-shot")
        judge_few_shot = judge(
            old_revision,
            new_revision,
            heuristic["rationale"],
            few_shot["rationale"],
            mode="aligned-fewshot",
        )
        judge_heuristic = judge(
            old_revision,
            new_revision,
            heuristic["rationale"],
            few_shot["rationale"],
            mode="aligned-heuristic",
        )

    # Test condition is True if aligned judges both give False.
    # Identity checks (not `== False`) match the `is True` checks used for the
    # classifier outputs, so a missing/None verdict never counts as False.
    judge_condition = (
        judge_few_shot["noteworthy"] is False
        and judge_heuristic["noteworthy"] is False
    )

    return judge_condition


# pytest -vv test_models.py::test_classifier
def test_classifier():
    """Run the classifier scenario 5 times and check the aggregate pattern.

    Passes only if the few-shot classifier returned True more often than the
    heuristic classifier AND the two classifiers disagreed more often than
    they agreed.
    """
    n_runs = 5
    with logfire.span("test_classifier"):
        outcomes = []
        for run in range(n_runs):
            outcomes.append(classifier_logic(run))

    # Transpose the per-run flag tuples and count the True values per scenario
    heuristic_only, few_shot_only, both_yes, both_no = (
        sum(column) for column in zip(*outcomes)
    )

    few_shot_wins = (few_shot_only + both_yes) > (heuristic_only + both_yes)
    mostly_disagree = (heuristic_only + few_shot_only) > (both_yes + both_no)

    # Report which expectation was missed before the assertion fires
    if not few_shot_wins:
        print(
            "Few-shot classifier did not return True more often than the heuristic classifier."
        )
    if not mostly_disagree:
        print("Classifiers did not disagree more often than they agreed.")

    assert few_shot_wins and mostly_disagree


# pytest -vv test_models.py::test_judge
def test_judge():
    """Retry the judge scenario up to 5 times; pass as soon as one try succeeds."""
    attempts_allowed = 5
    with logfire.span("test_judge"):
        for attempt in range(attempts_allowed):
            outcome = judge_logic(attempt)
            # Tries are reported 1-based, while judge_logic receives the 0-based index
            if outcome is True:
                print(f"Try {attempt + 1} succeeded")
                break
            print(f"Try {attempt + 1} failed")
    # The assert for pytest: fails only when every try failed
    assert outcome is True