Spaces:
Running
Running
initial commit
Browse files
app.py
CHANGED
|
@@ -166,29 +166,8 @@ def grade(file_obj, progress=gr.Progress()):
|
|
| 166 |
grade_sample_run_complete = False
|
| 167 |
temperature = 0.0
|
| 168 |
|
| 169 |
-
|
| 170 |
-
# try:
|
| 171 |
-
response = openai.ChatCompletion.create(
|
| 172 |
-
# model=gpt_model,
|
| 173 |
-
engine=gpt_model,
|
| 174 |
-
max_tokens=3,
|
| 175 |
-
temperature=temperature,
|
| 176 |
-
messages=messages)
|
| 177 |
-
content = response['choices'][0]['message']['content']
|
| 178 |
-
flag = True
|
| 179 |
-
try_time = 1
|
| 180 |
-
while flag:
|
| 181 |
try:
|
| 182 |
-
content = content.split(' ')[0].strip()
|
| 183 |
-
score = float(content)
|
| 184 |
-
if score > 1.0 or score < 0.0:
|
| 185 |
-
assert False
|
| 186 |
-
flag = False
|
| 187 |
-
except:
|
| 188 |
-
question = prompt + '\n' + ' | '.join([line['question'], line['answer'].replace("<AND>", " <AND> ").replace("<OR>", " <OR> "), model_pred, ""]) + "\nPredict the correctness of the answer (digit): "
|
| 189 |
-
messages = [
|
| 190 |
-
{"role": "user", "content": question},
|
| 191 |
-
]
|
| 192 |
response = openai.ChatCompletion.create(
|
| 193 |
# model=gpt_model,
|
| 194 |
engine=gpt_model,
|
|
@@ -196,18 +175,39 @@ def grade(file_obj, progress=gr.Progress()):
|
|
| 196 |
temperature=temperature,
|
| 197 |
messages=messages)
|
| 198 |
content = response['choices'][0]['message']['content']
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
if len(sample_grade['model']) >= j + 1:
|
| 213 |
sample_grade['model'][j] = response['model']
|
|
|
|
| 166 |
grade_sample_run_complete = False
|
| 167 |
temperature = 0.0
|
| 168 |
|
| 169 |
+
while not grade_sample_run_complete:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
response = openai.ChatCompletion.create(
|
| 172 |
# model=gpt_model,
|
| 173 |
engine=gpt_model,
|
|
|
|
| 175 |
temperature=temperature,
|
| 176 |
messages=messages)
|
| 177 |
content = response['choices'][0]['message']['content']
|
| 178 |
+
flag = True
|
| 179 |
+
try_time = 1
|
| 180 |
+
while flag:
|
| 181 |
+
try:
|
| 182 |
+
content = content.split(' ')[0].strip()
|
| 183 |
+
score = float(content)
|
| 184 |
+
if score > 1.0 or score < 0.0:
|
| 185 |
+
assert False
|
| 186 |
+
flag = False
|
| 187 |
+
except:
|
| 188 |
+
question = prompt + '\n' + ' | '.join([line['question'], line['answer'].replace("<AND>", " <AND> ").replace("<OR>", " <OR> "), model_pred, ""]) + "\nPredict the correctness of the answer (digit): "
|
| 189 |
+
messages = [
|
| 190 |
+
{"role": "user", "content": question},
|
| 191 |
+
]
|
| 192 |
+
response = openai.ChatCompletion.create(
|
| 193 |
+
# model=gpt_model,
|
| 194 |
+
engine=gpt_model,
|
| 195 |
+
max_tokens=3,
|
| 196 |
+
temperature=temperature,
|
| 197 |
+
messages=messages)
|
| 198 |
+
content = response['choices'][0]['message']['content']
|
| 199 |
+
try_time += 1
|
| 200 |
+
temperature += 0.5
|
| 201 |
+
print(f"{id} try {try_time} times")
|
| 202 |
+
print(content)
|
| 203 |
+
if try_time > 5:
|
| 204 |
+
score = 0.0
|
| 205 |
+
flag = False
|
| 206 |
+
grade_sample_run_complete = True
|
| 207 |
+
except:
|
| 208 |
+
# gpt4 may have token rate limit
|
| 209 |
+
print("sleep 30s")
|
| 210 |
+
time.sleep(30)
|
| 211 |
|
| 212 |
if len(sample_grade['model']) >= j + 1:
|
| 213 |
sample_grade['model'][j] = response['model']
|