Spaces (Build error)
drslimm committed
Commit 0d3bd5e · 1 Parent(s): a12a36a
omit eval long texts columns (comments)
Files changed:
- __pycache__/bangalore_score.cpython-39.pyc (+0 -0)
- bangalore_score.py (+147 -63)
__pycache__/bangalore_score.cpython-39.pyc (ADDED)
Binary file (8.41 kB)
bangalore_score.py (CHANGED)
@@ -85,32 +85,32 @@ class Bangalore_Score(evaluate.Metric):
 
     def _download_and_prepare(self, dl_manager):
         """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
         import nltk
         nltk.download('punkt')
 
+        import evaluate
         import pandas as pd
-        from datasets import Dataset
+        from datasets import Dataset
 
+        ### metrics ###
         # https://huggingface.co/evaluate-metric
-
-        evaluate.load('evaluate-metric/meteor')
-        # evaluate.load('evaluate-metric/
-        # evaluate.load('evaluate-metric/
-        # evaluate.load('evaluate-metric/
-        # evaluate.load('evaluate-metric/
-        # evaluate.load('
-        # evaluate.load('
-        # evaluate.load('rouge')
+        ###############
+        # evaluate.load('evaluate-metric/meteor') # respect order (machine translation)
+        # evaluate.load('evaluate-metric/wer') # word error rate
+        # evaluate.load('evaluate-metric/exact_match') # exact match
+        # evaluate.load('evaluate-metric/character') # character error rate
+        # evaluate.load('evaluate-metric/ter') # translation error rate
+        # evaluate.load('bleu') # no respect order (machine translation)
+        # evaluate.load('rouge') # no respect order (machine translation)
         # evaluate.load('ncoop57/levenshtein_distance')
         pass
 
-
     def normalize_fn(
         self,
         example,
         text_field='text',
-        unk_token='Not Disclosed'
+        unk_token='Not Disclosed',
+        return_df=False
     ):
         """
         parse output text into headers, rows, and records

@@ -126,30 +126,38 @@ class Bangalore_Score(evaluate.Metric):
         | ... | ... | ... |
         """
         import pandas as pd
-        headers_text, records, rows_text = unk_token, [], unk_token
-        rows = dict(example)[text_field].strip().split('\n')
 
-
+        records = []
+        rows_text = unk_token
+        omit_columns = example.get('omit_columns', [])
+
+        text = dict(example)[text_field]
+        text = text.strip()
+        rows = text.split('\n')
+
+        # headers
         if len(rows) > 0:
-
-            headers_row =
-
-            headers_list =
-
+            headers_row = rows[0]
+            headers_row = headers_row.strip()
+            headers_row = headers_row.strip('|')
+            headers_list = headers_row.split('|')
+            headers_list = [c.strip() for c in headers_list]
 
-            #
+            # records / rows
            if len(rows) > 2:
-                data_rows =
-
-
+                data_rows = []
+                for row_text in rows[2:]:
+                    row_text = row_text.strip()
+                    row_text = row_text.strip('|')
+                    row_values = row_text.split('|')
+                    row_values = [v.strip() for v in row_values]
+                    data_rows.append(row_values)
+
                 for row in data_rows:
                     cleaned_row = []
                     for cell in row:
-                        # Remove leading and trailing whitespace
-                        cell = cell.strip()
-                        # Check if cell is empty or contains only whitespace
                         if not cell or cell.isspace():
-                            cell =
+                            cell = unk_token
                         cleaned_row.append(cell)
                     try:
                         if len(cleaned_row) == len(headers_list):

@@ -158,22 +166,71 @@ class Bangalore_Score(evaluate.Metric):
                     except Exception as e:
                         print(e)
 
-            # normalize
+            # normalize
             sorted_headers = sorted(set(list(headers_list)))
-            df = pd.DataFrame(records, columns=sorted_headers) #
-            df.fillna(unk_token, inplace=True)
-            df = df.sort_values(by=sorted_headers) #
+            df = pd.DataFrame(records, columns=sorted_headers) # normalize headers
+            df.fillna(unk_token, inplace=True) # fill NaNs
+            df = df.sort_values(by=sorted_headers) # normalize rows
+
+            # omit columns
+            if len(omit_columns) > 0:
+                omit_columns = [c.strip() for c in omit_columns]
+                omit_columns = [c for c in omit_columns if c in df.columns]
+                df = df.drop(columns=omit_columns)
+
+            # return df only
+            if return_df:
+                return df
+
             # csv
             csv_norm = df.to_csv(index=False, sep='|')
             csv_norm = csv_norm.replace('|', ' | ') # add spaces around pipes
             csv_norm = csv_norm.replace('\r', '') # remove carriage returns
-
-            # rows text
+            # only rows text
             rows_text = csv_norm.split('\n')[1:-1]
             rows_text = '\n'.join(rows_text).strip()
-        return {'rtext': rows_text}
 
-
+        if return_df:
+            return None
+        else:
+            return {'rtext': rows_text}
+
+    def ref_omit_columns(
+        self,
+        example,
+        max_chars=50,
+    ):
+        """
+        Detect columns to omit from eval
+        1. columns with text > max_chars are likely to be comments
+            * to be skipped in evaluation
+            * screws up metrics
+        :param example:
+        :param max_chars:
+        :param omit_columns:
+        :param text_field:
+        :return:
+            {
+                'omit_columns': <list of text columns>,
+            }
+        """
+        comments = []
+        df = self.normalize_fn(example, text_field='ref', return_df=True,)
+        if df is not None:
+            for colname in df.columns:
+                c_lens = [len(str(x)) for x in df[colname]]
+                if max(c_lens) > max_chars:
+                    comments.append(colname)
+        return {'omit_columns': comments}
+
+    def msr_fn(
+        self,
+        reference,
+        predictions,
+        metric,
+        metric_key,
+        max_score=True,
+    ):
         """
         MSR (Most Similar Row / Record)
         * computes metric for predictions
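Before the remaining hunks, a note on the headline change: normalize_fn now accepts an omit_columns list and a return_df flag, and the new ref_omit_columns helper flags reference columns whose longest cell exceeds max_chars (default 50) as free-text comments to be dropped before scoring. A minimal standalone sketch of that heuristic (hypothetical helper name and toy data, assuming only pandas; not the metric's own code):

    import pandas as pd

    def find_comment_columns(df: pd.DataFrame, max_chars: int = 50) -> list:
        # flag columns whose longest cell exceeds max_chars (likely free-text comments)
        flagged = []
        for colname in df.columns:
            cell_lengths = [len(str(x)) for x in df[colname]]
            if max(cell_lengths) > max_chars:
                flagged.append(colname)
        return flagged

    ref_df = pd.DataFrame({
        'Fund': ['Alpha', 'Beta'],
        'Fee': ['1%', 'Not Disclosed'],
        'Comments': ['Management fee waived for the first year for early investors', ''],
    })
    print(find_comment_columns(ref_df))                        # ['Comments']
    print(ref_df.drop(columns=find_comment_columns(ref_df)))   # table without the long-text column

Dropping such columns keeps long free-text cells from dominating the row-level similarity scores, which is the "screws up metrics" concern noted in the ref_omit_columns docstring.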
@@ -187,12 +244,20 @@ class Bangalore_Score(evaluate.Metric):
         """
         scores_list = []
         for ref, pred in zip([reference]*len(predictions), predictions):
-
+            score_dict = metric.compute(references=[ref], predictions=[pred])
+            if isinstance(score_dict, dict):
+                score = score_dict.get(metric_key, 0)
+            elif isinstance(score_dict, float):
+                score = score_dict
+            else:
+                score = 0
             scores_list.append(score)
+
         if max_score:
             best_score = max(scores_list)
         else:
             best_score = min(scores_list)
+
         best_pred = predictions[scores_list.index(best_score)]
         _predictions = []
         for pred in predictions:

@@ -200,18 +265,27 @@ class Bangalore_Score(evaluate.Metric):
             _predictions.append(pred)
         return best_score, best_pred, _predictions
 
-    def amsr_fn(
+    def amsr_fn(
+        self,
+        example,
+        **kwargs
+    ):
+        """
+        Aggregate MSR (Most Similar Row / Record)
+        :param example:
+        :param kwargs:
+        :return:
+        """
         ref_text, pred_text = example['ref'].strip(), example['pred'].strip()
         ref_rows, pred_rows = ref_text.split('\n'), pred_text.split('\n')
-
-        # test msr
+        ### test msr
         msr_list = []
         for ref in ref_rows:
             if len(pred_rows) == 0:
                 msr_list.append(0)
                 continue
             score, best_pred, pred_rows = self.msr_fn(reference=ref, predictions=pred_rows, **kwargs)
-
+            ### meteor STILL too flexible...
             if False:
                 print(
                     '\n\n\n---'

@@ -225,13 +299,14 @@ class Bangalore_Score(evaluate.Metric):
             msr_list.append(score)
 
         aggregate_score = sum(msr_list) / len(msr_list)
-
-
-
-
-
-
-
+        if False:
+            print('ref_rows:')
+            for ref in ref_rows:
+                print(f'\t* {ref}')
+            print('\n\npred_rows:')
+            for pred in pred_text.split('\n'):
+                print(f'\t* {pred}')
+            print(f'\n\naggregate_score: {aggregate_score}')
         return {'amsr': aggregate_score}
 
     def _compute(

@@ -240,28 +315,37 @@ class Bangalore_Score(evaluate.Metric):
         references,
         metric,
         metric_key,
-
-        num_proc=None
+        best='max',
     ):
         """Returns the scores"""
         import json
         import evaluate
         import pandas as pd
-        from datasets import Dataset, DatasetDict
-
-
-
-
-        proc_ds = DatasetDict({'predictions': pred_ds, 'references': refs_ds})
-        proc_ds = proc_ds.map(self.normalize_fn, num_proc=num_proc, desc='normalizing')
-        predictions = proc_ds['predictions']['rtext']
-        references = proc_ds['references']['rtext']
+        from datasets import Dataset, DatasetDict, disable_caching
+
+        disable_caching()
+
+        # 1. parse predictions and references
         proc_ds = Dataset.from_dict({'pred': predictions, 'ref': references})
 
-
+        # 2. detect columns to omit from eval calculations (eg: comments)
+        proc_ds = proc_ds.map(self.ref_omit_columns, desc='omit_columns (ref)')
+
+        # 3. normalize predictions and references
+        predictions_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='pred'))
+        references_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='ref'))
+        eval_data = {'pred': predictions_ds['rtext'], 'ref': references_ds['rtext']}
+
+        # 4. compute amsr for given metric
+        proc_ds = Dataset.from_dict(eval_data)
         proc_ds = proc_ds.map(
-            lambda x: self.amsr_fn(
-
+            lambda x: self.amsr_fn(
+                example=x,
+                metric=metric,
+                metric_key=metric_key,
+                max_score=True if best == 'max' else False
+            ),
+            desc=f'amsr ({metric_key})'
         )
 
         amsr_mean = sum(proc_ds['amsr']) / len(proc_ds['amsr'])
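For orientation on how the scores are produced: msr_fn scores one normalized reference row against every remaining prediction row with the supplied metric and keeps the best-scoring match (max by default), and amsr_fn averages those best scores over all reference rows. A rough sketch of the per-row step using the evaluate library (toy rows, meteor as the row metric, hypothetical variable names; not the Space's public interface):

    import evaluate

    meteor = evaluate.load('meteor')  # fetches the required nltk data on first use

    reference_row = 'Alpha | 1% | 2023'
    prediction_rows = ['Beta | 2% | 2021', 'Alpha | 1% | 2023', 'Gamma | Not Disclosed | 2022']

    # MSR: score every candidate row against one reference row and keep the best match
    scores = [
        meteor.compute(references=[reference_row], predictions=[pred])['meteor']
        for pred in prediction_rows
    ]
    best_score = max(scores)
    best_pred = prediction_rows[scores.index(best_score)]
    print(best_score, best_pred)

The _compute method wires this together: parse predictions and references, detect columns to omit, normalize both sides, then map amsr_fn over the dataset and average the per-example 'amsr' values into amsr_mean.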