Commit 0d58633 by Jon Gauthier
1 parent: dadceff

fall back to GPT2TokenizerFast for models which don't have a fast tokenizer (like OPT)

Files changed: syntaxgym.py (+14, -1)
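Background on why a fast tokenizer is required at all: only the Rust-backed ("fast") tokenizers in transformers implement return_offsets_mapping, which this metric uses to map tokens back to character spans. A minimal check of that behavior, using the stock gpt2 checkpoint purely as an illustration (it is not part of this Space):

from transformers import GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2")
enc = tok("The dog chased the cat", return_offsets_mapping=True)

# Each token is paired with its (start, end) character span in the input string;
# slow (pure-Python) tokenizers do not support return_offsets_mapping.
print(list(zip(enc.tokens(), enc["offset_mapping"])))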
syntaxgym.py
CHANGED
@@ -14,16 +14,21 @@
 """TODO: Add a description here."""
 
 from collections import defaultdict
+import logging
 from typing import List, Dict, Tuple, NamedTuple
 
 import datasets
 import evaluate
 import numpy as np
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM,
+from transformers import AutoTokenizer, AutoModelForCausalLM, \
+    PreTrainedTokenizer, PreTrainedTokenizerFast, \
+    GPT2TokenizerFast
 
 from .prediction import Prediction
 
+L = logging.getLogger(__name__)
+
 
 _CITATION = """\
 @inproceedings{Hu:et-al:2020,
@@ -108,7 +113,15 @@ def prepare_tokenizer(model, batch_size, add_start_token=True) -> Tuple[PreTrain
         tokenizer:
         tokenizer_kwargs: suggested kwargs for any tokenizer calls
     """
+
     tokenizer = AutoTokenizer.from_pretrained(model.name_or_path)
+    if not isinstance(tokenizer, PreTrainedTokenizerFast):
+        # We need a fast tokenizer because these are the only tokenizers that support
+        # return_offsets_mapping. Try to use GPT2 tokenizer -- this is sufficient for
+        # OPT.
+        L.warning(f"The model {model.name_or_path} does not have a fast tokenizer, "
+                  f"which is required for this metric. Running with GPT2 tokenizer.")
+        tokenizer = GPT2TokenizerFast.from_pretrained(model.name_or_path)
 
     # if batch_size > 1 (which generally leads to padding being required), and
     # if there is not an already assigned pad_token, assign an existing
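Taken together, the new prepare_tokenizer behavior amounts to the standalone sketch below. The name load_fast_tokenizer and the model_name argument are illustrative only, not part of the diff, and the GPT-2 fallback is only safe for checkpoints (like OPT) that reuse the GPT-2 BPE vocabulary files:

import logging

from transformers import AutoTokenizer, GPT2TokenizerFast, PreTrainedTokenizerFast

L = logging.getLogger(__name__)


def load_fast_tokenizer(model_name: str) -> PreTrainedTokenizerFast:
    # Prefer whatever AutoTokenizer resolves to; fall back to GPT2TokenizerFast
    # when the checkpoint only ships a slow tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        L.warning("%s has no fast tokenizer; falling back to GPT2TokenizerFast.", model_name)
        tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    return tokenizer


# e.g. load_fast_tokenizer("facebook/opt-125m") would take the fallback path on a
# transformers version where OPT ships only a slow tokenizer.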