Julian Bilcke
so Docker is still running on CPU it seems
6a9dc02
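A quick way to check whether the container actually sees a GPU is to run a small PyTorch probe inside it; this is a minimal sketch using standard torch introspection calls, not part of the original script:

import torch

print("torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA devices:", torch.cuda.device_count())
    print("device 0:", torch.cuda.get_device_name(0))
else:
    # If this branch is hit inside Docker, the container was most likely
    # started without GPU access (e.g. no --gpus flag / NVIDIA runtime).
    print("no CUDA device visible to this process")

If that probe reports no CUDA device, the problem is in how the container is launched rather than in the script itself. The full script follows.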
# Full script: load WizardCoder and run a single instruction.
import sys

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Pick the best available device: CUDA if present, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
try:
    if torch.backends.mps.is_available():
        device = "mps"
except Exception:
    # Older torch builds have no torch.backends.mps; keep the CPU/CUDA choice.
    pass
print("device: " + device)
def evaluate(instruction, tokenizer, model):
    # Alpaca-style instruction prompt expected by WizardCoder.
    prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
    input_ids = inputs["input_ids"].to(device)
    with torch.no_grad():
        generation_output = model.generate(input_ids)
    output = tokenizer.decode(generation_output[0], skip_special_tokens=True)
    # Keep only the text after the response marker.
    return output.split("### Response:")[1].strip()
base_model = "WizardLM/WizardCoder-15B-V1.0"
load_8bit = False

print("loading tokenizer..")
tokenizer = AutoTokenizer.from_pretrained(base_model)

print("loading model..")
if device == "cuda":
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
print("loaded model")
# Some checkpoints ship without a pad token; fall back to EOS so padding works.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

if not load_8bit:
    model.half()

print("calling model.eval()")
model.eval()
if torch.__version__ >= "2" and sys.platform != "win32":
    print("calling torch.compile(model)")
    model = torch.compile(model)

instruction = "Write a short summary about AI."
print("calling evaluate..")
result = evaluate(instruction, tokenizer, model)
print("result: ")
print(result)