import sys

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
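# Pick the best available device: prefer CUDA, then Apple's MPS backend, then CPU.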
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    # hasattr guards against PyTorch builds older than 1.12 that lack the mps backend.
    device = "mps"
else:
    device = "cpu"
print(f"device: {device}")
def evaluate(instruction, tokenizer, model):
    # Alpaca-style prompt template used by WizardCoder at inference time.
    prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        generation_output = model.generate(
            **inputs,  # forwards attention_mask along with input_ids
            max_new_tokens=512,  # generation budget; the library default max_length cuts answers short
        )
    output = tokenizer.decode(generation_output[0], skip_special_tokens=True)
    # The decoded text echoes the prompt, so keep only what follows the response marker.
    return output.split("### Response:")[1].strip()
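# Hub model id; the 15B-parameter checkpoint needs roughly 30 GB of memory in fp16.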
base_model = "WizardLM/WizardCoder-15B-V1.0"
load_8bit = False

print("loading tokenizer..")
tokenizer = AutoTokenizer.from_pretrained(base_model)
# padding=True needs a pad token; fall back to EOS if the tokenizer doesn't define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
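# On CUDA, device_map="auto" lets accelerate place the fp16 (or 8-bit) shards across available GPUs.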
print("loading model..")
if device == "cuda":
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=load_8bit,  # requires bitsandbytes when True
        torch_dtype=torch.float16,
        device_map="auto",
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        device_map={"": device},
        # fp16 is poorly supported on CPU, so fall back to fp32 there.
        torch_dtype=torch.float16 if device == "mps" else torch.float32,
    )
print("loaded model")
model.config.pad_token_id = tokenizer.pad_token_id
if not load_8bit and device != "cpu":
    model.half()  # no-op given torch_dtype above; never cast 8-bit weights or the fp32 CPU model
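# Inference mode: disables dropout and other train-time behavior.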
| print("calling model.eval()") | |
| model.eval() | |
| if torch.__version__ >= "2" and sys.platform != "win32": | |
| print("calling torch.compile(model)") | |
| model = torch.compile(model) | |
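# Smoke test: run a single instruction through the pipeline.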
instruction = "Write a short summary about AI."
print("calling evaluate..")
result = evaluate(instruction, tokenizer, model)
print("result:")
print(result)