# /// script
# dependencies = ["transformers>=4.46.0", "torch", "peft", "bitsandbytes", "accelerate", "datasets", "human-eval", "tqdm", "protobuf", "sentencepiece", "mistral-common>=1.5.0"]
# ///
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from human_eval.data import write_jsonl, read_problems
from human_eval.evaluation import evaluate_functional_correctness
import tempfile
import json
from tqdm import tqdm

print("=" * 60)
print("EVALUATION: Base vs Fine-tuned on HumanEval")
print("=" * 60)

# Configuration
BASE_MODEL = "mistralai/Devstral-Small-2505"
FINETUNED_MODEL = "stmasson/alizee-coder-devstral-1-small"
NUM_SAMPLES = 1  # samples per problem
TEMPERATURE = 0.1
MAX_NEW_TOKENS = 512

# 4-bit quantization for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)


def load_model(model_name, adapter_name=None):
    """Load model with optional LoRA adapter"""
    print(f"\nLoading model: {model_name}")
    if adapter_name:
        print(f"With adapter: {adapter_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )

    if adapter_name:
        model = PeftModel.from_pretrained(model, adapter_name)
        model = model.merge_and_unload()

    model.eval()
    return model, tokenizer


def generate_completion(model, tokenizer, prompt, max_new_tokens=MAX_NEW_TOKENS):
    """Generate code completion"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=TEMPERATURE,
            do_sample=True if TEMPERATURE > 0 else False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    completion = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

    # Extract code until function ends or stop tokens
    stop_tokens = ["\ndef ", "\nclass ", "\n#", "\nif __name__", "\n```"]
    for stop in stop_tokens:
        if stop in completion:
            completion = completion[: completion.index(stop)]

    return completion


def evaluate_model(model, tokenizer, problems, model_name):
    """Evaluate model on HumanEval"""
    print(f"\nEvaluating {model_name}...")

    samples = []
    for task_id, problem in tqdm(problems.items(), desc=f"Generating ({model_name})"):
        prompt = problem["prompt"]
        for _ in range(NUM_SAMPLES):
            completion = generate_completion(model, tokenizer, prompt)
            samples.append({"task_id": task_id, "completion": completion})

    # Write samples and evaluate
    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
        sample_file = f.name
    write_jsonl(sample_file, samples)

    results = evaluate_functional_correctness(sample_file, k=[1])
    os.unlink(sample_file)

    return results


def main():
    # Load HumanEval problems
    print("\nLoading HumanEval problems...")
    problems = read_problems()
    print(f"Total problems: {len(problems)}")

    results = {}

    # Evaluate base model
    print("\n" + "=" * 60)
    print("EVALUATING BASE MODEL")
    print("=" * 60)
    base_model, base_tokenizer = load_model(BASE_MODEL)
    results["base"] = evaluate_model(
        base_model, base_tokenizer, problems, "Devstral-Small (Base)"
    )
    print(f"\nBase Model Results: {results['base']}")

    # Free memory
    del base_model
    torch.cuda.empty_cache()

    # Evaluate fine-tuned model
    print("\n" + "=" * 60)
    print("EVALUATING FINE-TUNED MODEL")
    print("=" * 60)
    ft_model, ft_tokenizer = load_model(BASE_MODEL, FINETUNED_MODEL)
    results["finetuned"] = evaluate_model(
        ft_model, ft_tokenizer, problems, "Alizee-Coder (Fine-tuned)"
    )
    print(f"\nFine-tuned Model Results: {results['finetuned']}")

    # Summary
    print("\n" + "=" * 60)
    print("COMPARISON SUMMARY")
    print("=" * 60)
    print(f"\n{'Model':<40} {'pass@1':>10}")
    print("-" * 52)
    print(f"{'Devstral-Small-2505 (Base)':<40} {results['base']['pass@1'] * 100:>9.1f}%")
    print(f"{'Alizee-Coder-Devstral (Fine-tuned)':<40} {results['finetuned']['pass@1'] * 100:>9.1f}%")

    improvement = (results["finetuned"]["pass@1"] - results["base"]["pass@1"]) * 100
    print(f"\n{'Improvement:':<40} {improvement:>+9.1f}%")

    # Save results
    with open("eval_results.json", "w") as f:
        json.dump(results, f, indent=2)
    print("\nResults saved to eval_results.json")


if __name__ == "__main__":
    main()
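
# ---------------------------------------------------------------------------
# Notes on running this script:
# - The "# /// script" header at the top is PEP 723 inline script metadata,
#   so a runner that understands it can install the listed dependencies
#   before executing, e.g. `uv run eval_humaneval.py` (the filename here is
#   only a placeholder; use whatever this file is saved as).
# - The human-eval package ships with code execution disabled for safety:
#   evaluate_functional_correctness() will report 0% until the commented-out
#   `exec(...)` call in human_eval/execution.py is re-enabled. Only do so in
#   an environment you trust to run model-generated code.
# ---------------------------------------------------------------------------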