import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import gc
from plugins.scansite import ScansitePlugin

# Release any cached GPU memory left over from previous runs.
torch.cuda.empty_cache()


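# Dataset of (url, title, score) examples; titles are tokenized lazily in __getitem__.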
class PreferenceDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        url, title, score = self.data[idx]
        encoded = self.tokenizer(title, padding='max_length', truncation=True,
                                 max_length=self.max_length, return_tensors="pt")
        return {key: val.squeeze(0) for key, val in encoded.items()}, torch.tensor(score, dtype=torch.float)


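# Collate per-sample (features, score) pairs into batched tensors for the DataLoader.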
def collate_fn(batch):
    input_ids = torch.stack([item[0]['input_ids'] for item in batch])
    attention_mask = torch.stack([item[0]['attention_mask'] for item in batch])
    scores = torch.stack([item[1] for item in batch])
    return {'input_ids': input_ids, 'attention_mask': attention_mask}, scores


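# Fine-tune a sentence-embedding model so that a scalar read off each title's embedding regresses the preference score.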
def finetune(model_name='nomic-ai/nomic-embed-text-v1', output_model_name="embeddings-ft",
             num_epochs=2, learning_rate=2e-5, weight_decay=0.01, batch_size=8, num_warmup_steps=0):
    print(f"Fine-tuning parameters:\n"
          f"num_epochs: {num_epochs}\n"
          f"learning rate (lr): {learning_rate}\n"
          f"weight_decay: {weight_decay}\n"
          f"batch_size: {batch_size}\n"
          f"model_name: {model_name}\n"
          f"num_warmup_steps: {num_warmup_steps}")
    scansite_plugin = ScansitePlugin("scansite", None)
    reference_data_valid, reference_data_rejected = scansite_plugin.get_reference_data()
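    # Build regression targets: valid items are rescaled with (score - 1) / 8 + 0.5, rejected items get 0.0.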
    valid_data_with_scores = [(url, title, (score - 1) / 8 + 0.5) for url, title, score in reference_data_valid]
    rejected_data_with_scores = [(url, title, 0.0) for url, title in reference_data_rejected]
    all_data = valid_data_with_scores + rejected_data_with_scores
    model = SentenceTransformer(model_name, trust_remote_code=True)
    tokenizer = model.tokenizer
    dataset = PreferenceDataset(all_data, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
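    # Optimization setup: MSE loss, AdamW, and a linear LR decay over the full training schedule.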
    loss_function = torch.nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    total_steps = len(dataloader) * num_epochs
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.1, total_iters=total_steps)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
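    # Training loop: one regression pass per epoch with gradient clipping and per-step LR decay.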
    for epoch in range(num_epochs):
        model.train()
        for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
            input_data, scores = batch
            input_data = {k: v.to(device) for k, v in input_data.items()}
            scores = scores.to(device)
            optimizer.zero_grad()
            embeddings = model(input_data)['sentence_embedding']
            # L2-normalize each embedding and sum its components to get a scalar prediction
            # (up to a constant factor, the cosine similarity with the all-ones vector).
            embeddings_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1)
            cosine_similarities = torch.sum(embeddings_norm, dim=1)
            # Regression loss against the target preference scores.
            loss = loss_function(cosine_similarities, scores)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            del embeddings, cosine_similarities
            torch.cuda.empty_cache()
            gc.collect()
    model.save(output_model_name)
    print("Fine-tuning complete and model saved.")


if __name__ == "__main__":
    finetune()