# GestureLSM/trainer/generative_trainer.py
import os
import pprint
import random
import sys
import time
import warnings
from typing import Dict
import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import wandb
from dataloaders import data_tools
from dataloaders.data_tools import joints_list
from loguru import logger
from models.vq.model import RVQVAE
from optimizers.optim_factory import create_optimizer
from optimizers.scheduler_factory import create_scheduler
from torch.nn.parallel import DistributedDataParallel as DDP
from tqdm import tqdm
from trainer.base_trainer import BaseTrainer
from utils import (
data_transfer,
logger_tools,
metric,
other_tools,
other_tools_hf,
rotation_conversions as rc,
)
from utils.joints import hands_body_mask, lower_body_mask, upper_body_mask
def convert_15d_to_6d(motion):
"""
Convert 15D motion to 6D motion, the current motion is 15D, but the eval model is 6D
"""
bs = motion.shape[0]
motion_6d = motion.reshape(bs, -1, 55, 15)[:, :, :, 6:12]
motion_6d = motion_6d.reshape(bs, -1, 55 * 6)
return motion_6d
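# Usage sketch for convert_15d_to_6d (hypothetical shapes): a motion batch of shape
# [bs, T, 55 * 15] is reduced to its 6D-rotation channels, giving [bs, T, 55 * 6].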
class CustomTrainer(BaseTrainer):
"""
Generative Trainer to support various generative models
"""
def __init__(self, cfg, args):
super().__init__(cfg, args)
self.cfg = cfg
self.args = args
self.joints = 55
self.ori_joint_list = joints_list["beat_smplx_joints"]
self.tar_joint_list_face = joints_list["beat_smplx_face"]
self.tar_joint_list_upper = joints_list["beat_smplx_upper"]
self.tar_joint_list_hands = joints_list["beat_smplx_hands"]
self.tar_joint_list_lower = joints_list["beat_smplx_lower"]
self.joint_mask_face = np.zeros(len(list(self.ori_joint_list.keys())) * 3)
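        # Each body-part mask marks the axis-angle channels (3 per joint) belonging to
        # that part; joints_list entries appear to store (channel count, end offset),
        # so the slice [end - count : end] selects a joint's channels.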
for joint_name in self.tar_joint_list_face:
self.joint_mask_face[
self.ori_joint_list[joint_name][1]
- self.ori_joint_list[joint_name][0] : self.ori_joint_list[joint_name][
1
]
] = 1
self.joint_mask_upper = np.zeros(len(list(self.ori_joint_list.keys())) * 3)
for joint_name in self.tar_joint_list_upper:
self.joint_mask_upper[
self.ori_joint_list[joint_name][1]
- self.ori_joint_list[joint_name][0] : self.ori_joint_list[joint_name][
1
]
] = 1
self.joint_mask_hands = np.zeros(len(list(self.ori_joint_list.keys())) * 3)
for joint_name in self.tar_joint_list_hands:
self.joint_mask_hands[
self.ori_joint_list[joint_name][1]
- self.ori_joint_list[joint_name][0] : self.ori_joint_list[joint_name][
1
]
] = 1
self.joint_mask_lower = np.zeros(len(list(self.ori_joint_list.keys())) * 3)
for joint_name in self.tar_joint_list_lower:
self.joint_mask_lower[
self.ori_joint_list[joint_name][1]
- self.ori_joint_list[joint_name][0] : self.ori_joint_list[joint_name][
1
]
] = 1
self.tracker = other_tools.EpochTracker(
["fgd", "bc", "l1div", "predict_x0_loss", "test_clip_fgd"],
[True, True, True, True, True],
)
##### Model #####
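        # Dynamically import models.<cfg.model.model_name> and instantiate the
        # generator class named by cfg.model.g_name.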
model_module = __import__(
f"models.{cfg.model.model_name}", fromlist=["something"]
)
if self.cfg.ddp:
self.model = getattr(model_module, cfg.model.g_name)(cfg).to(self.rank)
process_group = torch.distributed.new_group()
self.model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
self.model, process_group
)
self.model = DDP(
self.model,
device_ids=[self.rank],
output_device=self.rank,
broadcast_buffers=False,
find_unused_parameters=False,
)
else:
self.model = getattr(model_module, cfg.model.g_name)(cfg)
if self.args.mode == "train":
if self.rank == 0:
logger.info(self.model)
logger.info(f"init {self.cfg.model.g_name} success")
wandb.watch(self.model)
##### Optimizer and Scheduler #####
self.opt = create_optimizer(self.cfg.solver, self.model)
self.opt_s = create_scheduler(self.cfg.solver, self.opt)
        ##### VQ-VAE models #####
        # Initialize and load pretrained VQ-VAE models for the different body parts.
self.vq_models = self._create_body_vq_models()
# Set all VQ models to eval mode
for model in self.vq_models.values():
model.eval()
self.vq_model_upper, self.vq_model_hands, self.vq_model_lower = (
self.vq_models.values()
)
##### Loss functions #####
self.reclatent_loss = nn.MSELoss()
self.vel_loss = torch.nn.L1Loss(reduction="mean")
##### Normalization #####
self.mean = np.load("./mean_std/beatx_2_330_mean.npy")
self.std = np.load("./mean_std/beatx_2_330_std.npy")
# Extract body part specific normalizations
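        # upper_body_mask / hands_body_mask / lower_body_mask come from utils.joints and
        # index into the per-frame feature vector of the beatx_2_330 statistics.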
for part in ["upper", "hands", "lower"]:
mask = globals()[f"{part}_body_mask"]
setattr(self, f"mean_{part}", torch.from_numpy(self.mean[mask]))
setattr(self, f"std_{part}", torch.from_numpy(self.std[mask]))
self.trans_mean = torch.from_numpy(
np.load("./mean_std/beatx_2_trans_mean.npy")
)
self.trans_std = torch.from_numpy(
np.load("./mean_std/beatx_2_trans_std.npy")
)
        if self.args.checkpoint:
            checkpoint = torch.load(self.args.checkpoint, weights_only=False)
            try:
                ckpt_state_dict = checkpoint["model_state_dict"]
            except KeyError:
                ckpt_state_dict = checkpoint["model_state"]
# remove 'audioEncoder' from the state_dict due to legacy issues
ckpt_state_dict = {
k: v
for k, v in ckpt_state_dict.items()
if "modality_encoder.audio_encoder." not in k
}
self.model.load_state_dict(ckpt_state_dict, strict=False)
logger.info(f"Loaded checkpoint from {self.args.checkpoint}")
def _create_body_vq_models(self) -> Dict[str, RVQVAE]:
"""Create VQ-VAE models for body parts."""
        vq_configs = {
            "upper": {"dim_pose": 78},   # 13 upper-body joints x 6D rotation
            "hands": {"dim_pose": 180},  # 30 hand joints x 6D rotation
            "lower": {"dim_pose": 57},   # 9 lower-body joints x 6D rotation + 3D translation velocity
        }
vq_models = {}
for part, config in vq_configs.items():
model = self._create_rvqvae_model(config["dim_pose"], part)
vq_models[part] = model
return vq_models
def _create_rvqvae_model(self, dim_pose: int, body_part: str) -> RVQVAE:
"""Create a single RVQVAE model with specified configuration."""
vq_args = self.args
vq_args.num_quantizers = 6
vq_args.shared_codebook = False
vq_args.quantize_dropout_prob = 0.2
vq_args.quantize_dropout_cutoff_index = 0
vq_args.mu = 0.99
vq_args.beta = 1.0
model = RVQVAE(
vq_args,
input_width=dim_pose,
nb_code=1024,
code_dim=128,
output_emb_width=128,
down_t=2,
stride_t=2,
width=512,
depth=3,
dilation_growth_rate=3,
activation="relu",
norm=None,
)
# Load pretrained weights
checkpoint_path = getattr(self.cfg, f"vqvae_{body_part}_path")
model.load_state_dict(torch.load(checkpoint_path)["net"])
return model
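    # The two helpers below scatter body-part features back into the full 55-joint
    # (165-channel) axis-angle layout using the boolean joint masks built in __init__.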
def inverse_selection(self, filtered_t, selection_array, n):
original_shape_t = np.zeros((n, selection_array.size))
selected_indices = np.where(selection_array == 1)[0]
for i in range(n):
original_shape_t[i, selected_indices] = filtered_t[i]
return original_shape_t
def inverse_selection_tensor(self, filtered_t, selection_array, n):
selection_array = torch.from_numpy(selection_array)
original_shape_t = torch.zeros((n, 165))
selected_indices = torch.where(selection_array == 1)[0]
for i in range(n):
original_shape_t[i, selected_indices] = filtered_t[i]
return original_shape_t
def _load_data(self, dict_data):
facial_rep = dict_data["facial"]
beta = dict_data["beta"]
tar_trans = dict_data["trans"]
tar_id = dict_data["id"]
# process the pose data
tar_pose = dict_data["pose"][:, :, :165]
tar_trans_v = dict_data["trans_v"]
bs, n, j = tar_pose.shape[0], tar_pose.shape[1], self.joints
tar_pose_hands = tar_pose[:, :, 25 * 3 : 55 * 3]
tar_pose_hands = rc.axis_angle_to_matrix(tar_pose_hands.reshape(bs, n, 30, 3))
tar_pose_hands = rc.matrix_to_rotation_6d(tar_pose_hands).reshape(bs, n, 30 * 6)
tar_pose_upper = tar_pose[:, :, self.joint_mask_upper.astype(bool)]
tar_pose_upper = rc.axis_angle_to_matrix(tar_pose_upper.reshape(bs, n, 13, 3))
tar_pose_upper = rc.matrix_to_rotation_6d(tar_pose_upper).reshape(bs, n, 13 * 6)
tar_pose_leg = tar_pose[:, :, self.joint_mask_lower.astype(bool)]
tar_pose_leg = rc.axis_angle_to_matrix(tar_pose_leg.reshape(bs, n, 9, 3))
tar_pose_leg = rc.matrix_to_rotation_6d(tar_pose_leg).reshape(bs, n, 9 * 6)
tar_pose_lower = tar_pose_leg
tar_pose_upper = (tar_pose_upper - self.mean_upper) / self.std_upper
tar_pose_hands = (tar_pose_hands - self.mean_hands) / self.std_hands
tar_pose_lower = (tar_pose_lower - self.mean_lower) / self.std_lower
tar_trans_v = (tar_trans_v - self.trans_mean) / self.trans_std
tar_pose_lower = torch.cat([tar_pose_lower, tar_trans_v], dim=-1)
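        # Encode each normalized body-part stream with its pretrained RVQ-VAE (kept in
        # eval mode); the encoders downsample the sequence temporally by cfg.vqvae_squeeze_scale.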
latent_upper_top = self.vq_model_upper.map2latent(tar_pose_upper)
latent_hands_top = self.vq_model_hands.map2latent(tar_pose_hands)
latent_lower_top = self.vq_model_lower.map2latent(tar_pose_lower)
latent_lengths = [latent_upper_top.shape[1], latent_hands_top.shape[1], latent_lower_top.shape[1]]
if len(set(latent_lengths)) != 1:
min_len = min(latent_lengths)
            logger.warning(
                f"Latent length mismatch (upper={latent_upper_top.shape[1]}, "
                f"hands={latent_hands_top.shape[1]}, lower={latent_lower_top.shape[1]}); "
                f"truncating to {min_len}"
            )
latent_upper_top = latent_upper_top[:, :min_len, :]
latent_hands_top = latent_hands_top[:, :min_len, :]
latent_lower_top = latent_lower_top[:, :min_len, :]
        # TODO: verify whether this latent scaling (divide by 5) is actually needed;
        # _g_test applies the inverse scaling (multiply by 5) before decoding.
# latent_in = torch.cat([latent_upper_top, latent_hands_top, latent_lower_top], dim=2)
latent_in = (
torch.cat([latent_upper_top, latent_hands_top, latent_lower_top], dim=2) / 5
)
word = dict_data.get("word", None)
        # no style annotation is available, so the style feature is always None
style_feature = None
audio_onset = None
if self.cfg.data.onset_rep:
audio_onset = dict_data["audio_onset"]
return {
"audio_onset": audio_onset,
"word": word,
"latent_in": latent_in,
"tar_id": tar_id,
"facial_rep": facial_rep,
"beta": beta,
"tar_pose": tar_pose,
"trans": tar_trans,
"style_feature": style_feature,
}
def _g_training(self, loaded_data, mode="train", epoch=0):
self.model.train()
cond_ = {"y": {}}
cond_["y"]["audio_onset"] = loaded_data["audio_onset"]
cond_["y"]["word"] = loaded_data["word"]
cond_["y"]["id"] = loaded_data["tar_id"]
cond_["y"]["seed"] = loaded_data["latent_in"][:, : self.cfg.pre_frames]
cond_["y"]["style_feature"] = loaded_data["style_feature"]
x0 = loaded_data["latent_in"]
x0 = x0.permute(0, 2, 1).unsqueeze(2)
        # self.model is wrapped in DDP only when cfg.ddp is set; unwrap it if present.
        net = self.model.module if hasattr(self.model, "module") else self.model
        g_loss_final = net.train_forward(cond_, x0)["loss"]
self.tracker.update_meter("predict_x0_loss", "train", g_loss_final.item())
if mode == "train":
return g_loss_final
def _g_test(self, loaded_data):
self.model.eval()
tar_beta = loaded_data["beta"]
tar_pose = loaded_data["tar_pose"]
tar_exps = loaded_data["facial_rep"]
tar_trans = loaded_data["trans"]
audio_onset = loaded_data["audio_onset"]
in_word = loaded_data["word"]
in_x0 = loaded_data["latent_in"]
in_seed = loaded_data["latent_in"]
bs, n, j = (
loaded_data["tar_pose"].shape[0],
loaded_data["tar_pose"].shape[1],
self.joints,
)
remain = n % 8
if remain != 0:
tar_pose = tar_pose[:, :-remain, :]
tar_beta = tar_beta[:, :-remain, :]
tar_exps = tar_exps[:, :-remain, :]
in_x0 = in_x0[
:, : in_x0.shape[1] - (remain // self.cfg.vqvae_squeeze_scale), :
]
in_seed = in_seed[
:, : in_x0.shape[1] - (remain // self.cfg.vqvae_squeeze_scale), :
]
in_word = in_word[:, :-remain]
n = n - remain
rec_all_upper = []
rec_all_lower = []
rec_all_hands = []
vqvae_squeeze_scale = self.cfg.vqvae_squeeze_scale
pre_frames_scaled = self.cfg.pre_frames * vqvae_squeeze_scale
roundt = (n - pre_frames_scaled) // (
self.cfg.data.pose_length - pre_frames_scaled
)
remain = (n - pre_frames_scaled) % (
self.cfg.data.pose_length - pre_frames_scaled
)
        round_l = self.cfg.data.pose_length - pre_frames_scaled
in_audio_onset_tmp = None
in_word_tmp = None
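        # Autoregressive generation over fixed-length windows: each window is conditioned
        # on the last cfg.pre_frames latents of the previous window's output; the
        # ground-truth seed latents are used only for the very first window.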
for i in range(0, roundt):
if audio_onset is not None:
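                # 16000 // 30: audio samples per pose frame (16 kHz audio at 30 fps pose)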
in_audio_onset_tmp = audio_onset[
:,
i * (16000 // 30 * round_l) : (i + 1) * (16000 // 30 * round_l)
+ 16000 // 30 * self.cfg.pre_frames * vqvae_squeeze_scale,
]
if in_word is not None:
in_word_tmp = in_word[
:,
i * (round_l) : (i + 1) * (round_l)
+ self.cfg.pre_frames * vqvae_squeeze_scale,
]
in_id_tmp = loaded_data["tar_id"][
:, i * (round_l) : (i + 1) * (round_l) + self.cfg.pre_frames
]
in_seed_tmp = in_seed[
:,
i
* (round_l)
// vqvae_squeeze_scale : (i + 1)
* (round_l)
// vqvae_squeeze_scale
+ self.cfg.pre_frames,
]
if i == 0:
in_seed_tmp = in_seed_tmp[:, : self.cfg.pre_frames, :]
else:
in_seed_tmp = last_sample[:, -self.cfg.pre_frames :, :]
cond_ = {"y": {}}
cond_["y"]["audio_onset"] = in_audio_onset_tmp
cond_["y"]["word"] = in_word_tmp
cond_["y"]["id"] = in_id_tmp
cond_["y"]["seed"] = in_seed_tmp
cond_["y"]["style_feature"] = torch.zeros([bs, 512])
sample = self.model(cond_)["latents"]
sample = sample.squeeze(2).permute(0, 2, 1)
last_sample = sample.clone()
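            # The generated latent is the concatenation [upper | hands | lower],
            # each slice being code_dim channels wide.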
code_dim = self.vq_model_upper.code_dim
rec_latent_upper = sample[..., :code_dim]
rec_latent_hands = sample[..., code_dim : code_dim * 2]
rec_latent_lower = sample[..., code_dim * 2 : code_dim * 3]
if i == 0:
rec_all_upper.append(rec_latent_upper)
rec_all_hands.append(rec_latent_hands)
rec_all_lower.append(rec_latent_lower)
else:
rec_all_upper.append(rec_latent_upper[:, self.cfg.pre_frames :])
rec_all_hands.append(rec_latent_hands[:, self.cfg.pre_frames :])
rec_all_lower.append(rec_latent_lower[:, self.cfg.pre_frames :])
try:
rec_all_upper = torch.cat(rec_all_upper, dim=1) * 5
rec_all_hands = torch.cat(rec_all_hands, dim=1) * 5
rec_all_lower = torch.cat(rec_all_lower, dim=1) * 5
except RuntimeError as exc:
shape_summary = {
"upper": [tuple(t.shape) for t in rec_all_upper],
"hands": [tuple(t.shape) for t in rec_all_hands],
"lower": [tuple(t.shape) for t in rec_all_lower],
}
            logger.error(
                f"Failed to concatenate latent segments: {exc} | shapes={shape_summary}"
            )
raise
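        # Decode each body-part latent stream back to pose space, then undo the
        # per-part normalization and the translation-velocity normalization.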
rec_upper = self.vq_model_upper.latent2origin(rec_all_upper)[0]
rec_hands = self.vq_model_hands.latent2origin(rec_all_hands)[0]
rec_lower = self.vq_model_lower.latent2origin(rec_all_lower)[0]
rec_trans_v = rec_lower[..., -3:]
rec_trans_v = rec_trans_v * self.trans_std + self.trans_mean
        # Integrate the denormalized translation velocity over time to recover global
        # translation; the vertical (y) channel is taken directly rather than integrated.
        rec_trans = torch.cumsum(rec_trans_v, dim=-2)
        rec_trans[..., 1] = rec_trans_v[..., 1]
rec_lower = rec_lower[..., :-3]
rec_upper = rec_upper * self.std_upper + self.mean_upper
rec_hands = rec_hands * self.std_hands + self.mean_hands
rec_lower = rec_lower * self.std_lower + self.mean_lower
n = n - remain
tar_pose = tar_pose[:, :n, :]
tar_exps = tar_exps[:, :n, :]
tar_trans = tar_trans[:, :n, :]
tar_beta = tar_beta[:, :n, :]
        # Facial expressions are not generated by this model; fall back to the
        # ground-truth expressions for evaluation.
        rec_exps = tar_exps
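        # NOTE: the translation integrated from velocity above is overridden with the
        # ground-truth translation for evaluation.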
rec_trans = tar_trans
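        # Convert each body part from 6D rotations back to axis-angle and scatter it into
        # the full 55-joint layout; the jaw rotation (channels 66:69) is copied from the target.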
rec_pose_legs = rec_lower[:, :, :54]
bs, n = rec_pose_legs.shape[0], rec_pose_legs.shape[1]
rec_pose_upper = rec_upper.reshape(bs, n, 13, 6)
rec_pose_upper = rc.rotation_6d_to_matrix(rec_pose_upper) #
rec_pose_upper = rc.matrix_to_axis_angle(rec_pose_upper).reshape(bs * n, 13 * 3)
rec_pose_upper_recover = self.inverse_selection_tensor(
rec_pose_upper, self.joint_mask_upper, bs * n
)
rec_pose_lower = rec_pose_legs.reshape(bs, n, 9, 6)
rec_pose_lower = rc.rotation_6d_to_matrix(rec_pose_lower)
rec_pose_lower = rc.matrix_to_axis_angle(rec_pose_lower).reshape(bs * n, 9 * 3)
rec_pose_lower_recover = self.inverse_selection_tensor(
rec_pose_lower, self.joint_mask_lower, bs * n
)
rec_pose_hands = rec_hands.reshape(bs, n, 30, 6)
rec_pose_hands = rc.rotation_6d_to_matrix(rec_pose_hands)
rec_pose_hands = rc.matrix_to_axis_angle(rec_pose_hands).reshape(bs * n, 30 * 3)
rec_pose_hands_recover = self.inverse_selection_tensor(
rec_pose_hands, self.joint_mask_hands, bs * n
)
rec_pose = (
rec_pose_upper_recover + rec_pose_lower_recover + rec_pose_hands_recover
)
rec_pose[:, 66:69] = tar_pose.reshape(bs * n, 55 * 3)[:, 66:69]
rec_pose = rc.axis_angle_to_matrix(rec_pose.reshape(bs * n, j, 3))
rec_pose = rc.matrix_to_rotation_6d(rec_pose).reshape(bs, n, j * 6)
tar_pose = rc.axis_angle_to_matrix(tar_pose.reshape(bs * n, j, 3))
tar_pose = rc.matrix_to_rotation_6d(tar_pose).reshape(bs, n, j * 6)
return {
"rec_pose": rec_pose,
"rec_exps": rec_exps,
"rec_trans": rec_trans,
"tar_pose": tar_pose,
"tar_exps": tar_exps,
"tar_beta": tar_beta,
"tar_trans": tar_trans,
}
def train(self, epoch):
self.model.train()
t_start = time.time()
self.tracker.reset()
for its, batch_data in enumerate(self.train_loader):
loaded_data = self._load_data(batch_data)
t_data = time.time() - t_start
self.opt.zero_grad()
g_loss_final = 0
g_loss_final += self._g_training(loaded_data, "train", epoch)
g_loss_final.backward()
if self.cfg.solver.grad_norm != 0:
torch.nn.utils.clip_grad_norm_(
self.model.parameters(), self.cfg.solver.grad_norm
)
self.opt.step()
            mem_cost = torch.cuda.memory_reserved() / 1e9  # memory_cached() is deprecated
lr_g = self.opt.param_groups[0]["lr"]
t_train = time.time() - t_start - t_data
t_start = time.time()
if its % self.cfg.log_period == 0:
self.train_recording(epoch, its, t_data, t_train, mem_cost, lr_g)
if self.cfg.debug:
if its == 1:
break
self.opt_s.step(epoch)
@torch.no_grad()
def _common_test_inference(
self, data_loader, epoch, mode="val", max_iterations=None, save_results=False
):
"""
Common inference logic shared by val, test, test_clip, and test_render methods.
Args:
data_loader: The data loader to iterate over
epoch: Current epoch number
mode: Mode string for logging ("val", "test", "test_clip", "test_render")
max_iterations: Maximum number of iterations (None for no limit)
save_results: Whether to save result files
Returns:
Dictionary containing computed metrics and results
"""
start_time = time.time()
total_length = 0
test_seq_list = self.test_data.selected_file
align = 0
latent_out = []
latent_ori = []
l2_all = 0
lvel = 0
results = []
# Setup save path for test mode
results_save_path = None
if save_results:
results_save_path = self.checkpoint_path + f"/{epoch}/"
if mode == "test_render":
if os.path.exists(results_save_path):
import shutil
shutil.rmtree(results_save_path)
os.makedirs(results_save_path, exist_ok=True)
self.model.eval()
self.smplx.eval()
if hasattr(self, "eval_copy"):
self.eval_copy.eval()
with torch.no_grad():
iterator = enumerate(data_loader)
if mode in ["test_clip", "test"]:
iterator = enumerate(
tqdm(data_loader, desc=f"Testing {mode}", leave=True)
)
for its, batch_data in iterator:
if max_iterations is not None and its > max_iterations:
break
loaded_data = self._load_data(batch_data)
net_out = self._g_test(loaded_data)
tar_pose = net_out["tar_pose"]
rec_pose = net_out["rec_pose"]
tar_exps = net_out["tar_exps"]
tar_beta = net_out["tar_beta"]
rec_trans = net_out["rec_trans"]
tar_trans = net_out.get("tar_trans", rec_trans)
rec_exps = net_out.get("rec_exps", tar_exps)
bs, n, j = tar_pose.shape[0], tar_pose.shape[1], self.joints
# Handle frame rate conversion
if (30 / self.cfg.data.pose_fps) != 1:
assert 30 % self.cfg.data.pose_fps == 0
n *= int(30 / self.cfg.data.pose_fps)
tar_pose = torch.nn.functional.interpolate(
tar_pose.permute(0, 2, 1),
scale_factor=30 / self.cfg.data.pose_fps,
mode="linear",
).permute(0, 2, 1)
scale_factor = (
30 / self.cfg.data.pose_fps
if mode != "test"
else 30 / self.cfg.pose_fps
)
rec_pose = torch.nn.functional.interpolate(
rec_pose.permute(0, 2, 1),
scale_factor=scale_factor,
mode="linear",
).permute(0, 2, 1)
# Calculate latent representations for evaluation
if hasattr(self, "eval_copy") and mode != "test_render":
remain = n % self.cfg.vae_test_len
latent_out.append(
self.eval_copy.map2latent(rec_pose[:, : n - remain])
.reshape(-1, self.cfg.vae_length)
.detach()
.cpu()
.numpy()
)
latent_ori.append(
self.eval_copy.map2latent(tar_pose[:, : n - remain])
.reshape(-1, self.cfg.vae_length)
.detach()
.cpu()
.numpy()
)
rec_pose = rc.rotation_6d_to_matrix(rec_pose.reshape(bs * n, j, 6))
rec_pose = rc.matrix_to_axis_angle(rec_pose).reshape(bs * n, j * 3)
tar_pose = rc.rotation_6d_to_matrix(tar_pose.reshape(bs * n, j, 6))
tar_pose = rc.matrix_to_axis_angle(tar_pose).reshape(bs * n, j * 3)
# Generate SMPLX vertices and joints
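                # transl and expression are zeroed out (x - x) so the joint-based metrics
                # below depend only on body articulation, not global translation or face.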
vertices_rec = self.smplx(
betas=tar_beta.reshape(bs * n, 300),
transl=rec_trans.reshape(bs * n, 3) - rec_trans.reshape(bs * n, 3),
expression=tar_exps.reshape(bs * n, 100)
- tar_exps.reshape(bs * n, 100),
jaw_pose=rec_pose[:, 66:69],
global_orient=rec_pose[:, :3],
body_pose=rec_pose[:, 3 : 21 * 3 + 3],
left_hand_pose=rec_pose[:, 25 * 3 : 40 * 3],
right_hand_pose=rec_pose[:, 40 * 3 : 55 * 3],
return_joints=True,
leye_pose=rec_pose[:, 69:72],
reye_pose=rec_pose[:, 72:75],
)
joints_rec = (
vertices_rec["joints"]
.detach()
.cpu()
.numpy()
.reshape(bs, n, 127 * 3)[0, :n, : 55 * 3]
)
# Calculate L1 diversity
if hasattr(self, "l1_calculator"):
_ = self.l1_calculator.run(joints_rec)
# Calculate alignment for single batch
if (
hasattr(self, "alignmenter")
and self.alignmenter is not None
and bs == 1
and mode != "test_render"
):
in_audio_eval, sr = librosa.load(
self.cfg.data.data_path
+ "wave16k/"
+ test_seq_list.iloc[its]["id"]
+ ".wav"
)
in_audio_eval = librosa.resample(
in_audio_eval, orig_sr=sr, target_sr=self.cfg.data.audio_sr
)
a_offset = int(
self.align_mask
* (self.cfg.data.audio_sr / self.cfg.data.pose_fps)
)
onset_bt = self.alignmenter.load_audio(
in_audio_eval[
: int(self.cfg.data.audio_sr / self.cfg.data.pose_fps * n)
],
a_offset,
len(in_audio_eval) - a_offset,
True,
)
beat_vel = self.alignmenter.load_pose(
joints_rec, self.align_mask, n - self.align_mask, 30, True
)
align += self.alignmenter.calculate_align(
onset_bt, beat_vel, 30
) * (n - 2 * self.align_mask)
# Mode-specific processing
if mode == "test" and save_results:
# Calculate facial losses for test mode
vertices_rec_face = self.smplx(
betas=tar_beta.reshape(bs * n, 300),
transl=rec_trans.reshape(bs * n, 3)
- rec_trans.reshape(bs * n, 3),
expression=rec_exps.reshape(bs * n, 100),
jaw_pose=rec_pose[:, 66:69],
global_orient=rec_pose[:, :3] - rec_pose[:, :3],
body_pose=rec_pose[:, 3 : 21 * 3 + 3]
- rec_pose[:, 3 : 21 * 3 + 3],
left_hand_pose=rec_pose[:, 25 * 3 : 40 * 3]
- rec_pose[:, 25 * 3 : 40 * 3],
right_hand_pose=rec_pose[:, 40 * 3 : 55 * 3]
- rec_pose[:, 40 * 3 : 55 * 3],
return_verts=True,
return_joints=True,
leye_pose=rec_pose[:, 69:72] - rec_pose[:, 69:72],
reye_pose=rec_pose[:, 72:75] - rec_pose[:, 72:75],
)
vertices_tar_face = self.smplx(
betas=tar_beta.reshape(bs * n, 300),
transl=tar_trans.reshape(bs * n, 3)
- tar_trans.reshape(bs * n, 3),
expression=tar_exps.reshape(bs * n, 100),
jaw_pose=tar_pose[:, 66:69],
global_orient=tar_pose[:, :3] - tar_pose[:, :3],
body_pose=tar_pose[:, 3 : 21 * 3 + 3]
- tar_pose[:, 3 : 21 * 3 + 3],
left_hand_pose=tar_pose[:, 25 * 3 : 40 * 3]
- tar_pose[:, 25 * 3 : 40 * 3],
right_hand_pose=tar_pose[:, 40 * 3 : 55 * 3]
- tar_pose[:, 40 * 3 : 55 * 3],
return_verts=True,
return_joints=True,
leye_pose=tar_pose[:, 69:72] - tar_pose[:, 69:72],
reye_pose=tar_pose[:, 72:75] - tar_pose[:, 72:75],
)
facial_rec = (
vertices_rec_face["vertices"].reshape(1, n, -1)[0, :n].cpu()
)
facial_tar = (
vertices_tar_face["vertices"].reshape(1, n, -1)[0, :n].cpu()
)
face_vel_loss = self.vel_loss(
facial_rec[1:, :] - facial_tar[:-1, :],
facial_tar[1:, :] - facial_tar[:-1, :],
)
l2 = self.reclatent_loss(facial_rec, facial_tar)
l2_all += l2.item() * n
lvel += face_vel_loss.item() * n
# Save results if needed
if save_results:
if mode == "test":
# Save NPZ files for test mode
tar_pose_np = tar_pose.detach().cpu().numpy()
rec_pose_np = rec_pose.detach().cpu().numpy()
rec_trans_np = (
rec_trans.detach().cpu().numpy().reshape(bs * n, 3)
)
rec_exp_np = (
rec_exps.detach().cpu().numpy().reshape(bs * n, 100)
)
tar_exp_np = (
tar_exps.detach().cpu().numpy().reshape(bs * n, 100)
)
tar_trans_np = (
tar_trans.detach().cpu().numpy().reshape(bs * n, 3)
)
gt_npz = np.load(
self.cfg.data.data_path
+ self.cfg.data.pose_rep
+ "/"
+ test_seq_list.iloc[its]["id"]
+ ".npz",
allow_pickle=True,
)
np.savez(
results_save_path
+ "gt_"
+ test_seq_list.iloc[its]["id"]
+ ".npz",
betas=gt_npz["betas"],
poses=tar_pose_np,
expressions=tar_exp_np,
trans=tar_trans_np,
model="smplx2020",
gender="neutral",
mocap_frame_rate=30,
)
np.savez(
results_save_path
+ "res_"
+ test_seq_list.iloc[its]["id"]
+ ".npz",
betas=gt_npz["betas"],
poses=rec_pose_np,
expressions=rec_exp_np,
trans=rec_trans_np,
model="smplx2020",
gender="neutral",
mocap_frame_rate=30,
)
elif mode == "test_render":
# Save results and render for test_render mode
audio_name = loaded_data["audio_name"][0]
rec_pose_np = rec_pose.detach().cpu().numpy()
rec_trans_np = (
rec_trans.detach().cpu().numpy().reshape(bs * n, 3)
)
rec_exp_np = (
rec_exps.detach().cpu().numpy().reshape(bs * n, 100)
)
gt_npz = np.load(
"./demo/examples/2_scott_0_1_1.npz", allow_pickle=True
)
file_name = audio_name.split("/")[-1].split(".")[0]
results_npz_file_save_path = (
results_save_path + f"result_{file_name}.npz"
)
np.savez(
results_npz_file_save_path,
betas=gt_npz["betas"],
poses=rec_pose_np,
expressions=rec_exp_np,
trans=rec_trans_np,
model="smplx2020",
gender="neutral",
mocap_frame_rate=30,
)
render_vid_path = other_tools_hf.render_one_sequence_no_gt(
results_npz_file_save_path,
results_save_path,
audio_name,
self.cfg.data_path_1 + "smplx_models/",
use_matplotlib=False,
args=self.cfg,
)
total_length += n
return {
"total_length": total_length,
"align": align,
"latent_out": latent_out,
"latent_ori": latent_ori,
"l2_all": l2_all,
"lvel": lvel,
"start_time": start_time,
}
def val(self, epoch):
self.tracker.reset()
results = self._common_test_inference(
self.test_loader, epoch, mode="val", max_iterations=15
)
total_length = results["total_length"]
align = results["align"]
latent_out = results["latent_out"]
latent_ori = results["latent_ori"]
l2_all = results["l2_all"]
lvel = results["lvel"]
start_time = results["start_time"]
logger.info(f"l2 loss: {l2_all/total_length:.10f}")
logger.info(f"lvel loss: {lvel/total_length:.10f}")
latent_out_all = np.concatenate(latent_out, axis=0)
latent_ori_all = np.concatenate(latent_ori, axis=0)
fgd = data_tools.FIDCalculator.frechet_distance(latent_out_all, latent_ori_all)
logger.info(f"fgd score: {fgd}")
self.tracker.update_meter("fgd", "val", fgd)
align_avg = align / (total_length - 2 * len(self.test_loader) * self.align_mask)
logger.info(f"align score: {align_avg}")
self.tracker.update_meter("bc", "val", align_avg)
l1div = self.l1_calculator.avg()
logger.info(f"l1div score: {l1div}")
self.tracker.update_meter("l1div", "val", l1div)
self.val_recording(epoch)
end_time = time.time() - start_time
logger.info(
f"total inference time: {int(end_time)} s for {int(total_length/self.cfg.data.pose_fps)} s motion"
)
def test_clip(self, epoch):
self.tracker.reset()
# Test on CLIP dataset
results_clip = self._common_test_inference(
self.test_clip_loader, epoch, mode="test_clip"
)
total_length_clip = results_clip["total_length"]
latent_out_clip = results_clip["latent_out"]
latent_ori_clip = results_clip["latent_ori"]
start_time = results_clip["start_time"]
latent_out_all_clip = np.concatenate(latent_out_clip, axis=0)
latent_ori_all_clip = np.concatenate(latent_ori_clip, axis=0)
fgd_clip = data_tools.FIDCalculator.frechet_distance(
latent_out_all_clip, latent_ori_all_clip
)
logger.info(f"test_clip fgd score: {fgd_clip}")
self.tracker.update_meter("test_clip_fgd", "val", fgd_clip)
current_time = time.time()
test_clip_time = current_time - start_time
logger.info(
f"total test_clip inference time: {int(test_clip_time)} s for {int(total_length_clip/self.cfg.data.pose_fps)} s motion"
)
# Test on regular test dataset for recording
results_test = self._common_test_inference(
self.test_loader, epoch, mode="test_clip"
)
total_length = results_test["total_length"]
align = results_test["align"]
latent_out = results_test["latent_out"]
latent_ori = results_test["latent_ori"]
latent_out_all = np.concatenate(latent_out, axis=0)
latent_ori_all = np.concatenate(latent_ori, axis=0)
fgd = data_tools.FIDCalculator.frechet_distance(latent_out_all, latent_ori_all)
logger.info(f"fgd score: {fgd}")
self.tracker.update_meter("fgd", "val", fgd)
align_avg = align / (total_length - 2 * len(self.test_loader) * self.align_mask)
logger.info(f"align score: {align_avg}")
self.tracker.update_meter("bc", "val", align_avg)
l1div = self.l1_calculator.avg()
logger.info(f"l1div score: {l1div}")
self.tracker.update_meter("l1div", "val", l1div)
self.val_recording(epoch)
end_time = time.time() - current_time
logger.info(
f"total inference time: {int(end_time)} s for {int(total_length/self.cfg.data.pose_fps)} s motion"
)
def test(self, epoch):
results_save_path = self.checkpoint_path + f"/{epoch}/"
os.makedirs(results_save_path, exist_ok=True)
results = self._common_test_inference(
self.test_loader, epoch, mode="test", save_results=True
)
total_length = results["total_length"]
align = results["align"]
latent_out = results["latent_out"]
latent_ori = results["latent_ori"]
l2_all = results["l2_all"]
lvel = results["lvel"]
start_time = results["start_time"]
logger.info(f"l2 loss: {l2_all/total_length:.10f}")
logger.info(f"lvel loss: {lvel/total_length:.10f}")
latent_out_all = np.concatenate(latent_out, axis=0)
latent_ori_all = np.concatenate(latent_ori, axis=0)
fgd = data_tools.FIDCalculator.frechet_distance(latent_out_all, latent_ori_all)
logger.info(f"fgd score: {fgd}")
self.test_recording("fgd", fgd, epoch)
align_avg = align / (total_length - 2 * len(self.test_loader) * self.align_mask)
logger.info(f"align score: {align_avg}")
self.test_recording("bc", align_avg, epoch)
l1div = self.l1_calculator.avg()
logger.info(f"l1div score: {l1div}")
self.test_recording("l1div", l1div, epoch)
end_time = time.time() - start_time
logger.info(
f"total inference time: {int(end_time)} s for {int(total_length/self.cfg.data.pose_fps)} s motion"
)
def test_render(self, epoch):
        """
        Given input audio and text, generate motion and render it to video;
        no losses or metrics are computed.
        """
        import platform
        if platform.system() == "Linux":
            os.environ["PYOPENGL_PLATFORM"] = "egl"
results = self._common_test_inference(
self.test_loader, epoch, mode="test_render", save_results=True
)
def load_checkpoint(self, checkpoint):
# checkpoint is already a dict, do NOT call torch.load again!
        try:
            ckpt_state_dict = checkpoint["model_state_dict"]
        except KeyError:
            ckpt_state_dict = checkpoint["model_state"]
# remove 'audioEncoder' from the state_dict due to legacy issues
ckpt_state_dict = {
k: v
for k, v in ckpt_state_dict.items()
if "modality_encoder.audio_encoder." not in k
}
self.model.load_state_dict(ckpt_state_dict, strict=False)
        try:
            self.opt.load_state_dict(checkpoint["optimizer_state_dict"])
        except (KeyError, ValueError):
            logger.warning("No optimizer state loaded!")
if (
"scheduler_state_dict" in checkpoint
and checkpoint["scheduler_state_dict"] is not None
):
self.opt_s.load_state_dict(checkpoint["scheduler_state_dict"])
if "val_best" in checkpoint:
self.val_best = checkpoint["val_best"]
logger.info("Checkpoint loaded successfully.")