[feature] support decode from embeddings
1 changed file: modeling_vqvae.py (+17 -0)
@@ -78,6 +78,11 @@ class VQVAE(PreTrainedModel):
         h = self.post_vq_conv(shift_dim(h, -1, 1))
         return self.decoder(h)
 
+    def decode_from_embeddings(self, embeddings):
+        # embeddings: [b, c, t, h, w]
+        encodings = self.codebook.search_indices(embeddings)
+        return self.decode(encodings)
+
     def forward(self, x):
         z = self.pre_vq_conv(self.encoder(x))
         vq_output = self.codebook(z)
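The new decode_from_embeddings entry point snaps continuous vectors in codebook space to their nearest codes (via Codebook.search_indices, added below) and runs them through the existing decoder. A minimal usage sketch; the checkpoint path and the embedding size of 256 are illustrative placeholders, not taken from the diff:

    import torch

    # Hypothetical setup: VQVAE subclasses PreTrainedModel, so from_pretrained applies.
    model = VQVAE.from_pretrained("path/to/checkpoint")
    model.eval()

    # Continuous vectors in codebook space, e.g. predicted by a prior model
    # rather than produced by the encoder: [b, c, t, h, w]
    embeddings = torch.randn(1, 256, 4, 16, 16)

    with torch.no_grad():
        video = model.decode_from_embeddings(embeddings)  # nearest codes -> decoder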
@@ -159,6 +164,18 @@ class Codebook(nn.Module):
         self.z_avg.data.copy_(_k_rand)
         self.N.data.copy_(torch.ones(self.n_codes))
 
+    def search_indices(self, z):
+        # z: [b, c, t, h, w]
+        flat_inputs = shift_dim(z, 1, -1).flatten(end_dim=-2)
+        distances = (flat_inputs ** 2).sum(dim=1, keepdim=True) \
+                    - 2 * flat_inputs @ self.embeddings.t() \
+                    + (self.embeddings.t() ** 2).sum(dim=0, keepdim=True)
+
+        encoding_indices = torch.argmin(distances, dim=1)
+        encoding_indices = encoding_indices.view(z.shape[0], *z.shape[2:])
+        return encoding_indices
+
+
     def forward(self, z):
         # z: [b, c, t, h, w]
         if self._need_init and self.training:
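search_indices is a batched nearest-neighbour lookup: shift_dim(z, 1, -1) (a repo helper) moves the channel axis to the end so that, after flattening, each row of flat_inputs is the c-dimensional vector at one (t, h, w) position, and the squared distances to all codes are computed with the expansion ||x - e||^2 = ||x||^2 - 2<x, e> + ||e||^2 rather than by materialising pairwise differences. A self-contained sketch of the same trick, checked against torch.cdist (shapes are illustrative):

    import torch

    flat_inputs = torch.randn(10, 64)  # 10 query vectors of dimension 64
    embeddings = torch.randn(512, 64)  # a codebook of 512 codes

    # ||x - e||^2 = ||x||^2 - 2 x.e + ||e||^2, broadcast to a [10, 512] matrix
    distances = (flat_inputs ** 2).sum(dim=1, keepdim=True) \
                - 2 * flat_inputs @ embeddings.t() \
                + (embeddings.t() ** 2).sum(dim=0, keepdim=True)

    # Matches explicit pairwise distances up to floating-point error
    reference = torch.cdist(flat_inputs, embeddings) ** 2
    assert torch.allclose(distances, reference, atol=1e-4)

    indices = torch.argmin(distances, dim=1)  # nearest code per query vector

The expansion avoids allocating an [N, n_codes, c] difference tensor, which matters at video resolutions where N = b*t*h*w.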