Update app.py
app.py
CHANGED
@@ -199,6 +199,61 @@ def _infer_one(
 
     return processor.decode(out[0], skip_special_tokens=True).strip()
 
+@spaces.GPU
+def _get_face_embedding_casting(image: Image.Image) -> list[dict] | None:
+    """
+    Returns list of dicts:
+    [
+        {
+            "embedding": <list[float]>,
+            "face_crop": <PIL.Image>
+        },
+        ...
+    ]
+    """
+    try:
+        mtcnn, facenet = _load_face_models()
+        boxes, probs = mtcnn.detect(image)
+
+        if boxes is None:
+            return []
+
+        resultados = []
+        device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
+
+        for box in boxes:
+            x1, y1, x2, y2 = map(int, box)
+            face_crop = image.crop((x1, y1, x2, y2))
+
+            face_tensor = mtcnn(face_crop)
+            if face_tensor is None:
+                continue
+
+            face_tensor = face_tensor.unsqueeze(0).to(device)
+
+            with torch.no_grad():
+                emb = facenet(face_tensor).cpu().numpy()[0]
+
+            emb = emb / np.linalg.norm(emb)
+
+            resultados.append({
+                "embedding": emb.astype(float).tolist(),
+                "face_crop": face_crop
+            })
+
+        del mtcnn
+        del facenet
+
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+
+        return resultados
+
+    except Exception as e:
+        print(f"Face embedding failed: {e}")
+        return None
+
 @spaces.GPU
 def _get_face_embedding(
     image: Image.Image
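
The new _get_face_embedding_casting detects faces with MTCNN, crops each box, embeds every crop with FaceNet, and L2-normalizes each vector before returning it. Because the returned embeddings are unit-length, comparing two faces reduces to a dot product. A minimal sketch of that comparison, assuming faces_a and faces_b are return values of the function; the 0.6 cutoff is illustrative, not a value from app.py:

    import numpy as np

    # Unit-length embeddings: cosine similarity is just the dot product.
    a = np.asarray(faces_a[0]["embedding"])
    b = np.asarray(faces_b[0]["embedding"])
    similarity = float(a @ b)  # 1.0 means identical direction
    same_person = similarity > 0.6  # illustrative threshold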
@@ -314,6 +369,28 @@ def _get_scenes_extraction(
             "end": end_time.get_seconds()
         })
 
+    if len(scene_info) == 0:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, offset_frames)
+        ret, frame = cap.read()
+        if ret:
+            h, w = frame.shape[:2]
+
+            ch, cw = int(h * crop_ratio), int(w * crop_ratio)
+            cropped_frame = frame[ch:h-ch, cw:w-cw]
+
+            img_rgb = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2RGB)
+            images.append(Image.fromarray(img_rgb))
+
+            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+            fps = cap.get(cv2.CAP_PROP_FPS)
+            duration_seconds = total_frames / fps if fps > 0 else 0.0
+
+            scene_info.append({
+                "index": 1,
+                "start": 0.0,
+                "end": duration_seconds
+            })
+
     cap.release()
     return images, scene_info
 
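
This fallback handles clips in which PySceneDetect reports no cuts: the capture is rewound to offset_frames, a single center-cropped frame is decoded, and one synthetic scene spanning the whole video is appended, so callers always get at least one (image, scene) pair. For a cut-less 12-second clip, scene_info would come back as (values illustrative):

    scene_info = [{"index": 1, "start": 0.0, "end": 12.0}]

The fps > 0 guard keeps duration_seconds at 0.0 when OpenCV cannot report a frame rate, avoiding a ZeroDivisionError.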
@@ -595,12 +672,13 @@ def _extract_keyframes_every_second(
 
         # Resize cropped frame back to original resolution
         cropped = cv2.resize(cropped, (w, h))
+        cropped_rgb = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
 
         timestamp = frame_number / fps  # Timestamp of the extracted frame
 
         # Save temporary image for debugging (not returned)
         tmp_path = tmp_dir / f"frame_{sec:03d}.jpg"
-        cv2.imwrite(str(tmp_path),
+        cv2.imwrite(str(tmp_path), cv2.cvtColor(cropped_rgb, cv2.COLOR_RGB2BGR))
 
         # Append extracted frame and metadata
         images.append(cropped)
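
OpenCV decodes frames as BGR, so cropped_rgb gives an RGB copy of the resized crop, and the debug cv2.imwrite converts back because imwrite expects BGR input. Both conversions are plain channel swaps, so the round-trip writes the same bytes as saving cropped directly; a minimal sketch verifying that, with frame as an arbitrary HxWx3 uint8 array:

    import cv2
    import numpy as np

    # BGR<->RGB is an involution: converting there and back is a no-op.
    frame = np.random.randint(0, 256, (4, 4, 3), dtype=np.uint8)
    round_trip = cv2.cvtColor(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), cv2.COLOR_RGB2BGR)
    assert np.array_equal(frame, round_trip)

Also note that images.append(cropped) still appends the BGR array; as far as this hunk shows, cropped_rgb is only used for the temporary debug file.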
@@ -706,6 +784,25 @@ def describe_batch(
         temperature=temperature, context=context))
     return outputs
 
+def face_image_embedding_casting(image):
+    results = _get_face_embedding_casting(image)
+
+    if not results:
+        return [], []
+
+    # 1) List of cropped face images
+    face_crops = [r["face_crop"] for r in results]
+
+    # 2) List of embeddings (JSON-serializable)
+    face_embeddings = [
+        {
+            "index": i,
+            "embedding": r["embedding"]
+        }
+        for i, r in enumerate(results)
+    ]
+
+    return face_crops, face_embeddings
 
 def face_image_embedding(image: Image.Image) -> List[float] | None:
     """
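
face_image_embedding_casting adapts the detector's output to the two Gradio components wired up below: a list of PIL crops for the Gallery and a JSON-serializable list of {index, embedding} dicts. Since the embeddings are unit vectors, the pairwise cosine-similarity matrix for all faces in one image is a single matrix product. A minimal sketch, assuming image is a PIL image already loaded by the caller:

    import numpy as np

    crops, embs = face_image_embedding_casting(image)
    E = np.asarray([e["embedding"] for e in embs])
    # Rows are unit-length, so E @ E.T is the cosine-similarity matrix.
    similarity_matrix = E @ E.T  # shape (n_faces, n_faces), diagonal ~1.0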
@@ -963,6 +1060,32 @@ with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU", css=custom_css,theme=gr.
     )
     gr.Markdown("---")
 
+    # ---------------------
+    # Section: Facial embeddings casting
+    # ---------------------
+
+    gr.Markdown('<h2 style="text-align:center">Embeddings facials casting</h2>')
+
+    with gr.Row():
+        face_img = gr.Image(label="Imatge per embedding facial", type="pil")
+
+    with gr.Row():
+        face_btn = gr.Button("Obté embedding facial", variant="primary")
+
+    with gr.Row():
+        face_crops = gr.Gallery(label="Cares detectades", columns=3, height="auto")
+
+    with gr.Row():
+        face_embeddings = gr.JSON(label="Vectors d'embedding")
+
+    face_btn.click(
+        face_image_embedding_casting,  # your function
+        [face_img],
+        [face_crops, face_embeddings],  # now 2 outputs
+        api_name="face_image_embedding_casting",
+        concurrency_limit=1
+    )
+
     # ---------------------
     # Section: Facial embeddings
     # ---------------------
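
Registering api_name="face_image_embedding_casting" also exposes the endpoint to programmatic callers, and concurrency_limit=1 serializes requests so only one GPU job runs at a time. A minimal sketch of a remote call, assuming a recent gradio_client that provides handle_file; "user/space-name" and "face.jpg" are placeholders, not values from this repo:

    from gradio_client import Client, handle_file

    client = Client("user/space-name")  # placeholder Space id
    crops, embeddings = client.predict(
        handle_file("face.jpg"),  # placeholder local image path
        api_name="/face_image_embedding_casting",
    )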