import gradio as gr
import zipfile
import shutil
from pathlib import Path
import traceback
import gc
import torch
import spaces
# MAPSS project modules
from engine import compute_mapss_measures
from models import get_model_config, cleanup_all_models
from config import DEFAULT_ALPHA
from utils import clear_gpu_memory
def process_audio_files_cpu(zip_file, model_name, layer, alpha):
"""Process uploaded ZIP file containing audio mixtures - CPU part."""
if zip_file is None:
return None, "Please upload a ZIP file", None
try:
# Use a fixed extraction path
extract_path = Path("/tmp/mapss_extract")
if extract_path.exists():
shutil.rmtree(extract_path)
extract_path.mkdir(parents=True)
        # Extract ZIP. gr.File(type="filepath") passes a plain path string;
        # older Gradio versions pass a tempfile-like object with a .name
        # attribute, so accept both.
        zip_path = zip_file if isinstance(zip_file, str) else zip_file.name
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
# Find references and outputs directories
refs_dir = None
outs_dir = None
for item in extract_path.iterdir():
if item.is_dir():
if item.name.lower() in ['references', 'refs', 'reference']:
refs_dir = item
elif item.name.lower() in ['outputs', 'outs', 'output', 'separated']:
outs_dir = item
# Check one level deeper if not found
if refs_dir is None or outs_dir is None:
for item in extract_path.iterdir():
if item.is_dir():
for subitem in item.iterdir():
if subitem.is_dir():
if subitem.name.lower() in ['references', 'refs', 'reference']:
refs_dir = subitem
elif subitem.name.lower() in ['outputs', 'outs', 'output', 'separated']:
outs_dir = subitem
if refs_dir is None or outs_dir is None:
return None, "Could not find 'references' and 'outputs' directories in the ZIP file", None
        # Get audio files (sorted so references and outputs pair up by index)
        ref_files = sorted(refs_dir.glob("*.wav"))
        out_files = sorted(outs_dir.glob("*.wav"))
if len(ref_files) == 0:
return None, "No reference WAV files found", None
if len(out_files) == 0:
return None, "No output WAV files found", None
if len(ref_files) != len(out_files):
return None, f"Number of reference files ({len(ref_files)}) must match number of output files ({len(out_files)}). Files must be in the same order.", None
# Create manifest
manifest = [{
"mixture_id": "uploaded_mixture",
"references": [str(f) for f in ref_files],
"systems": {
"uploaded_system": [str(f) for f in out_files]
}
}]
# Validate model
allowed_models = set(get_model_config(0).keys())
if model_name not in allowed_models:
return None, f"Invalid model. Allowed: {', '.join(sorted(allowed_models))}", None
# Set layer
if model_name == "raw":
layer_final = 0
else:
model_defaults = {
"wavlm": 24, "wav2vec2": 24, "hubert": 24,
"wavlm_base": 12, "wav2vec2_base": 12, "hubert_base": 12,
"wav2vec2_xlsr": 24
}
layer_final = layer if layer is not None else model_defaults.get(model_name, 12)
# Return preprocessed data for GPU processing
return manifest, layer_final, alpha
except Exception as e:
error_msg = f"Error in preprocessing: {str(e)}\n{traceback.format_exc()}"
return None, error_msg, None
@spaces.GPU(duration=300)
def process_audio_files_gpu(manifest, model_name, layer_final, alpha):
"""GPU processing part - only called when GPU is allocated."""
if manifest is None:
return None, "Invalid input data"
try:
# Force single GPU mode in Spaces environment
# The spaces decorator handles GPU allocation
max_gpus = 1 if torch.cuda.is_available() else 0
# Run experiment with forced single GPU
results_dir = compute_mapss_measures(
models=[model_name],
mixtures=manifest,
layer=layer_final,
alpha=alpha,
verbose=True,
max_gpus=max_gpus, # Force single GPU
            add_ci=False  # Disable confidence intervals for a faster demo run
)
# Create output ZIP at a fixed location
output_zip = Path("/tmp/mapss_results.zip")
with zipfile.ZipFile(output_zip, 'w') as zipf:
results_path = Path(results_dir)
files_added = 0
# Add all files from results
for file_path in results_path.rglob("*"):
if file_path.is_file():
arcname = str(file_path.relative_to(results_path.parent))
zipf.write(file_path, arcname)
files_added += 1
if output_zip.exists() and files_added > 0:
return str(output_zip), f"Processing completed! Created ZIP with {files_added} files. Note: Output files must be in the same order as reference files."
        else:
            return None, "Processing completed but no output files were generated. Check whether embeddings were computed."
except Exception as e:
error_msg = f"Error in GPU processing: {str(e)}\n{traceback.format_exc()}"
return None, error_msg
finally:
# Cleanup is critical in Spaces environment
cleanup_all_models()
clear_gpu_memory()
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
def process_audio_files(zip_file, model_name, layer, alpha):
"""Main processing function that combines CPU and GPU parts."""
# First, do CPU preprocessing
manifest, layer_final, alpha_processed = process_audio_files_cpu(
zip_file, model_name, layer, alpha
)
if manifest is None:
return None, layer_final # layer_final contains error message in this case
# Then do GPU processing
return process_audio_files_gpu(manifest, model_name, layer_final, alpha_processed)
def create_interface():
    with gr.Blocks(title="MAPSS - Manifold-based Assessment of Perceptual Source Separation") as demo:
gr.Markdown("""
# MAPSS: Manifold-based Assessment of Perceptual Source Separation
Granular evaluation of speech and music source separation with the MAPSS measures:
- **Perceptual Matching (PM)**: Measures how closely an output perceptually aligns with its reference. Range: 0-1, higher is better.
- **Perceptual Similarity (PS)**: Measures how well an output is separated from its interfering references. Range: 0-1, higher is better.
## ⚠️ IMPORTANT: File Order Requirements
**Output files MUST be in the same order as reference files!**
- If references are: `speaker1.wav`, `speaker2.wav`, `speaker3.wav`
- Then outputs must be: `output1.wav`, `output2.wav`, `output3.wav`
- Where `output1` corresponds to `speaker1`, `output2` to `speaker2`, etc.
## Input Format
Upload a ZIP file containing:
```
your_mixture.zip
├── references/ # Original clean sources
│ ├── speaker1.wav
│ ├── speaker2.wav
│ └── ...
└── outputs/ # Separated outputs (SAME ORDER as references)
├── separated1.wav # Must correspond to speaker1.wav
├── separated2.wav # Must correspond to speaker2.wav
└── ...
```
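
For convenience, here is a minimal sketch of packaging such a ZIP in Python (the file paths are placeholders; adjust them to your own data):

```python
import zipfile
from pathlib import Path

def build_mapss_zip(ref_paths, out_paths, zip_path="your_mixture.zip"):
    """Package reference/output WAVs in the layout expected above.

    ref_paths and out_paths must be in corresponding order.
    """
    assert len(ref_paths) == len(out_paths), "references and outputs must pair up"
    with zipfile.ZipFile(zip_path, "w") as zf:
        for p in ref_paths:
            zf.write(p, f"references/{Path(p).name}")
        for p in out_paths:
            zf.write(p, f"outputs/{Path(p).name}")
    return zip_path

# Example with placeholder file names:
# build_mapss_zip(["speaker1.wav", "speaker2.wav"], ["separated1.wav", "separated2.wav"])
```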
### Audio Requirements
- Format: .wav files
- Sample rate: Any (automatically resampled to 16kHz)
- Channels: Mono or stereo (converted to mono)
- **Number of files: Equal number of references and outputs**
- **Order: Output files must be in the same order as reference files**
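
Resampling and mono conversion happen automatically, but if you prefer to normalize files yourself beforehand, a sketch using torchaudio (assumed installed) could look like this:

```python
import torchaudio

def to_mono_16k(in_path, out_path):
    # Load, downmix to mono, resample to 16 kHz, and save.
    wav, sr = torchaudio.load(in_path)   # (channels, samples)
    wav = wav.mean(dim=0, keepdim=True)  # stereo -> mono
    if sr != 16000:
        wav = torchaudio.functional.resample(wav, sr, 16000)
    torchaudio.save(out_path, wav, 16000)
```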
## Output Format
The tool generates a ZIP file containing:
- `ps_scores_{model}.csv`: PS scores for each source over time
- `pm_scores_{model}.csv`: PM scores for each source over time
- `params.json`: Parameters used
- `manifest_canonical.json`: File mapping and processing details
### Score Interpretation
- **Valid scores**: Only computed when at least 2 speakers are active in a frame
- **NaN values**: Appear for non-active speakers, or when fewer than 2 speakers are active in the frame.
- **Time resolution**: 20ms frames
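
A minimal sketch for inspecting the score CSVs with pandas, assuming one column per source and one row per 20ms frame (check the actual headers in your download):

```python
import pandas as pd

ps = pd.read_csv("ps_scores_wav2vec2_base.csv")  # model name depends on your run
# NaN-aware summary: frames where a source is inactive (or where fewer than
# 2 speakers are active) are NaN and are skipped by pandas by default.
print(ps.mean(numeric_only=True))
print(f"valid frames per column:\n{ps.notna().sum()}")
```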
## Available Models
| Model | Description | Default Layer | Use Case |
|-------|-------------|---------------|----------|
| `raw` | Raw waveform features | N/A | Baseline comparison |
| `wavlm` | WavLM Large | 24 | Strong performance |
| `wav2vec2` | Wav2Vec2 Large | 24 | Best overall performance |
| `hubert` | HuBERT Large | 24 | |
| `wavlm_base` | WavLM Base | 12 | |
| `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
| `hubert_base` | HuBERT Base | 12 | |
| `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |
## Parameters
- **Model**: Select the embedding model for feature extraction
- **Layer**: Which transformer layer to use (auto-selected by default)
- **Alpha**: Diffusion maps parameter (0.0-1.0, default: 1.0)
- 0.0 = No normalization
- 1.0 = Full normalization (recommended)
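
For intuition, alpha controls the density normalization of the diffusion-maps kernel in the standard Coifman-Lafon formulation. This is an illustrative NumPy sketch, not necessarily the exact implementation used internally:

```python
import numpy as np

def alpha_normalize(K, alpha=1.0):
    # K: symmetric nonnegative affinity matrix (n x n)
    d = K.sum(axis=1)  # kernel density estimate per point
    K_alpha = K / np.outer(d**alpha, d**alpha)  # alpha=0 leaves K unchanged
    # Row-normalize to obtain the diffusion (Markov) operator
    return K_alpha / K_alpha.sum(axis=1, keepdims=True)
```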
## Processing Notes
- The system automatically detects which speakers are active in each frame
- PS/PM scores are only computed between active speakers
- Processing time scales with number of sources and audio length
- GPU acceleration is automatically used when available
- **Note**: This Hugging Face Space runs with a single GPU allocation
## Citation
If you use MAPSS, please cite:
```bibtex
@article{ivry2025mapss,
title={MAPSS: Manifold-based Assessment of Perceptual Source Separation},
author={Ivry, Amir and Cornell, Samuele and Watanabe, Shinji},
journal={arXiv preprint arXiv:2509.09212},
year={2025}
}
```
## License
Code: MIT License
Paper: CC-BY-4.0
## Support
For issues, questions, or contributions, please visit the [GitHub repository](https://github.com/amir-ivry/MAPSS-measures).
""")
with gr.Row():
with gr.Column():
file_input = gr.File(
label="Upload ZIP file with audio mixtures",
file_types=[".zip"],
type="filepath"
)
model_dropdown = gr.Dropdown(
choices=["raw", "wavlm", "wav2vec2", "hubert",
"wavlm_base", "wav2vec2_base", "hubert_base",
"wav2vec2_xlsr"],
value="wav2vec2_base",
label="Select embedding model"
)
layer_slider = gr.Slider(
minimum=0,
maximum=12,
step=1,
value=12,
label="Layer (automatically set to model default)",
interactive=True
)
alpha_slider = gr.Slider(
minimum=0.0,
maximum=1.0,
step=0.1,
value=DEFAULT_ALPHA,
label="Diffusion maps alpha parameter"
)
def update_layer_slider(model_name):
"""Update layer slider based on selected model"""
model_configs = {
"raw": {"maximum": 0, "value": 0, "interactive": False},
"wavlm": {"maximum": 24, "value": 24, "interactive": True},
"wav2vec2": {"maximum": 24, "value": 24, "interactive": True},
"hubert": {"maximum": 24, "value": 24, "interactive": True},
"wav2vec2_xlsr": {"maximum": 24, "value": 24, "interactive": True},
"wavlm_base": {"maximum": 12, "value": 12, "interactive": True},
"wav2vec2_base": {"maximum": 12, "value": 12, "interactive": True},
"hubert_base": {"maximum": 12, "value": 12, "interactive": True}
}
config = model_configs.get(model_name, {"maximum": 12, "value": 12, "interactive": True})
return gr.Slider(
minimum=0,
maximum=config["maximum"],
value=config["value"],
step=1,
label=f"Layer (max: {config['maximum']}, default: {config['value']})" if config["interactive"] else "Layer (not applicable for raw features)",
interactive=config["interactive"]
)
model_dropdown.change(
fn=update_layer_slider,
inputs=[model_dropdown],
outputs=[layer_slider]
)
process_btn = gr.Button("Process Audio Files", variant="primary")
with gr.Column():
output_file = gr.File(
label="Download Results (ZIP)",
type="filepath"
)
status_text = gr.Textbox(
label="Status",
lines=3,
max_lines=10
)
process_btn.click(
fn=process_audio_files,
inputs=[file_input, model_dropdown, layer_slider, alpha_slider],
outputs=[output_file, status_text]
)
return demo
if __name__ == "__main__":
demo = create_interface()
demo.launch()