import gradio as gr
import zipfile
import shutil
from pathlib import Path
import json
import os
import traceback
import gc
import torch
import spaces

# Import your modules
from engine import compute_mapss_measures
from models import get_model_config, cleanup_all_models
from config import DEFAULT_ALPHA
from utils import clear_gpu_memory


def process_audio_files_cpu(zip_file, model_name, layer, alpha):
    """Process an uploaded ZIP file containing audio mixtures - CPU part."""
    if zip_file is None:
        return None, "Please upload a ZIP file", None

    try:
        # Use a fixed extraction path
        extract_path = Path("/tmp/mapss_extract")
        if extract_path.exists():
            shutil.rmtree(extract_path)
        extract_path.mkdir(parents=True)

        # Extract ZIP
        with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
            zip_ref.extractall(extract_path)

        # Find references and outputs directories
        refs_dir = None
        outs_dir = None
        for item in extract_path.iterdir():
            if item.is_dir():
                if item.name.lower() in ['references', 'refs', 'reference']:
                    refs_dir = item
                elif item.name.lower() in ['outputs', 'outs', 'output', 'separated']:
                    outs_dir = item

        # Check one level deeper if not found
        if refs_dir is None or outs_dir is None:
            for item in extract_path.iterdir():
                if item.is_dir():
                    for subitem in item.iterdir():
                        if subitem.is_dir():
                            if subitem.name.lower() in ['references', 'refs', 'reference']:
                                refs_dir = subitem
                            elif subitem.name.lower() in ['outputs', 'outs', 'output', 'separated']:
                                outs_dir = subitem

        if refs_dir is None or outs_dir is None:
            return None, "Could not find 'references' and 'outputs' directories in the ZIP file", None

        # Get audio files
        ref_files = sorted(refs_dir.glob("*.wav"))
        out_files = sorted(outs_dir.glob("*.wav"))

        if len(ref_files) == 0:
            return None, "No reference WAV files found", None
        if len(out_files) == 0:
            return None, "No output WAV files found", None
        if len(ref_files) != len(out_files):
            return None, (
                f"Number of reference files ({len(ref_files)}) must match number of "
                f"output files ({len(out_files)}). Files must be in the same order."
            ), None

        # Create manifest
        manifest = [{
            "mixture_id": "uploaded_mixture",
            "references": [str(f) for f in ref_files],
            "systems": {
                "uploaded_system": [str(f) for f in out_files]
            }
        }]

        # Validate model
        allowed_models = set(get_model_config(0).keys())
        if model_name not in allowed_models:
            return None, f"Invalid model. Allowed: {', '.join(sorted(allowed_models))}", None

        # Set layer
        if model_name == "raw":
            layer_final = 0
        else:
            model_defaults = {
                "wavlm": 24,
                "wav2vec2": 24,
                "hubert": 24,
                "wavlm_base": 12,
                "wav2vec2_base": 12,
                "hubert_base": 12,
                "wav2vec2_xlsr": 24
            }
            layer_final = layer if layer is not None else model_defaults.get(model_name, 12)

        # Return preprocessed data for GPU processing
        return manifest, layer_final, alpha

    except Exception as e:
        error_msg = f"Error in preprocessing: {str(e)}\n{traceback.format_exc()}"
        return None, error_msg, None


@spaces.GPU(duration=300)
def process_audio_files_gpu(manifest, model_name, layer_final, alpha):
    """GPU processing part - only called when GPU is allocated."""
    if manifest is None:
        return None, "Invalid input data"

    try:
        # Force single-GPU mode in the Spaces environment;
        # the spaces decorator handles GPU allocation
        max_gpus = 1 if torch.cuda.is_available() else 0

        # Run the experiment with a forced single GPU
        results_dir = compute_mapss_measures(
            models=[model_name],
            mixtures=manifest,
            layer=layer_final,
            alpha=alpha,
            verbose=True,
            max_gpus=max_gpus,  # Force single GPU
            add_ci=False  # Disable CI for faster processing in the demo
        )

        # Create the output ZIP at a fixed location
        output_zip = Path("/tmp/mapss_results.zip")
        with zipfile.ZipFile(output_zip, 'w') as zipf:
            results_path = Path(results_dir)
            files_added = 0
            # Add all files from the results directory
            for file_path in results_path.rglob("*"):
                if file_path.is_file():
                    arcname = str(file_path.relative_to(results_path.parent))
                    zipf.write(file_path, arcname)
                    files_added += 1

        if output_zip.exists() and files_added > 0:
            return str(output_zip), (
                f"Processing completed! Created ZIP with {files_added} files. "
                "Note: Output files must be in the same order as reference files."
            )
        else:
            return None, "Processing completed but no output files were generated. Check if embeddings were computed."

    except Exception as e:
        error_msg = f"Error in GPU processing: {str(e)}\n{traceback.format_exc()}"
        return None, error_msg

    finally:
        # Cleanup is critical in the Spaces environment
        cleanup_all_models()
        clear_gpu_memory()
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


def process_audio_files(zip_file, model_name, layer, alpha):
    """Main processing function that combines the CPU and GPU parts."""
    # First, do CPU preprocessing
    manifest, layer_final, alpha_processed = process_audio_files_cpu(
        zip_file, model_name, layer, alpha
    )

    if manifest is None:
        return None, layer_final  # layer_final contains the error message in this case

    # Then do GPU processing
    return process_audio_files_gpu(manifest, model_name, layer_final, alpha_processed)


def create_interface():
    with gr.Blocks(title="MAPSS - Manifold-based Assessment of Perceptual Source Separation") as demo:
        gr.Markdown("""
# MAPSS: Manifold-based Assessment of Perceptual Source Separation

Granular evaluation of speech and music source separation with the MAPSS measures:
- **Perceptual Matching (PM)**: Measures how closely an output perceptually aligns with its reference. Range: 0-1, higher is better.
- **Perceptual Similarity (PS)**: Measures how well an output is separated from its interfering references. Range: 0-1, higher is better.

## ⚠️ IMPORTANT: File Order Requirements
**Output files MUST be in the same order as reference files!**
- If references are: `speaker1.wav`, `speaker2.wav`, `speaker3.wav`
- Then outputs must be: `output1.wav`, `output2.wav`, `output3.wav`
- Where `output1` corresponds to `speaker1`, `output2` to `speaker2`, etc.
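
## Example: Assembling the ZIP Programmatically

Building the archive in code is an easy way to guarantee that the reference/output pairing is preserved. Below is a minimal sketch using Python's standard `zipfile` module; the local folder names are illustrative, and the archive layout it produces matches the Input Format described in the next section:

```python
import zipfile
from pathlib import Path

# Illustrative local folders holding the clean sources and the separated outputs
refs = sorted(Path("my_references").glob("*.wav"))
outs = sorted(Path("my_outputs").glob("*.wav"))
assert len(refs) == len(outs), "references and outputs must pair up one-to-one"

with zipfile.ZipFile("your_mixture.zip", "w") as zf:
    for f in refs:
        zf.write(f, f"references/{f.name}")  # original clean sources
    for f in outs:
        zf.write(f, f"outputs/{f.name}")     # separated outputs, same order
```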

## Input Format
Upload a ZIP file containing:
```
your_mixture.zip
├── references/           # Original clean sources
│   ├── speaker1.wav
│   ├── speaker2.wav
│   └── ...
└── outputs/              # Separated outputs (SAME ORDER as references)
    ├── separated1.wav    # Must correspond to speaker1.wav
    ├── separated2.wav    # Must correspond to speaker2.wav
    └── ...
```

### Audio Requirements
- Format: .wav files
- Sample rate: Any (automatically resampled to 16kHz)
- Channels: Mono or stereo (converted to mono)
- **Number of files: Equal number of references and outputs**
- **Order: Output files must be in the same order as reference files**

## Output Format
The tool generates a ZIP file containing:
- `ps_scores_{model}.csv`: PS scores for each source over time
- `pm_scores_{model}.csv`: PM scores for each source over time
- `params.json`: Parameters used
- `manifest_canonical.json`: File mapping and processing details

### Score Interpretation
- **Valid scores**: Only computed when at least 2 speakers are active in a frame
- **NaN values**: Appear for non-active speakers, or when fewer than 2 speakers are active in the frame
- **Time resolution**: 20ms frames

## Available Models
| Model | Description | Default Layer | Use Case |
|-------|-------------|---------------|----------|
| `raw` | Raw waveform features | N/A | Baseline comparison |
| `wavlm` | WavLM Large | 24 | Strong performance |
| `wav2vec2` | Wav2Vec2 Large | 24 | Best overall performance |
| `hubert` | HuBERT Large | 24 | |
| `wavlm_base` | WavLM Base | 12 | |
| `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
| `hubert_base` | HuBERT Base | 12 | |
| `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |

## Parameters
- **Model**: Select the embedding model for feature extraction
- **Layer**: Which transformer layer to use (auto-selected by default)
- **Alpha**: Diffusion maps parameter (0.0-1.0, default: 1.0)
  - 0.0 = No normalization
  - 1.0 = Full normalization (recommended)

## Processing Notes
- The system automatically detects which speakers are active in each frame
- PS/PM scores are only computed between active speakers
- Processing time scales with number of sources and audio length
- GPU acceleration is automatically used when available
- **Note**: This Hugging Face Space runs with a single GPU allocation

## Citation
If you use MAPSS, please cite:
```bibtex
@article{ivry2025mapss,
  title={MAPSS: Manifold-based Assessment of Perceptual Source Separation},
  author={Ivry, Amir and Cornell, Samuele and Watanabe, Shinji},
  journal={arXiv preprint arXiv:2509.09212},
  year={2025}
}
```

## License
- Code: MIT License
- Paper: CC-BY-4.0

## Support
For issues, questions, or contributions, please visit the [GitHub repository](https://github.com/amir-ivry/MAPSS-measures).
""")

        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload ZIP file with audio mixtures",
                    file_types=[".zip"],
                    type="filepath"
                )
                model_dropdown = gr.Dropdown(
                    choices=["raw", "wavlm", "wav2vec2", "hubert", "wavlm_base",
                             "wav2vec2_base", "hubert_base", "wav2vec2_xlsr"],
                    value="wav2vec2_base",
                    label="Select embedding model"
                )
                layer_slider = gr.Slider(
                    minimum=0,
                    maximum=12,
                    step=1,
                    value=12,
                    label="Layer (automatically set to model default)",
                    interactive=True
                )
                alpha_slider = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
                    value=DEFAULT_ALPHA,
                    label="Diffusion maps alpha parameter"
                )

                def update_layer_slider(model_name):
                    """Update the layer slider based on the selected model."""
                    model_configs = {
                        "raw": {"maximum": 0, "value": 0, "interactive": False},
                        "wavlm": {"maximum": 24, "value": 24, "interactive": True},
                        "wav2vec2": {"maximum": 24, "value": 24, "interactive": True},
                        "hubert": {"maximum": 24, "value": 24, "interactive": True},
                        "wav2vec2_xlsr": {"maximum": 24, "value": 24, "interactive": True},
                        "wavlm_base": {"maximum": 12, "value": 12, "interactive": True},
                        "wav2vec2_base": {"maximum": 12, "value": 12, "interactive": True},
                        "hubert_base": {"maximum": 12, "value": 12, "interactive": True}
                    }
                    config = model_configs.get(model_name, {"maximum": 12, "value": 12, "interactive": True})
                    return gr.Slider(
                        minimum=0,
                        maximum=config["maximum"],
                        value=config["value"],
                        step=1,
                        label=(
                            f"Layer (max: {config['maximum']}, default: {config['value']})"
                            if config["interactive"]
                            else "Layer (not applicable for raw features)"
                        ),
                        interactive=config["interactive"]
                    )

                model_dropdown.change(
                    fn=update_layer_slider,
                    inputs=[model_dropdown],
                    outputs=[layer_slider]
                )

                process_btn = gr.Button("Process Audio Files", variant="primary")

            with gr.Column():
                output_file = gr.File(
                    label="Download Results (ZIP)",
                    type="filepath"
                )
                status_text = gr.Textbox(
                    label="Status",
                    lines=3,
                    max_lines=10
                )

        process_btn.click(
            fn=process_audio_files,
            inputs=[file_input, model_dropdown, layer_slider, alpha_slider],
            outputs=[output_file, status_text]
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()