import gradio as gr
import zipfile
import shutil
from pathlib import Path
import json
import os
import traceback
import gc
import torch
import spaces

# Import your modules
from engine import compute_mapss_measures
from models import get_model_config, cleanup_all_models
from config import DEFAULT_ALPHA
from utils import clear_gpu_memory
def process_audio_files_cpu(zip_file, model_name, layer, alpha):
    """Process the uploaded ZIP file of audio mixtures -- CPU part."""
    if zip_file is None:
        return None, "Please upload a ZIP file", None
    try:
        # Use a fixed extraction path
        extract_path = Path("/tmp/mapss_extract")
        if extract_path.exists():
            shutil.rmtree(extract_path)
        extract_path.mkdir(parents=True)

        # Extract the ZIP (gr.File with type="filepath" passes a str path;
        # older Gradio versions pass a tempfile wrapper with .name)
        zip_path = zip_file if isinstance(zip_file, str) else zip_file.name
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_path)

        # Find the references and outputs directories
        refs_dir = None
        outs_dir = None
        for item in extract_path.iterdir():
            if item.is_dir():
                if item.name.lower() in ["references", "refs", "reference"]:
                    refs_dir = item
                elif item.name.lower() in ["outputs", "outs", "output", "separated"]:
                    outs_dir = item

        # Check one level deeper if not found (e.g. the ZIP wraps a top-level folder)
        if refs_dir is None or outs_dir is None:
            for item in extract_path.iterdir():
                if item.is_dir():
                    for subitem in item.iterdir():
                        if subitem.is_dir():
                            if subitem.name.lower() in ["references", "refs", "reference"]:
                                refs_dir = subitem
                            elif subitem.name.lower() in ["outputs", "outs", "output", "separated"]:
                                outs_dir = subitem

        if refs_dir is None or outs_dir is None:
            return None, "Could not find 'references' and 'outputs' directories in the ZIP file", None

        # Collect the audio files; sorting keeps references and outputs aligned
        ref_files = sorted(refs_dir.glob("*.wav"))
        out_files = sorted(outs_dir.glob("*.wav"))
        if len(ref_files) == 0:
            return None, "No reference WAV files found", None
        if len(out_files) == 0:
            return None, "No output WAV files found", None
        if len(ref_files) != len(out_files):
            return None, (
                f"Number of reference files ({len(ref_files)}) must match number of "
                f"output files ({len(out_files)}). Files must be in the same order."
            ), None

        # Build the manifest consumed by compute_mapss_measures
        manifest = [{
            "mixture_id": "uploaded_mixture",
            "references": [str(f) for f in ref_files],
            "systems": {
                "uploaded_system": [str(f) for f in out_files]
            },
        }]

        # Validate the model name
        allowed_models = set(get_model_config(0).keys())
        if model_name not in allowed_models:
            return None, f"Invalid model. Allowed: {', '.join(sorted(allowed_models))}", None

        # Resolve the layer: "raw" has no transformer layers;
        # otherwise fall back to the model's default top layer
        if model_name == "raw":
            layer_final = 0
        else:
            model_defaults = {
                "wavlm": 24, "wav2vec2": 24, "hubert": 24,
                "wavlm_base": 12, "wav2vec2_base": 12, "hubert_base": 12,
                "wav2vec2_xlsr": 24,
            }
            layer_final = layer if layer is not None else model_defaults.get(model_name, 12)

        # Hand the preprocessed data to the GPU stage
        return manifest, layer_final, alpha
    except Exception as e:
        error_msg = f"Error in preprocessing: {e}\n{traceback.format_exc()}"
        return None, error_msg, None
@spaces.GPU
def process_audio_files_gpu(manifest, model_name, layer_final, alpha):
    """GPU processing part -- only runs once Spaces has allocated a GPU."""
    if manifest is None:
        return None, "Invalid input data"
    try:
        # Force single-GPU mode in the Spaces environment;
        # the @spaces.GPU decorator handles the actual allocation
        max_gpus = 1 if torch.cuda.is_available() else 0

        # Run the experiment on a single GPU
        results_dir = compute_mapss_measures(
            models=[model_name],
            mixtures=manifest,
            layer=layer_final,
            alpha=alpha,
            verbose=True,
            max_gpus=max_gpus,  # force single GPU
            add_ci=False,  # skip confidence intervals for a faster demo
        )

        # Create the output ZIP at a fixed location
        output_zip = Path("/tmp/mapss_results.zip")
        with zipfile.ZipFile(output_zip, "w") as zipf:
            results_path = Path(results_dir)
            files_added = 0
            # Add every file from the results directory
            for file_path in results_path.rglob("*"):
                if file_path.is_file():
                    arcname = str(file_path.relative_to(results_path.parent))
                    zipf.write(file_path, arcname)
                    files_added += 1

        if output_zip.exists() and files_added > 0:
            return str(output_zip), f"Processing completed! Created ZIP with {files_added} files."
        else:
            return None, "Processing completed but no output files were generated. Check if embeddings were computed."
    except Exception as e:
        error_msg = f"Error in GPU processing: {e}\n{traceback.format_exc()}"
        return None, error_msg
    finally:
        # Cleanup is critical in the Spaces environment
        cleanup_all_models()
        clear_gpu_memory()
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
def process_audio_files(zip_file, model_name, layer, alpha):
    """Main processing function combining the CPU and GPU stages."""
    # First, do CPU preprocessing
    manifest, layer_final, alpha_processed = process_audio_files_cpu(
        zip_file, model_name, layer, alpha
    )
    if manifest is None:
        return None, layer_final  # layer_final carries the error message here

    # Then do GPU processing
    return process_audio_files_gpu(manifest, model_name, layer_final, alpha_processed)
def create_interface():
    with gr.Blocks(title="MAPSS - Manifold-based Assessment of Perceptual Source Separation") as demo:
        gr.Markdown("""
# MAPSS: Manifold-based Assessment of Perceptual Source Separation

Granular evaluation of speech and music source separation with the MAPSS measures:
- **Perceptual Matching (PM)**: Measures how closely an output perceptually aligns with its reference. Range: 0-1, higher is better.
- **Perceptual Similarity (PS)**: Measures how well an output is separated from its interfering references. Range: 0-1, higher is better.

## ⚠️ IMPORTANT: File Order Requirements

**Output files MUST be in the same order as reference files!**
- If references are: `speaker1.wav`, `speaker2.wav`, `speaker3.wav`
- Then outputs must be: `output1.wav`, `output2.wav`, `output3.wav`
- Where `output1` corresponds to `speaker1`, `output2` to `speaker2`, etc.

## Input Format

Upload a ZIP file containing:
```
your_mixture.zip
├── references/          # Original clean sources
│   ├── speaker1.wav
│   ├── speaker2.wav
│   └── ...
└── outputs/             # Separated outputs (SAME ORDER as references)
    ├── separated1.wav   # Must correspond to speaker1.wav
    ├── separated2.wav   # Must correspond to speaker2.wav
    └── ...
```

### Audio Requirements
- Format: .wav files
- Sample rate: any (automatically resampled to 16 kHz)
- Channels: mono or stereo (converted to mono)
- **Number of files: equal number of references and outputs**
- **Order: output files must be in the same order as reference files** (a packaging sketch follows below)
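
If you would rather assemble the ZIP programmatically, here is a minimal sketch using Python's standard `zipfile`; the source folders are placeholders, and only the `references/` and `outputs/` archive names matter:

```python
import zipfile
from pathlib import Path

refs = sorted(Path("clean_sources").glob("*.wav"))  # placeholder folder
outs = sorted(Path("separated").glob("*.wav"))      # placeholder folder
assert len(refs) == len(outs), "one output per reference, in the same order"

with zipfile.ZipFile("your_mixture.zip", "w") as zf:
    for ref, out in zip(refs, outs):
        zf.write(ref, f"references/{ref.name}")
        zf.write(out, f"outputs/{out.name}")  # sorted order keeps the pairing
```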

## Output Format

The tool generates a ZIP file containing:
- `ps_scores_{model}.csv`: PS scores for each source over time
- `pm_scores_{model}.csv`: PM scores for each source over time
- `params.json`: Parameters used
- `manifest_canonical.json`: File mapping and processing details

### Score Interpretation
- **Valid scores**: Only computed when at least 2 speakers are active in a frame
- **NaN values**: Appear for non-active speakers, or when fewer than 2 speakers are active in the frame
- **Time resolution**: 20 ms frames
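
To sanity-check the downloaded scores, something like the following works (a sketch; the exact column layout depends on the run, so inspect `df.columns` first):

```python
import pandas as pd

# Placeholder file name: substitute the model you actually selected.
df = pd.read_csv("ps_scores_wav2vec2_base.csv")
print(df.columns)  # inspect the layout before aggregating
# Per-column means; pandas skips NaN frames (fewer than 2 active speakers)
print(df.mean(numeric_only=True))
```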

## Available Models

| Model | Description | Default Layer | Use Case |
|-------|-------------|---------------|----------|
| `raw` | Raw waveform features | N/A | Baseline comparison |
| `wavlm` | WavLM Large | 24 | Strong performance |
| `wav2vec2` | Wav2Vec2 Large | 24 | Best overall performance |
| `hubert` | HuBERT Large | 24 | |
| `wavlm_base` | WavLM Base | 12 | |
| `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
| `hubert_base` | HuBERT Base | 12 | |
| `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |

## Parameters

- **Model**: Select the embedding model for feature extraction
- **Layer**: Which transformer layer to use (auto-selected by default)
- **Alpha**: Diffusion maps parameter (0.0-1.0, default: 1.0); see the note below
  - 0.0 = no normalization
  - 1.0 = full normalization (recommended)
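
For orientation, in the standard diffusion-maps construction of Coifman and Lafon (stated here under that convention as background; the exact kernel MAPSS uses is defined in the paper), alpha controls density normalization of the affinity kernel:

$$K^{(\alpha)}(x, y) = \frac{K(x, y)}{q(x)^{\alpha}\, q(y)^{\alpha}}, \qquad q(x) = \sum_{y} K(x, y)$$

With alpha = 1 the local sampling density is divided out, so the embedding reflects the geometry of the data rather than how densely it was sampled; alpha = 0 leaves the kernel untouched.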

## Processing Notes

- The system automatically detects which speakers are active in each frame
- PS/PM scores are only computed between active speakers
- Processing time scales with the number of sources and the audio length
- GPU acceleration is used automatically when available
- **Note**: This Hugging Face Space runs with a single GPU allocation
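
For batch evaluation outside this demo, the same computation the app performs can be invoked directly (a sketch mirroring the app's own call; the paths and IDs are placeholders):

```python
from engine import compute_mapss_measures

# Placeholder manifest: one mixture, one system, paths to your own files.
manifest = [{
    "mixture_id": "my_mixture",
    "references": ["refs/s1.wav", "refs/s2.wav"],
    "systems": {"my_system": ["outs/o1.wav", "outs/o2.wav"]},
}]

results_dir = compute_mapss_measures(
    models=["wav2vec2_base"],
    mixtures=manifest,
    layer=12,       # model default for the base variants
    alpha=1.0,      # full density normalization
    verbose=True,
    max_gpus=1,
    add_ci=False,   # enable for confidence intervals, at extra cost
)
print(f"Results written to {results_dir}")
```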

## Citation

If you use MAPSS, please cite:
```bibtex
@article{ivry2025mapss,
  title={MAPSS: Manifold-based Assessment of Perceptual Source Separation},
  author={Ivry, Amir and Cornell, Samuele and Watanabe, Shinji},
  journal={arXiv preprint arXiv:2509.09212},
  year={2025}
}
```

## License

Code: MIT License
Paper: CC-BY-4.0

## Support

For issues, questions, or contributions, please visit the [GitHub repository](https://github.com/amir-ivry/MAPSS-measures).
""")
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload ZIP file with audio mixtures",
                    file_types=[".zip"],
                    type="filepath"
                )
                model_dropdown = gr.Dropdown(
                    choices=["raw", "wavlm", "wav2vec2", "hubert",
                             "wavlm_base", "wav2vec2_base", "hubert_base",
                             "wav2vec2_xlsr"],
                    value="wav2vec2_base",
                    label="Select embedding model"
                )
                layer_slider = gr.Slider(
                    minimum=0,
                    maximum=12,
                    step=1,
                    value=12,
                    label="Layer (automatically set to model default)",
                    interactive=True
                )
                alpha_slider = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
                    value=DEFAULT_ALPHA,
                    label="Diffusion maps alpha parameter"
                )

                def update_layer_slider(model_name):
                    """Update the layer slider range and default for the selected model."""
                    model_configs = {
                        "raw": {"maximum": 0, "value": 0, "interactive": False},
                        "wavlm": {"maximum": 24, "value": 24, "interactive": True},
                        "wav2vec2": {"maximum": 24, "value": 24, "interactive": True},
                        "hubert": {"maximum": 24, "value": 24, "interactive": True},
                        "wav2vec2_xlsr": {"maximum": 24, "value": 24, "interactive": True},
                        "wavlm_base": {"maximum": 12, "value": 12, "interactive": True},
                        "wav2vec2_base": {"maximum": 12, "value": 12, "interactive": True},
                        "hubert_base": {"maximum": 12, "value": 12, "interactive": True},
                    }
                    config = model_configs.get(model_name, {"maximum": 12, "value": 12, "interactive": True})
                    return gr.Slider(
                        minimum=0,
                        maximum=config["maximum"],
                        value=config["value"],
                        step=1,
                        label=(f"Layer (max: {config['maximum']}, default: {config['value']})"
                               if config["interactive"]
                               else "Layer (not applicable for raw features)"),
                        interactive=config["interactive"]
                    )

                model_dropdown.change(
                    fn=update_layer_slider,
                    inputs=[model_dropdown],
                    outputs=[layer_slider]
                )

                process_btn = gr.Button("Process Audio Files", variant="primary")

            with gr.Column():
                output_file = gr.File(
                    label="Download Results (ZIP)",
                    type="filepath"
                )
                status_text = gr.Textbox(
                    label="Status",
                    lines=3,
                    max_lines=10
                )

        process_btn.click(
            fn=process_audio_files,
            inputs=[file_input, model_dropdown, layer_slider, alpha_slider],
            outputs=[output_file, status_text]
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()