Spaces:

AIvry
/

MAPSS-measures

Sleeping

File size: 14,611 Bytes

import gradio as gr
import zipfile
import shutil
from pathlib import Path
import json
import os
import traceback
import gc
import torch
import spaces

# Import your modules
from engine import compute_mapss_measures
from models import get_model_config, cleanup_all_models
from config import DEFAULT_ALPHA
from utils import clear_gpu_memory

def process_audio_files_cpu(zip_file, model_name, layer, alpha):
    """Process uploaded ZIP file containing audio mixtures - CPU part."""
    
    if zip_file is None:
        return None, "Please upload a ZIP file", None
    
    try:
        # Use a fixed extraction path
        extract_path = Path("/tmp/mapss_extract")
        if extract_path.exists():
            shutil.rmtree(extract_path)
        extract_path.mkdir(parents=True)
        
        # Extract ZIP
        with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        
        # Find references and outputs directories
        refs_dir = None
        outs_dir = None
        
        for item in extract_path.iterdir():
            if item.is_dir():
                if item.name.lower() in ['references', 'refs', 'reference']:
                    refs_dir = item
                elif item.name.lower() in ['outputs', 'outs', 'output', 'separated']:
                    outs_dir = item
        
        # Check one level deeper if not found
        if refs_dir is None or outs_dir is None:
            for item in extract_path.iterdir():
                if item.is_dir():
                    for subitem in item.iterdir():
                        if subitem.is_dir():
                            if subitem.name.lower() in ['references', 'refs', 'reference']:
                                refs_dir = subitem
                            elif subitem.name.lower() in ['outputs', 'outs', 'output', 'separated']:
                                outs_dir = subitem
        
        if refs_dir is None or outs_dir is None:
            return None, "Could not find 'references' and 'outputs' directories in the ZIP file", None
        
        # Get audio files
        ref_files = sorted([f for f in refs_dir.glob("*.wav")])
        out_files = sorted([f for f in outs_dir.glob("*.wav")])
        
        if len(ref_files) == 0:
            return None, "No reference WAV files found", None
        if len(out_files) == 0:
            return None, "No output WAV files found", None
        if len(ref_files) != len(out_files):
            return None, f"Number of reference files ({len(ref_files)}) must match number of output files ({len(out_files)}). Files must be in the same order.", None
        
        # Create manifest
        manifest = [{
            "mixture_id": "uploaded_mixture",
            "references": [str(f) for f in ref_files],
            "systems": {
                "uploaded_system": [str(f) for f in out_files]
            }
        }]
        
        # Validate model
        allowed_models = set(get_model_config(0).keys())
        if model_name not in allowed_models:
            return None, f"Invalid model. Allowed: {', '.join(sorted(allowed_models))}", None
        
        # Set layer
        if model_name == "raw":
            layer_final = 0
        else:
            model_defaults = {
                "wavlm": 24, "wav2vec2": 24, "hubert": 24,
                "wavlm_base": 12, "wav2vec2_base": 12, "hubert_base": 12,
                "wav2vec2_xlsr": 24
            }
            layer_final = layer if layer is not None else model_defaults.get(model_name, 12)
        
        # Return preprocessed data for GPU processing
        return manifest, layer_final, alpha
        
    except Exception as e:
        error_msg = f"Error in preprocessing: {str(e)}\n{traceback.format_exc()}"
        return None, error_msg, None

@spaces.GPU(duration=300)
def process_audio_files_gpu(manifest, model_name, layer_final, alpha):
    """GPU processing part - only called when GPU is allocated."""
    
    if manifest is None:
        return None, "Invalid input data"
    
    try:
        # Force single GPU mode in Spaces environment
        # The spaces decorator handles GPU allocation
        max_gpus = 1 if torch.cuda.is_available() else 0
        
        # Run experiment with forced single GPU
        results_dir = compute_mapss_measures(
            models=[model_name],
            mixtures=manifest,
            layer=layer_final,
            alpha=alpha,
            verbose=True,
            max_gpus=max_gpus,  # Force single GPU
            add_ci=False  # Disable CI for faster processing in demo
        )
        
        # Create output ZIP at a fixed location
        output_zip = Path("/tmp/mapss_results.zip")
        
        with zipfile.ZipFile(output_zip, 'w') as zipf:
            results_path = Path(results_dir)
            files_added = 0
            
            # Add all files from results
            for file_path in results_path.rglob("*"):
                if file_path.is_file():
                    arcname = str(file_path.relative_to(results_path.parent))
                    zipf.write(file_path, arcname)
                    files_added += 1
        
        if output_zip.exists() and files_added > 0:
            return str(output_zip), f"Processing completed! Created ZIP with {files_added} files. Note: Output files must be in the same order as reference files."
        else:
            return None, f"Processing completed but no output files were generated. Check if embeddings were computed."
            
    except Exception as e:
        error_msg = f"Error in GPU processing: {str(e)}\n{traceback.format_exc()}"
        return None, error_msg
    
    finally:
        # Cleanup is critical in Spaces environment
        cleanup_all_models()
        clear_gpu_memory()
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

def process_audio_files(zip_file, model_name, layer, alpha):
    """Main processing function that combines CPU and GPU parts."""
    
    # First, do CPU preprocessing
    manifest, layer_final, alpha_processed = process_audio_files_cpu(
        zip_file, model_name, layer, alpha
    )
    
    if manifest is None:
        return None, layer_final  # layer_final contains error message in this case
    
    # Then do GPU processing
    return process_audio_files_gpu(manifest, model_name, layer_final, alpha_processed)

def create_interface():
    with gr.Blocks(title="MAPSS - Multi-source Audio Perceptual Separation Scores") as demo:
        gr.Markdown("""
        # MAPSS: Manifold-based Assessment of Perceptual Source Separation

        Granular evaluation of speech and music source separation with the MAPSS measures:
        - **Perceptual Matching (PM)**: Measures how closely an output perceptually aligns with its reference. Range: 0-1, higher is better.
        - **Perceptual Similarity (PS)**: Measures how well an output is separated from its interfering references. Range: 0-1, higher is better.
        
        ## ⚠️ IMPORTANT: File Order Requirements
        
        **Output files MUST be in the same order as reference files!**
        - If references are: `speaker1.wav`, `speaker2.wav`, `speaker3.wav`
        - Then outputs must be: `output1.wav`, `output2.wav`, `output3.wav`
        - Where `output1` corresponds to `speaker1`, `output2` to `speaker2`, etc.
        
        ## Input Format
        
        Upload a ZIP file containing:
        ```
        your_mixture.zip
        ├── references/       # Original clean sources
        │   ├── speaker1.wav
        │   ├── speaker2.wav
        │   └── ...
        └── outputs/         # Separated outputs (SAME ORDER as references)
            ├── separated1.wav  # Must correspond to speaker1.wav
            ├── separated2.wav  # Must correspond to speaker2.wav
            └── ...
        ```
        
        ### Audio Requirements
        - Format: .wav files
        - Sample rate: Any (automatically resampled to 16kHz)
        - Channels: Mono or stereo (converted to mono)
        - **Number of files: Equal number of references and outputs**
        - **Order: Output files must be in the same order as reference files**
        
        ## Output Format
        
        The tool generates a ZIP file containing:
        - `ps_scores_{model}.csv`: PS scores for each source over time
        - `pm_scores_{model}.csv`: PM scores for each source over time
        - `params.json`: Parameters used
        - `manifest_canonical.json`: File mapping and processing details
        
        ### Score Interpretation
        - **Valid scores**: Only computed when at least 2 speakers are active in a frame
        - **NaN values**: Appear for non-active speakers, or when fewer than 2 speakers are active in the frame. 
        - **Time resolution**: 20ms frames
        
        ## Available Models
        
        | Model | Description | Default Layer | Use Case |
        |-------|-------------|---------------|----------|
        | `raw` | Raw waveform features | N/A | Baseline comparison |
        | `wavlm` | WavLM Large | 24 | Strong performance |
        | `wav2vec2` | Wav2Vec2 Large | 24 | Best overall performance |
        | `hubert` | HuBERT Large | 24 |  |
        | `wavlm_base` | WavLM Base | 12 |  |
        | `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
        | `hubert_base` | HuBERT Base | 12 |  |
        | `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |
        
        ## Parameters
        
        - **Model**: Select the embedding model for feature extraction
        - **Layer**: Which transformer layer to use (auto-selected by default)
        - **Alpha**: Diffusion maps parameter (0.0-1.0, default: 1.0)
          - 0.0 = No normalization
          - 1.0 = Full normalization (recommended)
        
        ## Processing Notes
        
        - The system automatically detects which speakers are active in each frame
        - PS/PM scores are only computed between active speakers
        - Processing time scales with number of sources and audio length
        - GPU acceleration is automatically used when available
        - **Note**: This Hugging Face Space runs with a single GPU allocation
        
        ## Citation
        
        If you use MAPSS, please cite:
        
        ```bibtex
        @article{ivry2025mapss,
        title={MAPSS: Manifold-based Assessment of Perceptual Source Separation},
        author={Ivry, Amir and Cornell, Samuele and Watanabe, Shinji},
        journal={arXiv preprint arXiv:2509.09212},
        year={2025}
        }
        ```
        
        ## License
        
        Code: MIT License  
        Paper: CC-BY-4.0
        
        ## Support
        
        For issues, questions, or contributions, please visit the [GitHub repository](https://github.com/amir-ivry/MAPSS-measures).
        """)
        
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload ZIP file with audio mixtures",
                    file_types=[".zip"],
                    type="filepath"
                )
                
                model_dropdown = gr.Dropdown(
                    choices=["raw", "wavlm", "wav2vec2", "hubert", 
                            "wavlm_base", "wav2vec2_base", "hubert_base",
                            "wav2vec2_xlsr"],
                    value="wav2vec2_base",
                    label="Select embedding model"
                )
                
                layer_slider = gr.Slider(
                    minimum=0,
                    maximum=12,
                    step=1,
                    value=12,
                    label="Layer (automatically set to model default)",
                    interactive=True
                )
                
                alpha_slider = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
                    value=DEFAULT_ALPHA,
                    label="Diffusion maps alpha parameter"
                )
                
                def update_layer_slider(model_name):
                    """Update layer slider based on selected model"""
                    model_configs = {
                        "raw": {"maximum": 0, "value": 0, "interactive": False},
                        "wavlm": {"maximum": 24, "value": 24, "interactive": True},
                        "wav2vec2": {"maximum": 24, "value": 24, "interactive": True},
                        "hubert": {"maximum": 24, "value": 24, "interactive": True},
                        "wav2vec2_xlsr": {"maximum": 24, "value": 24, "interactive": True},
                        "wavlm_base": {"maximum": 12, "value": 12, "interactive": True},
                        "wav2vec2_base": {"maximum": 12, "value": 12, "interactive": True},
                        "hubert_base": {"maximum": 12, "value": 12, "interactive": True}
                    }
                    
                    config = model_configs.get(model_name, {"maximum": 12, "value": 12, "interactive": True})
                    return gr.Slider(
                        minimum=0,
                        maximum=config["maximum"],
                        value=config["value"],
                        step=1,
                        label=f"Layer (max: {config['maximum']}, default: {config['value']})" if config["interactive"] else "Layer (not applicable for raw features)",
                        interactive=config["interactive"]
                    )
                
                model_dropdown.change(
                    fn=update_layer_slider,
                    inputs=[model_dropdown],
                    outputs=[layer_slider]
                )
                
                process_btn = gr.Button("Process Audio Files", variant="primary")
            
            with gr.Column():
                output_file = gr.File(
                    label="Download Results (ZIP)",
                    type="filepath"
                )
                status_text = gr.Textbox(
                    label="Status",
                    lines=3,
                    max_lines=10
                )
        
        process_btn.click(
            fn=process_audio_files,
            inputs=[file_input, model_dropdown, layer_slider, alpha_slider],
            outputs=[output_file, status_text]
        )
    
    return demo

if __name__ == "__main__":
    demo = create_interface()
    demo.launch()