File size: 14,611 Bytes
5b6a83c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437e805
 
5b6a83c
 
437e805
5b6a83c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437e805
5b6a83c
 
 
 
 
 
437e805
5b6a83c
437e805
5b6a83c
437e805
5b6a83c
 
 
 
 
 
 
 
 
 
 
 
 
437e805
5b6a83c
 
 
 
 
 
 
 
 
 
 
 
437e805
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b6a83c
437e805
5b6a83c
 
 
 
 
 
437e805
5b6a83c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437e805
5b6a83c
 
 
437e805
5b6a83c
 
 
437e805
 
 
 
 
 
 
9ae59a2
437e805
 
 
 
 
 
 
 
5b6a83c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a41124a
 
5b6a83c
 
 
 
 
 
 
 
a41124a
 
5b6a83c
a41124a
5b6a83c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437e805
5b6a83c
 
 
 
 
 
a41124a
 
 
 
 
5b6a83c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
import gradio as gr
import zipfile
import shutil
from pathlib import Path
import json
import os
import traceback
import gc
import torch
import spaces

# Import your modules
from engine import compute_mapss_measures
from models import get_model_config, cleanup_all_models
from config import DEFAULT_ALPHA
from utils import clear_gpu_memory

def process_audio_files_cpu(zip_file, model_name, layer, alpha):
    """Process uploaded ZIP file containing audio mixtures - CPU part."""
    
    if zip_file is None:
        return None, "Please upload a ZIP file", None
    
    try:
        # Use a fixed extraction path
        extract_path = Path("/tmp/mapss_extract")
        if extract_path.exists():
            shutil.rmtree(extract_path)
        extract_path.mkdir(parents=True)
        
        # Extract ZIP
        with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        
        # Find references and outputs directories
        refs_dir = None
        outs_dir = None
        
        for item in extract_path.iterdir():
            if item.is_dir():
                if item.name.lower() in ['references', 'refs', 'reference']:
                    refs_dir = item
                elif item.name.lower() in ['outputs', 'outs', 'output', 'separated']:
                    outs_dir = item
        
        # Check one level deeper if not found
        if refs_dir is None or outs_dir is None:
            for item in extract_path.iterdir():
                if item.is_dir():
                    for subitem in item.iterdir():
                        if subitem.is_dir():
                            if subitem.name.lower() in ['references', 'refs', 'reference']:
                                refs_dir = subitem
                            elif subitem.name.lower() in ['outputs', 'outs', 'output', 'separated']:
                                outs_dir = subitem
        
        if refs_dir is None or outs_dir is None:
            return None, "Could not find 'references' and 'outputs' directories in the ZIP file", None
        
        # Get audio files
        ref_files = sorted([f for f in refs_dir.glob("*.wav")])
        out_files = sorted([f for f in outs_dir.glob("*.wav")])
        
        if len(ref_files) == 0:
            return None, "No reference WAV files found", None
        if len(out_files) == 0:
            return None, "No output WAV files found", None
        if len(ref_files) != len(out_files):
            return None, f"Number of reference files ({len(ref_files)}) must match number of output files ({len(out_files)}). Files must be in the same order.", None
        
        # Create manifest
        manifest = [{
            "mixture_id": "uploaded_mixture",
            "references": [str(f) for f in ref_files],
            "systems": {
                "uploaded_system": [str(f) for f in out_files]
            }
        }]
        
        # Validate model
        allowed_models = set(get_model_config(0).keys())
        if model_name not in allowed_models:
            return None, f"Invalid model. Allowed: {', '.join(sorted(allowed_models))}", None
        
        # Set layer
        if model_name == "raw":
            layer_final = 0
        else:
            model_defaults = {
                "wavlm": 24, "wav2vec2": 24, "hubert": 24,
                "wavlm_base": 12, "wav2vec2_base": 12, "hubert_base": 12,
                "wav2vec2_xlsr": 24
            }
            layer_final = layer if layer is not None else model_defaults.get(model_name, 12)
        
        # Return preprocessed data for GPU processing
        return manifest, layer_final, alpha
        
    except Exception as e:
        error_msg = f"Error in preprocessing: {str(e)}\n{traceback.format_exc()}"
        return None, error_msg, None

@spaces.GPU(duration=300)
def process_audio_files_gpu(manifest, model_name, layer_final, alpha):
    """GPU processing part - only called when GPU is allocated."""
    
    if manifest is None:
        return None, "Invalid input data"
    
    try:
        # Force single GPU mode in Spaces environment
        # The spaces decorator handles GPU allocation
        max_gpus = 1 if torch.cuda.is_available() else 0
        
        # Run experiment with forced single GPU
        results_dir = compute_mapss_measures(
            models=[model_name],
            mixtures=manifest,
            layer=layer_final,
            alpha=alpha,
            verbose=True,
            max_gpus=max_gpus,  # Force single GPU
            add_ci=False  # Disable CI for faster processing in demo
        )
        
        # Create output ZIP at a fixed location
        output_zip = Path("/tmp/mapss_results.zip")
        
        with zipfile.ZipFile(output_zip, 'w') as zipf:
            results_path = Path(results_dir)
            files_added = 0
            
            # Add all files from results
            for file_path in results_path.rglob("*"):
                if file_path.is_file():
                    arcname = str(file_path.relative_to(results_path.parent))
                    zipf.write(file_path, arcname)
                    files_added += 1
        
        if output_zip.exists() and files_added > 0:
            return str(output_zip), f"Processing completed! Created ZIP with {files_added} files. Note: Output files must be in the same order as reference files."
        else:
            return None, f"Processing completed but no output files were generated. Check if embeddings were computed."
            
    except Exception as e:
        error_msg = f"Error in GPU processing: {str(e)}\n{traceback.format_exc()}"
        return None, error_msg
    
    finally:
        # Cleanup is critical in Spaces environment
        cleanup_all_models()
        clear_gpu_memory()
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

def process_audio_files(zip_file, model_name, layer, alpha):
    """Main processing function that combines CPU and GPU parts."""
    
    # First, do CPU preprocessing
    manifest, layer_final, alpha_processed = process_audio_files_cpu(
        zip_file, model_name, layer, alpha
    )
    
    if manifest is None:
        return None, layer_final  # layer_final contains error message in this case
    
    # Then do GPU processing
    return process_audio_files_gpu(manifest, model_name, layer_final, alpha_processed)

def create_interface():
    with gr.Blocks(title="MAPSS - Multi-source Audio Perceptual Separation Scores") as demo:
        gr.Markdown("""
        # MAPSS: Manifold-based Assessment of Perceptual Source Separation

        Granular evaluation of speech and music source separation with the MAPSS measures:
        - **Perceptual Matching (PM)**: Measures how closely an output perceptually aligns with its reference. Range: 0-1, higher is better.
        - **Perceptual Similarity (PS)**: Measures how well an output is separated from its interfering references. Range: 0-1, higher is better.
        
        ## ⚠️ IMPORTANT: File Order Requirements
        
        **Output files MUST be in the same order as reference files!**
        - If references are: `speaker1.wav`, `speaker2.wav`, `speaker3.wav`
        - Then outputs must be: `output1.wav`, `output2.wav`, `output3.wav`
        - Where `output1` corresponds to `speaker1`, `output2` to `speaker2`, etc.
        
        ## Input Format
        
        Upload a ZIP file containing:
        ```
        your_mixture.zip
        β”œβ”€β”€ references/       # Original clean sources
        β”‚   β”œβ”€β”€ speaker1.wav
        β”‚   β”œβ”€β”€ speaker2.wav
        β”‚   └── ...
        └── outputs/         # Separated outputs (SAME ORDER as references)
            β”œβ”€β”€ separated1.wav  # Must correspond to speaker1.wav
            β”œβ”€β”€ separated2.wav  # Must correspond to speaker2.wav
            └── ...
        ```
        
        ### Audio Requirements
        - Format: .wav files
        - Sample rate: Any (automatically resampled to 16kHz)
        - Channels: Mono or stereo (converted to mono)
        - **Number of files: Equal number of references and outputs**
        - **Order: Output files must be in the same order as reference files**
        
        ## Output Format
        
        The tool generates a ZIP file containing:
        - `ps_scores_{model}.csv`: PS scores for each source over time
        - `pm_scores_{model}.csv`: PM scores for each source over time
        - `params.json`: Parameters used
        - `manifest_canonical.json`: File mapping and processing details
        
        ### Score Interpretation
        - **Valid scores**: Only computed when at least 2 speakers are active in a frame
        - **NaN values**: Appear for non-active speakers, or when fewer than 2 speakers are active in the frame. 
        - **Time resolution**: 20ms frames
        
        ## Available Models
        
        | Model | Description | Default Layer | Use Case |
        |-------|-------------|---------------|----------|
        | `raw` | Raw waveform features | N/A | Baseline comparison |
        | `wavlm` | WavLM Large | 24 | Strong performance |
        | `wav2vec2` | Wav2Vec2 Large | 24 | Best overall performance |
        | `hubert` | HuBERT Large | 24 |  |
        | `wavlm_base` | WavLM Base | 12 |  |
        | `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
        | `hubert_base` | HuBERT Base | 12 |  |
        | `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |
        
        ## Parameters
        
        - **Model**: Select the embedding model for feature extraction
        - **Layer**: Which transformer layer to use (auto-selected by default)
        - **Alpha**: Diffusion maps parameter (0.0-1.0, default: 1.0)
          - 0.0 = No normalization
          - 1.0 = Full normalization (recommended)
        
        ## Processing Notes
        
        - The system automatically detects which speakers are active in each frame
        - PS/PM scores are only computed between active speakers
        - Processing time scales with number of sources and audio length
        - GPU acceleration is automatically used when available
        - **Note**: This Hugging Face Space runs with a single GPU allocation
        
        ## Citation
        
        If you use MAPSS, please cite:
        
        ```bibtex
        @article{ivry2025mapss,
        title={MAPSS: Manifold-based Assessment of Perceptual Source Separation},
        author={Ivry, Amir and Cornell, Samuele and Watanabe, Shinji},
        journal={arXiv preprint arXiv:2509.09212},
        year={2025}
        }
        ```
        
        ## License
        
        Code: MIT License  
        Paper: CC-BY-4.0
        
        ## Support
        
        For issues, questions, or contributions, please visit the [GitHub repository](https://github.com/amir-ivry/MAPSS-measures).
        """)
        
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload ZIP file with audio mixtures",
                    file_types=[".zip"],
                    type="filepath"
                )
                
                model_dropdown = gr.Dropdown(
                    choices=["raw", "wavlm", "wav2vec2", "hubert", 
                            "wavlm_base", "wav2vec2_base", "hubert_base",
                            "wav2vec2_xlsr"],
                    value="wav2vec2_base",
                    label="Select embedding model"
                )
                
                layer_slider = gr.Slider(
                    minimum=0,
                    maximum=12,
                    step=1,
                    value=12,
                    label="Layer (automatically set to model default)",
                    interactive=True
                )
                
                alpha_slider = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
                    value=DEFAULT_ALPHA,
                    label="Diffusion maps alpha parameter"
                )
                
                def update_layer_slider(model_name):
                    """Update layer slider based on selected model"""
                    model_configs = {
                        "raw": {"maximum": 0, "value": 0, "interactive": False},
                        "wavlm": {"maximum": 24, "value": 24, "interactive": True},
                        "wav2vec2": {"maximum": 24, "value": 24, "interactive": True},
                        "hubert": {"maximum": 24, "value": 24, "interactive": True},
                        "wav2vec2_xlsr": {"maximum": 24, "value": 24, "interactive": True},
                        "wavlm_base": {"maximum": 12, "value": 12, "interactive": True},
                        "wav2vec2_base": {"maximum": 12, "value": 12, "interactive": True},
                        "hubert_base": {"maximum": 12, "value": 12, "interactive": True}
                    }
                    
                    config = model_configs.get(model_name, {"maximum": 12, "value": 12, "interactive": True})
                    return gr.Slider(
                        minimum=0,
                        maximum=config["maximum"],
                        value=config["value"],
                        step=1,
                        label=f"Layer (max: {config['maximum']}, default: {config['value']})" if config["interactive"] else "Layer (not applicable for raw features)",
                        interactive=config["interactive"]
                    )
                
                model_dropdown.change(
                    fn=update_layer_slider,
                    inputs=[model_dropdown],
                    outputs=[layer_slider]
                )
                
                process_btn = gr.Button("Process Audio Files", variant="primary")
            
            with gr.Column():
                output_file = gr.File(
                    label="Download Results (ZIP)",
                    type="filepath"
                )
                status_text = gr.Textbox(
                    label="Status",
                    lines=3,
                    max_lines=10
                )
        
        process_btn.click(
            fn=process_audio_files,
            inputs=[file_input, model_dropdown, layer_slider, alpha_slider],
            outputs=[output_file, status_text]
        )
    
    return demo

if __name__ == "__main__":
    demo = create_interface()
    demo.launch()