import gradio as gr
import zipfile
import shutil
from pathlib import Path
import traceback
import gc
import torch
import spaces
# MAPSS project modules
from engine import compute_mapss_measures
from models import get_model_config, cleanup_all_models
from config import DEFAULT_ALPHA
from utils import clear_gpu_memory
def process_audio_files_cpu(zip_file, model_name, layer, alpha):
"""Process uploaded ZIP file containing audio mixtures - CPU part."""
if zip_file is None:
return None, "Please upload a ZIP file", None
try:
# Use a fixed extraction path
extract_path = Path("/tmp/mapss_extract")
if extract_path.exists():
shutil.rmtree(extract_path)
extract_path.mkdir(parents=True)
        # Extract ZIP. gr.File(type="filepath") passes a plain path string;
        # older Gradio versions pass a tempfile-like object with a .name
        # attribute, so accept both.
        zip_path = zip_file if isinstance(zip_file, str) else zip_file.name
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
# Find references and outputs directories
refs_dir = None
outs_dir = None
for item in extract_path.iterdir():
if item.is_dir():
if item.name.lower() in ['references', 'refs', 'reference']:
refs_dir = item
elif item.name.lower() in ['outputs', 'outs', 'output', 'separated']:
outs_dir = item
# Check one level deeper if not found
if refs_dir is None or outs_dir is None:
for item in extract_path.iterdir():
if item.is_dir():
for subitem in item.iterdir():
if subitem.is_dir():
if subitem.name.lower() in ['references', 'refs', 'reference']:
refs_dir = subitem
elif subitem.name.lower() in ['outputs', 'outs', 'output', 'separated']:
outs_dir = subitem
if refs_dir is None or outs_dir is None:
return None, "Could not find 'references' and 'outputs' directories in the ZIP file", None
        # Get audio files (sorted so references and outputs pair up by index)
        ref_files = sorted(refs_dir.glob("*.wav"))
        out_files = sorted(outs_dir.glob("*.wav"))
if len(ref_files) == 0:
return None, "No reference WAV files found", None
if len(out_files) == 0:
return None, "No output WAV files found", None
if len(ref_files) != len(out_files):
return None, f"Number of reference files ({len(ref_files)}) must match number of output files ({len(out_files)}). Files must be in the same order.", None
# Create manifest
manifest = [{
"mixture_id": "uploaded_mixture",
"references": [str(f) for f in ref_files],
"systems": {
"uploaded_system": [str(f) for f in out_files]
}
}]
# Validate model
allowed_models = set(get_model_config(0).keys())
if model_name not in allowed_models:
return None, f"Invalid model. Allowed: {', '.join(sorted(allowed_models))}", None
# Set layer
if model_name == "raw":
layer_final = 0
else:
model_defaults = {
"wavlm": 24, "wav2vec2": 24, "hubert": 24,
"wavlm_base": 12, "wav2vec2_base": 12, "hubert_base": 12,
"wav2vec2_xlsr": 24
}
layer_final = layer if layer is not None else model_defaults.get(model_name, 12)
# Return preprocessed data for GPU processing
return manifest, layer_final, alpha
except Exception as e:
error_msg = f"Error in preprocessing: {str(e)}\n{traceback.format_exc()}"
return None, error_msg, None
@spaces.GPU(duration=300)
def process_audio_files_gpu(manifest, model_name, layer_final, alpha):
"""GPU processing part - only called when GPU is allocated."""
if manifest is None:
return None, "Invalid input data"
try:
# Force single GPU mode in Spaces environment
# The spaces decorator handles GPU allocation
max_gpus = 1 if torch.cuda.is_available() else 0
# Run experiment with forced single GPU
results_dir = compute_mapss_measures(
models=[model_name],
mixtures=manifest,
layer=layer_final,
alpha=alpha,
verbose=True,
max_gpus=max_gpus, # Force single GPU
            add_ci=False  # Disable confidence intervals for a faster demo run
)
# Create output ZIP at a fixed location
output_zip = Path("/tmp/mapss_results.zip")
with zipfile.ZipFile(output_zip, 'w') as zipf:
results_path = Path(results_dir)
files_added = 0
# Add all files from results
for file_path in results_path.rglob("*"):
if file_path.is_file():
arcname = str(file_path.relative_to(results_path.parent))
zipf.write(file_path, arcname)
files_added += 1
if output_zip.exists() and files_added > 0:
return str(output_zip), f"Processing completed! Created ZIP with {files_added} files. Note: Output files must be in the same order as reference files."
        else:
            return None, "Processing completed but no output files were generated. Check whether embeddings were computed."
except Exception as e:
error_msg = f"Error in GPU processing: {str(e)}\n{traceback.format_exc()}"
return None, error_msg
finally:
# Cleanup is critical in Spaces environment
cleanup_all_models()
clear_gpu_memory()
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
def process_audio_files(zip_file, model_name, layer, alpha):
"""Main processing function that combines CPU and GPU parts."""
# First, do CPU preprocessing
manifest, layer_final, alpha_processed = process_audio_files_cpu(
zip_file, model_name, layer, alpha
)
if manifest is None:
return None, layer_final # layer_final contains error message in this case
# Then do GPU processing
return process_audio_files_gpu(manifest, model_name, layer_final, alpha_processed)
def create_interface():
    with gr.Blocks(title="MAPSS - Manifold-based Assessment of Perceptual Source Separation") as demo:
gr.Markdown("""
# MAPSS: Manifold-based Assessment of Perceptual Source Separation
Granular evaluation of speech and music source separation with the MAPSS measures:
- **Perceptual Matching (PM)**: Measures how closely an output perceptually aligns with its reference. Range: 0-1, higher is better.
- **Perceptual Similarity (PS)**: Measures how well an output is separated from its interfering references. Range: 0-1, higher is better.
## ⚠️ IMPORTANT: File Order Requirements
**Output files MUST be in the same order as reference files!**
- If references are: `speaker1.wav`, `speaker2.wav`, `speaker3.wav`
- Then outputs must be: `output1.wav`, `output2.wav`, `output3.wav`
- Where `output1` corresponds to `speaker1`, `output2` to `speaker2`, etc.
## Input Format
Upload a ZIP file containing:
```
your_mixture.zip
├── references/ # Original clean sources
│ ├── speaker1.wav
│ ├── speaker2.wav
│ └── ...
└── outputs/ # Separated outputs (SAME ORDER as references)
├── separated1.wav # Must correspond to speaker1.wav
├── separated2.wav # Must correspond to speaker2.wav
└── ...
```
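
For convenience, here is a minimal sketch of packaging such a ZIP in Python (the file paths are placeholders; adjust them to your own data):

```python
import zipfile
from pathlib import Path

def build_mapss_zip(ref_paths, out_paths, zip_path="your_mixture.zip"):
    """Package reference/output WAVs in the layout expected above.

    ref_paths and out_paths must be in corresponding order.
    """
    assert len(ref_paths) == len(out_paths), "references and outputs must pair up"
    with zipfile.ZipFile(zip_path, "w") as zf:
        for p in ref_paths:
            zf.write(p, f"references/{Path(p).name}")
        for p in out_paths:
            zf.write(p, f"outputs/{Path(p).name}")
    return zip_path

# Example with placeholder file names:
# build_mapss_zip(["speaker1.wav", "speaker2.wav"], ["separated1.wav", "separated2.wav"])
```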
### Audio Requirements
- Format: .wav files
- Sample rate: Any (automatically resampled to 16kHz)
- Channels: Mono or stereo (converted to mono)
- **Number of files: Equal number of references and outputs**
- **Order: Output files must be in the same order as reference files**
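
Resampling and mono conversion happen automatically, but if you prefer to normalize files yourself beforehand, a sketch using torchaudio (assumed installed) could look like this:

```python
import torchaudio

def to_mono_16k(in_path, out_path):
    # Load, downmix to mono, resample to 16 kHz, and save.
    wav, sr = torchaudio.load(in_path)   # (channels, samples)
    wav = wav.mean(dim=0, keepdim=True)  # stereo -> mono
    if sr != 16000:
        wav = torchaudio.functional.resample(wav, sr, 16000)
    torchaudio.save(out_path, wav, 16000)
```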
## Output Format
The tool generates a ZIP file containing:
- `ps_scores_{model}.csv`: PS scores for each source over time
- `pm_scores_{model}.csv`: PM scores for each source over time
- `params.json`: Parameters used
- `manifest_canonical.json`: File mapping and processing details
### Score Interpretation
- **Valid scores**: Only computed when at least 2 speakers are active in a frame
- **NaN values**: Appear for non-active speakers, or when fewer than 2 speakers are active in the frame.
- **Time resolution**: 20ms frames
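
A minimal sketch for inspecting the score CSVs with pandas, assuming one column per source and one row per 20ms frame (check the actual headers in your download):

```python
import pandas as pd

ps = pd.read_csv("ps_scores_wav2vec2_base.csv")  # model name depends on your run
# NaN-aware summary: frames where a source is inactive (or where fewer than
# 2 speakers are active) are NaN and are skipped by pandas by default.
print(ps.mean(numeric_only=True))
print(f"valid frames per column:\n{ps.notna().sum()}")
```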
## Available Models
| Model | Description | Default Layer | Use Case |
|-------|-------------|---------------|----------|
| `raw` | Raw waveform features | N/A | Baseline comparison |
| `wavlm` | WavLM Large | 24 | Strong performance |
| `wav2vec2` | Wav2Vec2 Large | 24 | Best overall performance |
| `hubert` | HuBERT Large | 24 | |
| `wavlm_base` | WavLM Base | 12 | |
| `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
| `hubert_base` | HuBERT Base | 12 | |
| `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |
## Parameters
- **Model**: Select the embedding model for feature extraction
- **Layer**: Which transformer layer to use (auto-selected by default)
- **Alpha**: Diffusion maps parameter (0.0-1.0, default: 1.0)
- 0.0 = No normalization
- 1.0 = Full normalization (recommended)
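
For intuition, alpha controls the density normalization of the diffusion-maps kernel in the standard Coifman-Lafon formulation. This is an illustrative NumPy sketch, not necessarily the exact implementation used internally:

```python
import numpy as np

def alpha_normalize(K, alpha=1.0):
    # K: symmetric nonnegative affinity matrix (n x n)
    d = K.sum(axis=1)  # kernel density estimate per point
    K_alpha = K / np.outer(d**alpha, d**alpha)  # alpha=0 leaves K unchanged
    # Row-normalize to obtain the diffusion (Markov) operator
    return K_alpha / K_alpha.sum(axis=1, keepdims=True)
```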
## Processing Notes
- The system automatically detects which speakers are active in each frame
- PS/PM scores are only computed between active speakers
- Processing time scales with number of sources and audio length
- GPU acceleration is automatically used when available
- **Note**: This Hugging Face Space runs with a single GPU allocation
## Citation
If you use MAPSS, please cite:
```bibtex
@article{ivry2025mapss,
title={MAPSS: Manifold-based Assessment of Perceptual Source Separation},
author={Ivry, Amir and Cornell, Samuele and Watanabe, Shinji},
journal={arXiv preprint arXiv:2509.09212},
year={2025}
}
```
## License
Code: MIT License
Paper: CC-BY-4.0
## Support
For issues, questions, or contributions, please visit the [GitHub repository](https://github.com/amir-ivry/MAPSS-measures).
""")
with gr.Row():
with gr.Column():
file_input = gr.File(
label="Upload ZIP file with audio mixtures",
file_types=[".zip"],
type="filepath"
)
model_dropdown = gr.Dropdown(
choices=["raw", "wavlm", "wav2vec2", "hubert",
"wavlm_base", "wav2vec2_base", "hubert_base",
"wav2vec2_xlsr"],
value="wav2vec2_base",
label="Select embedding model"
)
layer_slider = gr.Slider(
minimum=0,
maximum=12,
step=1,
value=12,
label="Layer (automatically set to model default)",
interactive=True
)
alpha_slider = gr.Slider(
minimum=0.0,
maximum=1.0,
step=0.1,
value=DEFAULT_ALPHA,
label="Diffusion maps alpha parameter"
)
def update_layer_slider(model_name):
"""Update layer slider based on selected model"""
model_configs = {
"raw": {"maximum": 0, "value": 0, "interactive": False},
"wavlm": {"maximum": 24, "value": 24, "interactive": True},
"wav2vec2": {"maximum": 24, "value": 24, "interactive": True},
"hubert": {"maximum": 24, "value": 24, "interactive": True},
"wav2vec2_xlsr": {"maximum": 24, "value": 24, "interactive": True},
"wavlm_base": {"maximum": 12, "value": 12, "interactive": True},
"wav2vec2_base": {"maximum": 12, "value": 12, "interactive": True},
"hubert_base": {"maximum": 12, "value": 12, "interactive": True}
}
config = model_configs.get(model_name, {"maximum": 12, "value": 12, "interactive": True})
return gr.Slider(
minimum=0,
maximum=config["maximum"],
value=config["value"],
step=1,
label=f"Layer (max: {config['maximum']}, default: {config['value']})" if config["interactive"] else "Layer (not applicable for raw features)",
interactive=config["interactive"]
)
model_dropdown.change(
fn=update_layer_slider,
inputs=[model_dropdown],
outputs=[layer_slider]
)
process_btn = gr.Button("Process Audio Files", variant="primary")
with gr.Column():
output_file = gr.File(
label="Download Results (ZIP)",
type="filepath"
)
status_text = gr.Textbox(
label="Status",
lines=3,
max_lines=10
)
process_btn.click(
fn=process_audio_files,
inputs=[file_input, model_dropdown, layer_slider, alpha_slider],
outputs=[output_file, status_text]
)
return demo
if __name__ == "__main__":
demo = create_interface()
demo.launch()