""" VoiceKit - MCP Server for Voice Analysis 6 MCP tools for voice processing (all accept base64 audio): - Embedding extraction, voice comparison, acoustic analysis - Speech-to-text, voice isolation, similarity analysis MCP Endpoint: https://mcp-1st-birthday-voicekit.hf.space/gradio_api/mcp/sse """ import gradio as gr import base64 import os import json import tempfile import math import re # Set Gradio temp directory to current directory GRADIO_TEMP_DIR = os.path.join(os.getcwd(), "gradio_temp") os.makedirs(GRADIO_TEMP_DIR, exist_ok=True) os.environ['GRADIO_TEMP_DIR'] = GRADIO_TEMP_DIR tempfile.tempdir = GRADIO_TEMP_DIR # Modal connection (requires MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets) try: import modal AudioAnalyzer = modal.Cls.from_name("voice-semantle", "AudioAnalyzer") analyzer = AudioAnalyzer() modal_available = True print("Modal connected!") except Exception as e: modal_available = False analyzer = None print(f"Modal not available: {e}") # Load README.md and convert to HTML def load_readme_as_html(): """Load README.md and convert markdown to HTML""" try: with open("README.md", "r", encoding="utf-8") as f: content = f.read() # Remove YAML front matter content = re.sub(r'^---\n.*?\n---\n', '', content, flags=re.DOTALL) html = content # Headers html = re.sub(r'^### (.+)$', r'

\1

', html, flags=re.MULTILINE) html = re.sub(r'^## (.+)$', r'

\1

', html, flags=re.MULTILINE) html = re.sub(r'^# (.+)$', r'

\1

', html, flags=re.MULTILINE) # Code blocks - preserve content without adding extra newlines def format_code_block(match): code = match.group(2).strip() # Replace internal newlines with a placeholder, then restore after processing # This prevents the paragraph logic from adding extra breaks code_escaped = code.replace('\n', '') return f'
{code_escaped}
' html = re.sub(r'```(\w*)\n(.*?)```', format_code_block, html, flags=re.DOTALL) # Images - convert relative paths to HuggingFace raw file URLs # Handle both tags and markdown image syntax HF_BASE_URL = "https://huggingface.co/spaces/MCP-1st-Birthday/voicekit/resolve/main" def convert_image_path(match): src = match.group(1) # If it's a relative path (not starting with http), convert to HF URL if not src.startswith('http'): src = f"{HF_BASE_URL}/{src}" return f'' html = re.sub(r']*>', convert_image_path, html) # Inline code (but not inside
 blocks)
        html = re.sub(r'`([^`]+)`', r'\1', html)

        # Bold
        html = re.sub(r'\*\*(.+?)\*\*', r'\1', html)

        # Links
        html = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', html)

        # Tables
        lines = html.split('\n')
        in_table = False
        table_html = []
        new_lines = []

        for line in lines:
            if '|' in line and line.strip().startswith('|'):
                if not in_table:
                    in_table = True
                    table_html = ['']

                if re.match(r'^\|[\s\-:|]+\|$', line.strip()):
                    continue

                cells = [c.strip() for c in line.strip().split('|')[1:-1]]
                if len(table_html) == 1:
                    table_html.append('')
                    for cell in cells:
                        table_html.append(f'')
                    table_html.append('')
                else:
                    table_html.append('')
                    for cell in cells:
                        table_html.append(f'')
                    table_html.append('')
            else:
                if in_table:
                    table_html.append('
{cell}
{cell}
') new_lines.append(''.join(table_html)) table_html = [] in_table = False new_lines.append(line) if in_table: table_html.append('') new_lines.append(''.join(table_html)) html = '\n'.join(new_lines) # Lists html = re.sub(r'^- (.+)$', r'
  • \1
  • ', html, flags=re.MULTILINE) html = re.sub(r'(
  • .*
  • \n?)+', r'', html) # Paragraphs - skip lines that are inside pre/code blocks lines = html.split('\n') result = [] for line in lines: stripped = line.strip() if stripped and not stripped.startswith('<') and not stripped.startswith('```'): result.append(f'

    {stripped}

    ') else: result.append(line) # Join and restore newlines in code blocks final_html = '\n'.join(result) final_html = final_html.replace('', '\n') # Escape curly braces for f-string compatibility final_html = final_html.replace('{', '{{').replace('}', '}}') return final_html except Exception as e: return f"

    Error loading README: {e}

    " readme_html = load_readme_as_html() def file_to_base64(file_path: str) -> str: """Convert file path to base64 string""" if not file_path: return "" with open(file_path, "rb") as f: return base64.b64encode(f.read()).decode() # ============================================================================ # MCP Tools (all accept base64 directly) # ============================================================================ def extract_embedding(audio_base64: str) -> str: """ Extract voice embedding using Wav2Vec2. Returns a 768-dimensional vector representing voice characteristics. Useful for voice comparison, speaker identification, etc. Args: audio_base64: Audio file as base64 encoded string Returns: embedding (768-dim list), model, dim """ if not modal_available: return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."}) if not audio_base64: return json.dumps({"error": "No audio provided"}) try: result = analyzer.extract_embedding.remote(audio_base64) if "embedding" in result: result["embedding_preview"] = result["embedding"][:5] + ["..."] result["embedding_length"] = len(result["embedding"]) del result["embedding"] return json.dumps(result, ensure_ascii=False, indent=2) except Exception as e: return json.dumps({"error": str(e)}) def match_voice(audio1_base64: str, audio2_base64: str) -> str: """ Compare similarity between two voices. Extracts Wav2Vec2 embeddings and calculates cosine similarity. Useful for checking if the same person spoke with similar tone. Args: audio1_base64: First audio as base64 encoded string audio2_base64: Second audio as base64 encoded string Returns: similarity (0-1), tone_score (0-100) """ if not modal_available: return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."}) if not audio1_base64 or not audio2_base64: return json.dumps({"error": "Both audio files required"}) try: result = analyzer.compare_voices.remote(audio1_base64, audio2_base64) return json.dumps(result, ensure_ascii=False, indent=2) except Exception as e: return json.dumps({"error": str(e)}) def analyze_acoustics(audio_base64: str) -> str: """ Analyze acoustic features of audio. Extracts pitch, energy, rhythm, tempo, and spectral characteristics. Useful for understanding voice expressiveness and characteristics. Args: audio_base64: Audio file as base64 encoded string Returns: pitch, energy, rhythm, tempo, spectral information """ if not modal_available: return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."}) if not audio_base64: return json.dumps({"error": "No audio provided"}) try: result = analyzer.analyze_acoustic_features.remote(audio_base64) return json.dumps(result, ensure_ascii=False, indent=2) except Exception as e: return json.dumps({"error": str(e)}) def transcribe_audio(audio_base64: str, language: str = "en") -> str: """ Convert audio to text (Speech-to-Text). Uses ElevenLabs Scribe v1 model for high-quality speech recognition. Supports various languages. Args: audio_base64: Audio file as base64 encoded string language: Language code (e.g., "en", "ko", "ja"). Default is "en" Returns: text, language, model """ if not modal_available: return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."}) if not audio_base64: return json.dumps({"error": "No audio provided"}) try: result = analyzer.transcribe_audio.remote(audio_base64, language) return json.dumps(result, ensure_ascii=False, indent=2) except Exception as e: return json.dumps({"error": str(e)}) def isolate_voice(audio_base64: str) -> str: """ Remove background music (BGM) and extract voice only. Uses ElevenLabs Voice Isolator to remove music, noise, etc. Useful for memes, songs, and other audio with background sounds. Args: audio_base64: Audio file as base64 encoded string Returns: isolated_audio_base64, metadata (bgm_detected, sizes, duration) """ if not modal_available: return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."}) if not audio_base64: return json.dumps({"error": "No audio provided"}) try: result = analyzer.isolate_voice.remote(audio_base64) return json.dumps(result, ensure_ascii=False, indent=2) except Exception as e: return json.dumps({"error": str(e)}) def grade_voice( user_audio_base64: str, reference_audio_base64: str, reference_text: str = "", category: str = "meme" ) -> str: """ Comprehensively compare and analyze user voice with reference voice. Evaluates with 5 metrics: - pronunciation: Pronunciation accuracy (STT-based) - tone: Voice timbre similarity (Wav2Vec2 embedding) - pitch: Pitch matching - rhythm: Rhythm sense - energy: Energy expressiveness Args: user_audio_base64: User audio as base64 encoded string reference_audio_base64: Reference audio as base64 encoded string reference_text: Reference text (optional, enables pronunciation scoring) category: Category (meme, song, movie) - determines weights Returns: overall_score, metrics, weak_points, strong_points, feedback """ if not modal_available: return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."}) if not user_audio_base64 or not reference_audio_base64: return json.dumps({"error": "Both user and reference audio required"}) try: result = analyzer.analyze_audio.remote( user_audio_base64=user_audio_base64, reference_audio_base64=reference_audio_base64, reference_text=reference_text if reference_text else None, challenge_id="mcp_analysis", category=category, ) # Simplify output for backend/API use metrics = result.get("metrics", {}) simple_result = { "pitch": metrics.get("pitch", 0), "rhythm": metrics.get("rhythm", 0), "energy": metrics.get("energy", 0), "pronunciation": metrics.get("pronunciation", 0), "transcript": metrics.get("transcript", 0), "overall": result.get("overall_score", 0), "user_text": result.get("user_text", "") } return json.dumps(simple_result, ensure_ascii=False, indent=2) except Exception as e: return json.dumps({"error": str(e)}) # ============================================================================ # Demo Functions for UI # ============================================================================ def demo_acoustic_analysis(audio_file): """Acoustic Analysis - Analyze pitch, energy, rhythm, tempo""" if not audio_file: return create_acoustic_empty() audio_b64 = file_to_base64(audio_file) result_json = analyze_acoustics(audio_b64) try: result = json.loads(result_json) if "error" in result: return f'''
    Error in result:
    {result.get("error", "Unknown error")}
    ''' return create_acoustic_visualization(result) except Exception as e: return f'''
    Parsing Error: {str(e)}

    Raw Result (first 500 chars):
    {result_json[:500]}
    ''' def demo_transcribe_audio(audio_file, language): """Audio Transcription""" if not audio_file: return create_transcription_empty() audio_b64 = file_to_base64(audio_file) result_json = transcribe_audio(audio_b64, language) try: result = json.loads(result_json) if "error" in result: return create_transcription_empty() text = result.get("text", "") return create_transcription_visualization(text) except: return create_transcription_empty() def demo_clean_extraction(audio_file): """Clean Audio Extraction - returns audio file only""" if not audio_file: return None audio_b64 = file_to_base64(audio_file) result_json = isolate_voice(audio_b64) try: result = json.loads(result_json) if "error" in result: return None # Convert isolated audio base64 back to file import tempfile isolated_audio_bytes = base64.b64decode(result["isolated_audio_base64"]) with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: tmp.write(isolated_audio_bytes) isolated_audio_path = tmp.name return isolated_audio_path except: return None def demo_extract_embedding(audio_file): """Extract Embedding - extract voice fingerprint""" if not audio_file: return create_embedding_empty() audio_b64 = file_to_base64(audio_file) result_json = extract_embedding(audio_b64) try: result = json.loads(result_json) if "error" in result: return f'''
    Error in result:
    {result.get("error", "Unknown error")}
    ''' return create_embedding_visualization(result) except Exception as e: return f'''
    Parsing Error: {str(e)}

    Raw Result (first 500 chars):
    {result_json[:500]}
    ''' def demo_match_voice(audio1, audio2): """Compare Voices - compare two voice similarities""" if not audio1 or not audio2: return create_compare_empty() audio1_b64 = file_to_base64(audio1) audio2_b64 = file_to_base64(audio2) result_json = match_voice(audio1_b64, audio2_b64) try: result = json.loads(result_json) if "error" in result: return create_compare_empty() return create_compare_visualization(result) except: return create_compare_empty() def demo_voice_similarity(user_audio, ref_audio): """Voice Similarity - comprehensive 5-metric analysis""" if not user_audio or not ref_audio: return create_similarity_empty() user_b64 = file_to_base64(user_audio) ref_b64 = file_to_base64(ref_audio) result_json = grade_voice(user_b64, ref_b64, "", "meme") try: result = json.loads(result_json) if "error" in result: return create_similarity_empty() return create_similarity_visualization(result) except: return create_similarity_empty() # ============================================================================ # Visualization Functions # ============================================================================ def create_acoustic_empty(): """Empty state for acoustic analysis""" return """
    Upload audio to analyze acoustic features
    """ def create_acoustic_visualization(result): """Acoustic analysis visualization with radar chart""" pitch = result.get("pitch", {}) energy = result.get("energy", {}) rhythm = result.get("rhythm", {}) tempo = result.get("tempo", 0) spectral = result.get("spectral", {}) # Use pre-calculated scores from Modal backend (already 0-100) pitch_norm = pitch.get("score", 0) energy_norm = energy.get("score", 0) rhythm_norm = rhythm.get("score", 0) spectral_norm = spectral.get("score", 0) # Tempo: normalize BPM to 0-100 (60-180 BPM range) tempo_bpm = tempo tempo_norm = min(100, max(0, (tempo_bpm - 60) / 120 * 100)) if tempo_bpm > 0 else 0 # Radar chart calculation center_x, center_y = 150, 150 radius = 110 # 5 metrics in order: Pitch(top), Energy(top-right), Rhythm(bottom-right), Tempo(bottom-left), Spectral(top-left) metrics = [ ("Pitch", pitch_norm, -90), # 0° - 90° = -90° (top) ("Energy", energy_norm, -18), # 72° - 90° = -18° (top-right) ("Rhythm", rhythm_norm, 54), # 144° - 90° = 54° (bottom-right) ("Tempo", tempo_norm, 126), # 216° - 90° = 126° (bottom-left) ("Spectral", spectral_norm, 198) # 288° - 90° = 198° (top-left) ] # Calculate polygon points for data data_points = [] for _, value, angle_deg in metrics: angle_rad = math.radians(angle_deg) point_radius = (value / 100) * radius x = center_x + point_radius * math.cos(angle_rad) y = center_y + point_radius * math.sin(angle_rad) data_points.append(f"{x:.2f},{y:.2f}") # Background concentric pentagons (20, 40, 60, 80, 100) def create_pentagon_points(scale): points = [] for _, _, angle_deg in metrics: angle_rad = math.radians(angle_deg) r = radius * scale x = center_x + r * math.cos(angle_rad) y = center_y + r * math.sin(angle_rad) points.append(f"{x:.2f},{y:.2f}") return " ".join(points) background_pentagons = "" for scale in [0.2, 0.4, 0.6, 0.8, 1.0]: background_pentagons += f'' # Axis lines from center to vertices axis_lines = "" for _, _, angle_deg in metrics: angle_rad = math.radians(angle_deg) x = center_x + radius * math.cos(angle_rad) y = center_y + radius * math.sin(angle_rad) axis_lines += f'' # Labels at vertices labels = "" for label, value, angle_deg in metrics: angle_rad = math.radians(angle_deg) # Position label outside the pentagon label_radius = radius + 25 x = center_x + label_radius * math.cos(angle_rad) y = center_y + label_radius * math.sin(angle_rad) labels += f''' {label} {int(value)} ''' return f"""
    {background_pentagons} {axis_lines} {''.join([f'' for pt in data_points])} {labels}
    """ def create_mimicry_empty(): """Empty state for voice mimicry game""" return """
    Upload reference and your voice to see similarity scores
    """ def create_mimicry_visualization(result): """Voice mimicry score visualization with progress bars""" pronunciation = result.get("pronunciation", 0) tone = result.get("transcript", 0) # Tone score pitch = result.get("pitch", 0) rhythm = result.get("rhythm", 0) energy = result.get("energy", 0) def create_progress_bar(label, value): return f"""
    {label}
    {value}
    """ return f"""
    AI
    CLAUDE
    Wow, that voice input, takes analytical skills of course but I'll handle it
    {create_progress_bar("Pronunciation", pronunciation)} {create_progress_bar("Tone", tone)} {create_progress_bar("Pitch", pitch)} {create_progress_bar("Rhythm", rhythm)} {create_progress_bar("Energy", energy)}
    """ def create_transcription_empty(): """Empty state for transcription""" return """
    Upload audio to transcribe
    """ def create_transcription_visualization(text): """Simple text display for transcription result""" return f"""
    {text if text else "Transcription completed"}
    """ def create_embedding_empty(): """Empty state for embedding extraction""" return """
    Upload audio to extract voice embedding
    """ def create_embedding_visualization(result): """Embedding extraction visualization""" model = result.get("model", "Wav2Vec2") dim = result.get("embedding_length", result.get("dim", 768)) preview = result.get("embedding_preview", []) # Filter only numeric values to avoid format errors with strings like "..." if preview: numeric_preview = [v for v in preview if isinstance(v, (int, float))] preview_str = ", ".join([f"{v:.4f}" for v in numeric_preview]) if numeric_preview else "..." else: preview_str = "..." return f"""
    Model
    {model}
    Dimensions
    {dim}
    Preview
    [{preview_str}]
    """ def create_compare_empty(): """Empty state for voice comparison""" return """
    Upload two audio files to compare voices
    """ def create_compare_visualization(result): """Voice comparison visualization with similarity score""" similarity = result.get("similarity", 0) tone_score = result.get("tone_score", 0) # Convert similarity to percentage similarity_pct = int(similarity * 100) # Color based on similarity - Purple theme matching VOICE SIMILARITY if similarity_pct >= 80: color = "#c084fc" # Light purple (high score) elif similarity_pct >= 60: color = "#a855f7" # Medium purple (medium score) else: color = "#7c3aed" # Dark purple (low score) return f"""
    {similarity_pct} SIMILARITY
    """ def create_similarity_empty(): """Empty state for voice similarity analysis""" return """
    Upload audio files for comprehensive similarity analysis
    """ def create_similarity_visualization(result): """Voice similarity visualization with radar chart""" overall = result.get("overall", 0) pronunciation = result.get("pronunciation", 0) transcript = result.get("transcript", 0) pitch = result.get("pitch", 0) rhythm = result.get("rhythm", 0) energy = result.get("energy", 0) # Color based on overall score - Purple theme if overall >= 80: color = "#c084fc" # Light purple (high score) elif overall >= 60: color = "#a855f7" # Medium purple (medium score) else: color = "#7c3aed" # Dark purple (low score) # Radar chart calculation center_x, center_y = 150, 150 radius = 110 # 5 metrics in order: Pronunciation(top), Transcript(top-right), Pitch(bottom-right), Energy(bottom-left), Rhythm(top-left) metrics = [ ("Pronunciation", pronunciation, -90), # 0° - 90° = -90° (top) ("Transcript", transcript, -18), # 72° - 90° = -18° (top-right) ("Pitch", pitch, 54), # 144° - 90° = 54° (bottom-right) ("Energy", energy, 126), # 216° - 90° = 126° (bottom-left) ("Rhythm", rhythm, 198) # 288° - 90° = 198° (top-left) ] # Calculate polygon points for data data_points = [] for _, value, angle_deg in metrics: angle_rad = math.radians(angle_deg) point_radius = (value / 100) * radius x = center_x + point_radius * math.cos(angle_rad) y = center_y + point_radius * math.sin(angle_rad) data_points.append(f"{x:.2f},{y:.2f}") # Background concentric pentagons (20, 40, 60, 80, 100) def create_pentagon_points(scale): points = [] for _, _, angle_deg in metrics: angle_rad = math.radians(angle_deg) r = radius * scale x = center_x + r * math.cos(angle_rad) y = center_y + r * math.sin(angle_rad) points.append(f"{x:.2f},{y:.2f}") return " ".join(points) background_pentagons = "" for scale in [0.2, 0.4, 0.6, 0.8, 1.0]: background_pentagons += f'' # Axis lines from center to vertices axis_lines = "" for _, _, angle_deg in metrics: angle_rad = math.radians(angle_deg) x = center_x + radius * math.cos(angle_rad) y = center_y + radius * math.sin(angle_rad) axis_lines += f'' # Labels at vertices labels = "" for label, value, angle_deg in metrics: angle_rad = math.radians(angle_deg) # Position label outside the pentagon label_radius = radius + 25 x = center_x + label_radius * math.cos(angle_rad) y = center_y + label_radius * math.sin(angle_rad) labels += f''' {label} {value} ''' return f"""
    {overall} OVERALL
    {background_pentagons} {axis_lines} {''.join([f'' for pt in data_points])} {labels}
    """ # Clean audio functions removed - using gr.Audio component directly # ============================================================================ # Gradio Interface with Dark Theme # ============================================================================ custom_css = """ /* ===== DARK THEME STYLING (CSS-ONLY) ===== */ /* This CSS forces dark mode appearance regardless of system/Gradio theme */ /* All colors are SOLID (not rgba/transparent) to ensure consistent appearance */ :root { color-scheme: dark !important; --body-background-fill: #0a0a1a !important; --background-fill-primary: #0d0d1a !important; --background-fill-secondary: #12122a !important; --block-background-fill: #0d0d1a !important; --input-background-fill: #1a1a35 !important; --body-text-color: #e0e7ff !important; --block-title-text-color: #a5b4fc !important; --block-label-text-color: #a5b4fc !important; --input-text-color: #e0e7ff !important; --neutral-50: #0a0a1a !important; --neutral-100: #0d0d1a !important; --neutral-200: #12122a !important; --neutral-300: #1a1a35 !important; --neutral-400: #2d2d4a !important; --neutral-500: #4a4a6a !important; --neutral-600: #7c7c9a !important; --neutral-700: #a5b4fc !important; --neutral-800: #c7d2fe !important; --neutral-900: #e0e7ff !important; --neutral-950: #ffffff !important; } /* Force dark mode on html and body */ html, body { background: #0a0a1a !important; background-color: #0a0a1a !important; color: #e0e7ff !important; } /* ===== GLOBAL STYLES ===== */ body { background: linear-gradient(180deg, #0a0a1a 0%, #0f0f23 100%) !important; background-color: #0a0a1a !important; color: #ffffff !important; font-family: system-ui, -apple-system, sans-serif; } /* Override Gradio's light mode backgrounds AND text colors */ .dark, .light, [data-theme="light"], [data-theme="dark"], html[data-theme="light"], html[data-theme="dark"], body.light, body.dark { --body-background-fill: #0a0a1a !important; --background-fill-primary: #0d0d1a !important; --background-fill-secondary: #12122a !important; --block-background-fill: #0d0d1a !important; --input-background-fill: #1a1a35 !important; --body-text-color: #e0e7ff !important; --block-title-text-color: #a5b4fc !important; --block-label-text-color: #a5b4fc !important; --input-text-color: #e0e7ff !important; --neutral-50: #0a0a1a !important; --neutral-100: #0d0d1a !important; --neutral-200: #12122a !important; --neutral-300: #1a1a35 !important; --neutral-400: #2d2d4a !important; --neutral-500: #4a4a6a !important; --neutral-600: #7c7c9a !important; --neutral-700: #a5b4fc !important; --neutral-800: #c7d2fe !important; --neutral-900: #e0e7ff !important; --neutral-950: #ffffff !important; color: #e0e7ff !important; background: #0a0a1a !important; background-color: #0a0a1a !important; } .gradio-container { max-width: 100% !important; width: 100% !important; padding: 0px 16px 20px 16px !important; background: #0a0a1a !important; background-color: #0a0a1a !important; margin: 0 !important; } .gradio-container > .main, .gradio-container .main, .main { max-width: 100% !important; width: 100% !important; padding-left: 0 !important; padding-right: 0 !important; margin: 0 auto !important; } .contain { max-width: 100% !important; padding: 0 !important; } /* Force full width on all Gradio internal containers */ .gradio-container > div, .gradio-container > div > div, #component-0, .wrap, .app, .contain, footer, .gradio-row, .gradio-column, .svelte-1gfkn6j, [class*="svelte-"] { max-width: 100% !important; } .gradio-row { max-width: 100% !important; width: 100% !important; margin: 0 !important; padding: 0 !important; } /* ===== HEADER (FLOATING, NO CARD) ===== */ .header-main { display: flex; justify-content: space-between; align-items: center; margin-bottom: 0; padding: 0; } .header-left { display: flex; align-items: center; gap: 16px; } .header-icon { font-size: 48px; filter: drop-shadow(0 4px 12px rgba(99, 102, 241, 0.6)); } .header-title { font-size: 42px; font-weight: 900; color: #e0e7ff; margin: 0; letter-spacing: -0.5px; } .header-subtitle { color: #c7d2fe; font-size: 20px; font-weight: 700; margin-left: 6px; } /* ===== DOCS BUTTON ===== */ .docs-button { display: flex; align-items: center; gap: 8px; padding: 10px 20px; background: linear-gradient(135deg, rgba(124, 58, 237, 0.3), rgba(99, 102, 241, 0.3)); border: 1px solid rgba(124, 58, 237, 0.5); border-radius: 12px; color: #e0e7ff; font-size: 14px; font-weight: 600; cursor: pointer; transition: all 0.3s ease; text-transform: uppercase; letter-spacing: 0.5px; } .docs-button:hover { background: linear-gradient(135deg, rgba(124, 58, 237, 0.5), rgba(99, 102, 241, 0.5)); border-color: rgba(124, 58, 237, 0.8); transform: translateY(-2px); box-shadow: 0 4px 20px rgba(124, 58, 237, 0.4); } .docs-button svg { width: 18px; height: 18px; } /* ===== DOCS MODAL ===== */ .docs-modal-overlay { display: none; position: fixed !important; inset: 0 !important; width: 100vw !important; height: 100vh !important; background: rgba(0, 0, 0, 0.85) !important; backdrop-filter: blur(10px) !important; z-index: 99999 !important; justify-content: center !important; align-items: flex-start !important; padding-top: 60px !important; box-sizing: border-box !important; /* Modal positioned near top of viewport */ overflow: hidden !important; } .docs-modal-overlay.active { display: flex !important; } .docs-modal { position: relative !important; background: #0d0d1a !important; border: 2px solid #7c3aed !important; border-radius: 20px !important; width: calc(100vw - 80px) !important; max-width: 1200px !important; max-height: 55vh !important; overflow: hidden !important; box-shadow: 0 25px 80px rgba(0, 0, 0, 0.9) !important; /* Remove margin that could affect centering */ margin: 0 !important; /* Prevent any transform inheritance issues */ transform: none !important; } .docs-modal-header { display: flex !important; justify-content: space-between !important; align-items: center !important; padding: 20px 24px !important; border-bottom: 2px solid #7c3aed !important; background: #1a1a2e !important; } .docs-modal-title { font-size: 20px; font-weight: 700; color: #e0e7ff; display: flex; align-items: center; gap: 10px; } .docs-modal-close { background: rgba(124, 58, 237, 0.3); border: 2px solid rgba(124, 58, 237, 0.5); border-radius: 12px; color: #e0e7ff; font-size: 28px; font-weight: 300; cursor: pointer; padding: 4px 14px; line-height: 1; transition: all 0.2s; } .docs-modal-close:hover { background: rgba(124, 58, 237, 0.4); border-color: rgba(124, 58, 237, 0.6); } .docs-modal-content { padding: 24px !important; overflow-y: auto !important; max-height: calc(55vh - 80px) !important; color: #c7d2fe !important; font-size: 15px !important; line-height: 1.7 !important; background: #0d0d1a !important; } .docs-modal-content h1 { font-size: 28px; color: #e0e7ff; margin: 0 0 16px 0; padding-bottom: 12px; border-bottom: 2px solid rgba(124, 58, 237, 0.3); } .docs-modal-content h2 { font-size: 22px; color: #e0e7ff; margin: 24px 0 12px 0; } .docs-modal-content h3 { font-size: 18px; color: #a5b4fc; margin: 20px 0 10px 0; } .docs-modal-content p { margin: 12px 0; } .docs-modal-content ul, .docs-modal-content ol { margin: 12px 0; padding-left: 24px; } .docs-modal-content li { margin: 6px 0; } .docs-modal-content code { background: rgba(124, 58, 237, 0.2); padding: 2px 6px; border-radius: 4px; font-family: 'SF Mono', 'Monaco', 'Consolas', monospace; font-size: 13px; color: #c4b5fd; } .docs-modal-content pre { background: rgba(0, 0, 0, 0.4); border: 1px solid rgba(124, 58, 237, 0.2); border-radius: 12px; padding: 16px; overflow-x: auto; margin: 16px 0; white-space: pre; } .docs-modal-content pre code { background: transparent; padding: 0; color: #a5b4fc; white-space: pre; display: block; } .docs-modal-content table { width: 100%; border-collapse: collapse; margin: 16px 0; } .docs-modal-content th, .docs-modal-content td { padding: 10px 12px; text-align: left; border: 1px solid rgba(124, 58, 237, 0.2); } .docs-modal-content th { background: rgba(124, 58, 237, 0.15); color: #e0e7ff; font-weight: 600; } .docs-modal-content td { color: #c7d2fe; } .docs-modal-content a { color: #a78bfa; text-decoration: none; } .docs-modal-content a:hover { text-decoration: underline; } .docs-modal-content strong { color: #e0e7ff; } .docs-modal-content img { max-width: 100%; max-height: 400px; height: auto; border-radius: 8px; margin: 12px 0; object-fit: contain; } /* ===== CARD STYLES ===== */ .card { background: #0f0f23 !important; background-color: #0f0f23 !important; border: 1px solid #3d2a6b !important; border-radius: 20px; padding: 30px; box-shadow: 0 8px 32px rgba(0, 0, 0, 0.4); transition: all 0.3s ease; height: 100%; display: flex; flex-direction: column; } .card:hover { border-color: #5b3d99 !important; box-shadow: 0 12px 48px rgba(124, 58, 237, 0.3); } /* Ensure columns in top row have equal height */ .gradio-row:first-of-type .gradio-column { display: flex !important; flex-direction: column !important; } .gradio-row:first-of-type .gradio-column > div { flex: 1 !important; display: flex !important; flex-direction: column !important; } /* Set minimum height for top row cards */ .gradio-row:first-of-type .card { min-height: 550px; } .card-title { font-size: 16px; font-weight: 700; color: #a5b4fc; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 20px; display: flex; align-items: center; } /* ===== ROW SPACING ===== */ .gradio-row { gap: 24px !important; } /* ===== QUICK START - CODE BLOCK (TERMINAL/IDE STYLE) ===== */ .terminal-window { background: #1a1b26; border: 1px solid rgba(124, 58, 237, 0.3); border-radius: 12px; overflow: hidden; margin-bottom: 16px; box-shadow: 0 8px 32px rgba(0, 0, 0, 0.6); } .terminal-header { background: #16161e; padding: 12px 16px; display: flex; align-items: center; justify-content: space-between; border-bottom: 1px solid rgba(124, 58, 237, 0.2); } .terminal-dots { display: flex; gap: 8px; } .terminal-dot { width: 12px; height: 12px; border-radius: 50%; } .terminal-dot.red { background: #ff5f56 !important; box-shadow: 0 0 8px rgba(255, 95, 86, 0.8) !important; } .terminal-dot.yellow { background: #ffbd2e !important; box-shadow: 0 0 8px rgba(255, 189, 46, 0.8) !important; } .terminal-dot.green { background: #27c93f !important; box-shadow: 0 0 8px rgba(39, 201, 63, 0.8) !important; } .terminal-title { font-size: 12px; color: #6b7280; font-family: 'SF Mono', 'Monaco', 'Consolas', monospace; font-weight: 500; } .terminal-body { background: #1a1b26; padding: 0; display: flex; } .line-numbers { background: #16161e; padding: 16px 12px; border-right: 1px solid rgba(124, 58, 237, 0.15); user-select: none; text-align: right; min-width: 48px; } .line-num { display: block; color: #4a5568; font-family: 'SF Mono', 'Monaco', 'Consolas', monospace; font-size: 14px; line-height: 1.8; } .code-content { flex: 1; padding: 16px 20px; overflow-x: auto; } .code-line { display: block; white-space: pre; font-family: 'SF Mono', 'Monaco', 'Consolas', monospace; font-size: 14px; line-height: 1.8; color: #a9b1d6; } .json-key { color: #7dcfff; font-weight: 500; } .json-string { color: #9ece6a; } .json-bracket { color: #bb9af7; font-weight: 600; } .json-colon { color: #c0caf5; } .json-comma { color: #c0caf5; } .copy-button { width: 100%; background: linear-gradient(135deg, #7c3aed, #6366f1) !important; border: none !important; border-radius: 12px !important; padding: 14px 24px !important; font-weight: 700 !important; font-size: 13px !important; color: white !important; text-transform: uppercase; letter-spacing: 1px; cursor: pointer; box-shadow: 0 4px 16px rgba(124, 58, 237, 0.4) !important; transition: all 0.3s ease !important; display: flex; align-items: center; justify-content: center; gap: 8px; } .copy-button:hover { transform: translateY(-2px) !important; box-shadow: 0 6px 24px rgba(124, 58, 237, 0.6) !important; } /* ===== TOOLS TABLE ===== */ .tools-table, table.tools-table, .light .tools-table, .dark .tools-table, [data-theme="light"] .tools-table, [data-theme="dark"] .tools-table { width: 100%; border-collapse: separate; border-spacing: 0; background: #0d0d1f !important; background-color: #0d0d1f !important; border-radius: 12px; overflow: hidden; border: 1px solid #3d2a6b !important; margin-bottom: 0; flex: 1; color: #cbd5e1 !important; } .tools-table th, table.tools-table th, .light .tools-table th, .dark .tools-table th, [data-theme="light"] .tools-table th, [data-theme="dark"] .tools-table th { background: #1f1545 !important; background-color: #1f1545 !important; color: #a5b4fc !important; font-weight: 700; font-size: 16px; text-transform: uppercase; letter-spacing: 1.5px; padding: 20px 14px; text-align: left; border-bottom: 1px solid #3d2a6b !important; } .tools-table td, table.tools-table td, .light .tools-table td, .dark .tools-table td, [data-theme="light"] .tools-table td, [data-theme="dark"] .tools-table td { padding: 20px 14px; color: #cbd5e1 !important; background: #0d0d1f !important; background-color: #0d0d1f !important; font-size: 16px; border-bottom: 1px solid #1a1535 !important; } .tools-table tr:last-child td { border-bottom: none; } .tools-table tr:hover, .tools-table tr:hover td { background: #1a1540 !important; background-color: #1a1540 !important; } .tool-name, .light .tool-name, .dark .tool-name, [data-theme="light"] .tool-name, [data-theme="dark"] .tool-name { color: #22d3ee !important; font-family: 'SF Mono', 'Monaco', 'Consolas', monospace; font-weight: 600; font-size: 13px; vertical-align: middle; } /* ===== COMPOSITE SECTION ===== */ .composite-section, .light .composite-section, .dark .composite-section, [data-theme="light"] .composite-section, [data-theme="dark"] .composite-section { background: #0d0d1f !important; background-color: #0d0d1f !important; border: 1px solid #3d2a6b !important; border-radius: 12px; padding: 20px; color: #cbd5e1 !important; } .composite-header, .light .composite-header, .dark .composite-header, [data-theme="light"] .composite-header, [data-theme="dark"] .composite-header { font-size: 11px; font-weight: 700; color: #a5b4fc !important; text-transform: uppercase; letter-spacing: 1.5px; margin-bottom: 12px; } .composite-content, .light .composite-content, .dark .composite-content, [data-theme="light"] .composite-content, [data-theme="dark"] .composite-content { color: #cbd5e1 !important; font-size: 12px; line-height: 1.6; margin-bottom: 16px; } .try-demo-button { width: 100%; background: transparent !important; border: 2px solid #7c3aed !important; border-radius: 12px !important; padding: 12px 24px !important; font-weight: 700 !important; font-size: 12px !important; color: #7c3aed !important; text-transform: uppercase; letter-spacing: 1px; cursor: pointer; transition: all 0.3s ease !important; } .try-demo-button:hover { background: rgba(124, 58, 237, 0.1) !important; border-color: #7c3aed !important; color: #8b5cf6 !important; } /* ===== BUTTONS ===== */ button[variant="primary"] { background: linear-gradient(135deg, #7c3aed, #6366f1) !important; border: none !important; border-radius: 12px !important; padding: 14px 32px !important; font-weight: 700 !important; font-size: 14px !important; color: white !important; box-shadow: 0 4px 20px rgba(124, 58, 237, 0.4) !important; transition: all 0.3s ease !important; } button[variant="primary"]:hover { transform: translateY(-2px) !important; box-shadow: 0 8px 32px rgba(124, 58, 237, 0.6) !important; } /* ===== AUDIO COMPONENT ===== */ .gradio-audio { background: rgba(30, 27, 75, 0.6) !important; border: 1px solid rgba(124, 58, 237, 0.3) !important; border-radius: 12px !important; } /* ===== TEXTBOX ===== */ textarea { background: rgba(30, 27, 75, 0.6) !important; border: 1px solid rgba(124, 58, 237, 0.3) !important; border-radius: 12px !important; color: #e0e7ff !important; font-size: 13px !important; } /* ===== DROPDOWN ===== */ select { background: rgba(30, 27, 75, 0.6) !important; border: 1px solid rgba(124, 58, 237, 0.3) !important; border-radius: 12px !important; color: #e0e7ff !important; } /* ===== LABELS ===== */ label { color: #a5b4fc !important; font-weight: 600 !important; font-size: 12px !important; text-transform: uppercase; letter-spacing: 0.5px; } /* ===== HTML OUTPUT ===== */ .gradio-html { background: transparent !important; border: none !important; } /* ===== DEMO ROW LAYOUT ===== */ .demo-row { display: flex !important; gap: 24px !important; align-items: stretch !important; } /* Only apply card style to the outer column (demo-card-column) */ .demo-card-column { display: flex !important; flex-direction: column !important; height: 700px !important; min-height: 700px !important; max-height: 700px !important; background: rgba(15, 15, 35, 0.8) !important; backdrop-filter: blur(20px) !important; border: 1px solid rgba(124, 58, 237, 0.3) !important; border-radius: 20px !important; padding: 4px 4px 2px 4px !important; box-shadow: 0 8px 32px rgba(0, 0, 0, 0.4) !important; transition: all 0.3s ease !important; gap: 2px !important; overflow-y: auto !important; } .demo-card-column:hover { border-color: rgba(124, 58, 237, 0.5) !important; box-shadow: 0 12px 48px rgba(124, 58, 237, 0.3) !important; } /* Remove any border/background from inner elements */ .demo-card-column > div, .demo-card-column > div > div, .demo-row > div > div { background: transparent !important; border: none !important; box-shadow: none !important; padding: 0 !important; border-radius: 0 !important; } /* Remove card background from inner HTML - we use column background instead */ .demo-row .card { background: transparent !important; backdrop-filter: none !important; border: none !important; border-radius: 0 !important; padding: 0 !important; box-shadow: none !important; margin-bottom: 12px !important; } .demo-row .card:hover { border: none !important; box-shadow: none !important; } /* Ensure all inner components have transparent background */ .demo-row .gradio-audio, .demo-row .gradio-dropdown, .demo-row .gradio-textbox, .demo-row .gradio-button { background: transparent !important; } /* Create a wrapper for input elements (flex container) */ .demo-card-column > div:not(:last-child) { flex: 0 0 auto !important; } /* Adjust spacing for input elements in demo cards */ .demo-row .gradio-audio { margin-top: 6px !important; margin-bottom: 0px !important; max-height: 50px !important; min-height: 40px !important; height: 45px !important; } /* Target all child elements of audio component */ .demo-row .gradio-audio > div, .demo-row .gradio-audio .wrap, .demo-row .gradio-audio .upload-container, .demo-row .gradio-audio .record-container, .demo-row .gradio-audio * { max-height: 50px !important; } /* Audio player specific height reduction */ .demo-row .gradio-audio audio { height: 26px !important; max-height: 26px !important; min-height: 26px !important; } /* Upload/record button container height */ .demo-row .gradio-audio .upload-container, .demo-row .gradio-audio .record-container { min-height: 38px !important; max-height: 38px !important; padding: 4px !important; } /* Audio component buttons */ .demo-row .gradio-audio button { height: 28px !important; min-height: 28px !important; max-height: 28px !important; padding: 4px 10px !important; font-size: 10px !important; } /* Hide text nodes in audio upload area - keep icons */ .demo-row .gradio-audio .upload-text { display: none !important; } .demo-row .gradio-audio .placeholder { display: none !important; } .demo-row .gradio-audio span:not(:has(svg)) { font-size: 0 !important; } .demo-row .gradio-audio p { display: none !important; } /* Hide "Drop Audio Here", "- or -", "Click to Upload" text */ .demo-row .gradio-audio .upload-container span, .demo-row .gradio-audio .upload-container p { font-size: 0 !important; line-height: 0 !important; } /* Keep SVG icons visible */ .demo-row .gradio-audio svg { font-size: initial !important; } /* ADDITIONAL METHODS: Hide all text in audio upload area */ .demo-row .gradio-audio label { font-size: 0 !important; } .demo-row .gradio-audio label span:not(:has(svg)) { display: none !important; } .demo-row .gradio-audio .file-preview { font-size: 0 !important; } .demo-row .gradio-audio .file-preview span { font-size: 0 !important; display: none !important; } .demo-row .gradio-audio [data-testid="upload-text"], .demo-row .gradio-audio [data-testid="file-preview-text"], .demo-row .gradio-audio .upload-text, .demo-row .gradio-audio .file-preview-text { display: none !important; visibility: hidden !important; font-size: 0 !important; } /* Target all text nodes (more aggressive) */ .demo-row .gradio-audio *:not(svg):not(path):not(circle):not(rect):not(line) { color: transparent !important; } .demo-row .gradio-audio button { color: white !important; } /* Ensure icons remain visible */ .demo-row .gradio-audio svg, .demo-row .gradio-audio svg * { color: initial !important; fill: currentColor !important; stroke: currentColor !important; } /* NUCLEAR OPTION: Hide everything in label, then show only necessary elements */ .demo-row .gradio-audio label > div > div { display: none !important; } .demo-row .gradio-audio label::before { content: '' !important; } .demo-row .gradio-audio label * { visibility: hidden !important; } .demo-row .gradio-audio label svg { visibility: visible !important; } .demo-row .gradio-audio label button { visibility: visible !important; } .demo-row .gradio-audio label audio { visibility: visible !important; } /* Force hide any text content */ .demo-row .gradio-audio label > div::after, .demo-row .gradio-audio label > div::before { content: '' !important; display: none !important; } /* Additional override for upload text elements */ .demo-row .gradio-audio [class*="upload"], .demo-row .gradio-audio [class*="placeholder"], .demo-row .gradio-audio [class*="text"] { font-size: 0 !important; line-height: 0 !important; width: 0 !important; height: 0 !important; opacity: 0 !important; visibility: hidden !important; position: absolute !important; left: -9999px !important; } /* NUCLEAR OPTION 2: Complete removal of label content */ .demo-row .gradio-audio label.block { display: none !important; } .demo-row .gradio-audio .file-upload { display: none !important; } /* Hide all direct text children */ .demo-row .gradio-audio label > span:not(:has(button)):not(:has(audio)):not(:has(svg)) { display: none !important; } /* Gradio 6.0 specific selectors - upload area */ .demo-row .gradio-audio [data-testid="upload-button"], .demo-row .gradio-audio [data-testid="file-upload"], .demo-row .gradio-audio .upload-area { display: none !important; } /* Hide all paragraph elements in audio component */ .demo-row .gradio-audio label p, .demo-row .gradio-audio label span.text, .demo-row .gradio-audio label div.text { display: none !important; } /* More aggressive text hiding - target by content */ .demo-row .gradio-audio *::before, .demo-row .gradio-audio *::after { content: '' !important; display: none !important; } /* Make sure only buttons and audio players are visible */ .demo-row .gradio-audio > label > div > div:not(:has(button)):not(:has(audio)) { display: none !important; } /* Gradio Blocks specific - Hide wrapper divs that contain text */ .demo-row .gradio-audio .wrap > div:not(:has(button)):not(:has(audio)):not(:has(svg)) { display: none !important; } /* Override for Gradio 6.x structure */ .demo-row .gradio-audio [class*="svelte-"] span:not(:has(svg)):not(:has(button)) { display: none !important; } .demo-row .gradio-dropdown, .demo-row .gradio-textbox { margin-bottom: 2px !important; } .demo-row .gradio-row { margin-bottom: 2px !important; } /* IMPORTANT: Button alignment - push buttons to bottom with margin-top: auto */ .demo-row .gradio-button { margin-top: auto !important; margin-bottom: 0px !important; flex-shrink: 0 !important; } /* Output area should not push button down - set flex: 1 */ .demo-row .gradio-html { flex: 1 !important; margin-bottom: 0 !important; display: flex !important; flex-direction: column !important; max-height: 300px !important; overflow-y: auto !important; } /* Output audio component (clean_audio_output) height limit */ .demo-row .gradio-audio[data-testid="audio-output"], .demo-row > div:last-child .gradio-audio { max-height: 120px !important; min-height: 60px !important; height: auto !important; margin-bottom: 0px !important; } /* ===== CUSTOM ACTION BUTTONS (DEMO CARDS) ===== */ .custom-action-btn, .custom-action-btn button, .custom-action-btn button[data-testid="button"], button.custom-action-btn, .demo-row .custom-action-btn, .demo-row .custom-action-btn button { width: 100% !important; min-width: 100% !important; max-width: 100% !important; background: linear-gradient(135deg, #6366f1, #7c3aed) !important; border: none !important; border-radius: 12px !important; padding: 8px 16px !important; height: 38px !important; min-height: 38px !important; max-height: 38px !important; font-weight: 700 !important; font-size: 16px !important; letter-spacing: 1.5px !important; text-transform: uppercase !important; color: white !important; box-shadow: 0 4px 20px rgba(124, 58, 237, 0.4) !important; transition: all 0.3s ease !important; } .custom-action-btn:hover, .custom-action-btn button:hover, .custom-action-btn button[data-testid="button"]:hover, button.custom-action-btn:hover, .demo-row .custom-action-btn:hover, .demo-row .custom-action-btn button:hover { transform: translateY(-2px) !important; box-shadow: 0 8px 32px rgba(124, 58, 237, 0.6) !important; background: linear-gradient(135deg, #6366f1, #7c3aed) !important; } /* ===== DECORATIVE ELEMENTS ===== */ .diamond-decoration { position: fixed; bottom: 40px; right: 40px; width: 80px; height: 80px; border: 2px solid rgba(124, 58, 237, 0.2); transform: rotate(45deg); pointer-events: none; z-index: 1; } .star-decoration { display: none; } """ with gr.Blocks() as demo: # Inject custom CSS and decorative elements (positioned fixed, no DOM space) gr.HTML(f"""
    """) # ==================== HEADER (FLOATING) ==================== gr.HTML(f"""
    VoiceKit MCP Server
    Documentation
    {readme_html}
    """) # ==================== TOP ROW: QUICK START + AVAILABLE TOOLS ==================== with gr.Row(equal_height=True): # QUICK START CARD with gr.Column(scale=1): gr.HTML("""
    QUICK START
    claude_desktop_config.json
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    {
    "mcpServers": {
    "voicekit": {
    "command": "npx",
    "args": [
    "-y",
    "mcp-remote",
    "https://mcp-1st-birthday-voicekit.hf.space/gradio_api/mcp/sse"
    ]
    }
    }
    }
    """) # AVAILABLE TOOLS CARD with gr.Column(scale=1): gr.HTML("""
    AVAILABLE TOOLS
    TOOL PURPOSE INPUT OUTPUT
    extract_embedding
    Extract 768-dim voice fingerprint audio_base64 embedding, model, dim
    match_voice
    Compare two voice similarities audio1_base64, audio2_base64 similarity, tone_score
    analyze_acoustics
    Analyze pitch, energy, rhythm, tempo audio_base64 pitch, energy, rhythm, tempo
    transcribe_audio
    Convert speech to text audio_base64, language text, language, model
    isolate_voice
    Remove background music/noise audio_base64 isolated_audio_base64, metadata
    grade_voice
    5-metric comprehensive analysis user_audio, reference_audio, text, category overall, metrics, feedback
    """) # ==================== FIRST ROW: 3 DEMO CARDS ==================== with gr.Row(equal_height=True, elem_classes="demo-row"): # EXTRACT EMBEDDING with gr.Column(scale=1, elem_classes="demo-card-column"): gr.HTML("""
    EXTRACT EMBEDDING
    """) embedding_audio = gr.Audio( type="filepath", label="Audio Input", show_label=False, format="wav" ) embedding_btn = gr.Button("EXTRACT", variant="primary", size="lg", elem_classes="custom-action-btn") embedding_output = gr.HTML(value=create_embedding_empty()) embedding_btn.click( demo_extract_embedding, inputs=[embedding_audio], outputs=[embedding_output], api_visibility="private" ) # COMPARE VOICES with gr.Column(scale=1, elem_classes="demo-card-column"): gr.HTML("""
    MATCH VOICE
    """) with gr.Row(): compare_audio1 = gr.Audio( type="filepath", label="Audio 1", show_label=False, format="wav" ) compare_audio2 = gr.Audio( type="filepath", label="Audio 2", show_label=False, format="wav" ) compare_btn = gr.Button("COMPARE", variant="primary", size="lg", elem_classes="custom-action-btn") compare_output = gr.HTML(value=create_compare_empty()) compare_btn.click( demo_match_voice, inputs=[compare_audio1, compare_audio2], outputs=[compare_output], api_visibility="private" ) # ACOUSTIC ANALYSIS with gr.Column(scale=1, elem_classes="demo-card-column"): gr.HTML("""
    ANALYZE ACOUSTICS
    """) acoustic_audio = gr.Audio( type="filepath", label="Audio Input", show_label=False, format="wav" ) acoustic_btn = gr.Button("ANALYZE", variant="primary", size="lg", elem_classes="custom-action-btn") acoustic_output = gr.HTML(value=create_acoustic_empty()) acoustic_btn.click( demo_acoustic_analysis, inputs=[acoustic_audio], outputs=[acoustic_output], api_visibility="private" ) # ==================== SECOND ROW: 3 MORE DEMO CARDS ==================== with gr.Row(equal_height=True, elem_classes="demo-row"): # AUDIO TRANSCRIPTION with gr.Column(scale=1, elem_classes="demo-card-column"): gr.HTML("""
    TRANSCRIBE AUDIO
    """) transcribe_audio_input = gr.Audio( type="filepath", label="Audio Input", show_label=False, format="wav" ) transcribe_btn = gr.Button("TRANSCRIBE", variant="primary", size="lg", elem_classes="custom-action-btn") transcribe_output = gr.HTML(value=create_transcription_empty()) transcribe_btn.click( lambda audio: demo_transcribe_audio(audio, "en"), inputs=[transcribe_audio_input], outputs=[transcribe_output], api_visibility="private" ) # CLEAN AUDIO EXTRACTION with gr.Column(scale=1, elem_classes="demo-card-column"): gr.HTML("""
    ISOLATE VOICE
    """) clean_audio_input = gr.Audio( type="filepath", label="Audio with Background", show_label=False, format="wav" ) clean_btn = gr.Button("EXTRACT VOICE", variant="primary", size="lg", elem_classes="custom-action-btn") clean_audio_output = gr.Audio(label="Clean Audio", type="filepath", visible=True) clean_btn.click( demo_clean_extraction, inputs=[clean_audio_input], outputs=[clean_audio_output], api_visibility="private" ) # VOICE SIMILARITY with gr.Column(scale=1, elem_classes="demo-card-column"): gr.HTML("""
    GRADE VOICE
    """) with gr.Row(): similarity_user_audio = gr.Audio( type="filepath", label="User Audio", show_label=False, format="wav" ) similarity_ref_audio = gr.Audio( type="filepath", label="Reference Audio", show_label=False, format="wav" ) similarity_btn = gr.Button("ANALYZE", variant="primary", size="lg", elem_classes="custom-action-btn") similarity_output = gr.HTML(value=create_similarity_empty()) similarity_btn.click( demo_voice_similarity, inputs=[similarity_user_audio, similarity_ref_audio], outputs=[similarity_output], api_visibility="private" ) # ==================== MCP TOOL INTERFACES (HIDDEN, API ONLY) ==================== with gr.Row(visible=False): # extract_embedding mcp_emb_input = gr.Textbox() mcp_emb_output = gr.Textbox() mcp_emb_btn = gr.Button() mcp_emb_btn.click(extract_embedding, inputs=[mcp_emb_input], outputs=[mcp_emb_output]) # match_voice mcp_cmp_input1 = gr.Textbox() mcp_cmp_input2 = gr.Textbox() mcp_cmp_output = gr.Textbox() mcp_cmp_btn = gr.Button() mcp_cmp_btn.click(match_voice, inputs=[mcp_cmp_input1, mcp_cmp_input2], outputs=[mcp_cmp_output]) # analyze_acoustics mcp_ac_input = gr.Textbox() mcp_ac_output = gr.Textbox() mcp_ac_btn = gr.Button() mcp_ac_btn.click(analyze_acoustics, inputs=[mcp_ac_input], outputs=[mcp_ac_output]) # transcribe_audio mcp_tr_input = gr.Textbox() mcp_tr_lang = gr.Textbox(value="en") mcp_tr_output = gr.Textbox() mcp_tr_btn = gr.Button() mcp_tr_btn.click(transcribe_audio, inputs=[mcp_tr_input, mcp_tr_lang], outputs=[mcp_tr_output]) # isolate_voice mcp_iso_input = gr.Textbox() mcp_iso_output = gr.Textbox() mcp_iso_btn = gr.Button() mcp_iso_btn.click(isolate_voice, inputs=[mcp_iso_input], outputs=[mcp_iso_output]) # grade_voice mcp_sim_user = gr.Textbox() mcp_sim_ref = gr.Textbox() mcp_sim_text = gr.Textbox() mcp_sim_cat = gr.Textbox(value="meme") mcp_sim_output = gr.Textbox() mcp_sim_btn = gr.Button() mcp_sim_btn.click(grade_voice, inputs=[mcp_sim_user, mcp_sim_ref, mcp_sim_text, mcp_sim_cat], outputs=[mcp_sim_output]) if __name__ == "__main__": demo.launch( server_name="0.0.0.0", server_port=7860, mcp_server=True )