""" VoiceKit - MCP Server for Voice Analysis 6 MCP tools for voice processing (all accept base64 audio): - Embedding extraction, voice comparison, acoustic analysis - Speech-to-text, voice isolation, similarity analysis MCP Endpoint: https://mcp-1st-birthday-voicekit.hf.space/gradio_api/mcp/sse """ import gradio as gr import base64 import os import json import tempfile import math import re # Set Gradio temp directory to current directory GRADIO_TEMP_DIR = os.path.join(os.getcwd(), "gradio_temp") os.makedirs(GRADIO_TEMP_DIR, exist_ok=True) os.environ['GRADIO_TEMP_DIR'] = GRADIO_TEMP_DIR tempfile.tempdir = GRADIO_TEMP_DIR # Modal connection (requires MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets) try: import modal AudioAnalyzer = modal.Cls.from_name("voice-semantle", "AudioAnalyzer") analyzer = AudioAnalyzer() modal_available = True print("Modal connected!") except Exception as e: modal_available = False analyzer = None print(f"Modal not available: {e}") # Load README.md and convert to HTML def load_readme_as_html(): """Load README.md and convert markdown to HTML""" try: with open("README.md", "r", encoding="utf-8") as f: content = f.read() # Remove YAML front matter content = re.sub(r'^---\n.*?\n---\n', '', content, flags=re.DOTALL) html = content # Headers html = re.sub(r'^### (.+)$', r'
        # Code blocks - escape HTML and protect newlines with the placeholder
        def format_code_block(match):
            code_escaped = (match.group(2)
                            .replace('&', '&amp;')
                            .replace('<', '&lt;')
                            .replace('>', '&gt;')
                            .replace('\n', NEWLINE_PLACEHOLDER))
            return f'<pre><code>{code_escaped}</code></pre>'
        html = re.sub(r'```(\w*)\n(.*?)```', format_code_block, html, flags=re.DOTALL)
        # Images - convert relative paths to HuggingFace raw file URLs
        # Handle both absolute URLs (kept as-is) and repo-relative paths.
        # NOTE: the repo path below is inferred from the Space hostname.
        raw_base = "https://huggingface.co/spaces/mcp-1st-birthday/voicekit/resolve/main/"
        html = re.sub(r'!\[([^\]]*)\]\((https?://[^)]+)\)', r'<img alt="\1" src="\2">', html)
        html = re.sub(r'!\[([^\]]*)\]\(([^)]+)\)', rf'<img alt="\1" src="{raw_base}\2">', html)
        # Inline code (after code blocks)
        html = re.sub(r'`([^`]+)`', r'<code>\1</code>', html)
# Bold
        html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)
# Links
        html = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', html)
        # Tables
        lines = html.split('\n')
        in_table = False
        table_html = []
        new_lines = []
        for line in lines:
            if '|' in line and line.strip().startswith('|'):
                if not in_table:
                    in_table = True
                    table_html = ['<table>']
                # Skip the |---|---| separator row
                if re.match(r'^\|[\s\-:|]+\|$', line.strip()):
                    continue
                cells = [c.strip() for c in line.strip().split('|')[1:-1]]
                if len(table_html) == 1:
                    # First content row becomes the header row
                    table_html.append('<tr>')
                    for cell in cells:
                        table_html.append(f'<th>{cell}</th>')
                    table_html.append('</tr>')
                else:
                    table_html.append('<tr>')
                    for cell in cells:
                        table_html.append(f'<td>{cell}</td>')
                    table_html.append('</tr>')
            else:
                if in_table:
                    table_html.append('</table>')
                    new_lines.append(''.join(table_html))
                    table_html = []
                    in_table = False
                new_lines.append(line)
        if in_table:
            table_html.append('</table>')
            new_lines.append(''.join(table_html))
        html = '\n'.join(new_lines)
# Lists
        html = re.sub(r'^- (.+)$', r'<li>\1</li>', html, flags=re.MULTILINE)
        html = re.sub(r'(<li>.*</li>\n?)+', r'<ul>\g<0></ul>', html)
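        # e.g. "- item" becomes <li>item</li>, and each run of consecutive <li>
        # lines is then wrapped in a single <ul>...</ul>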
# Paragraphs - skip lines that are inside pre/code blocks
lines = html.split('\n')
result = []
for line in lines:
stripped = line.strip()
if stripped and not stripped.startswith('<') and not stripped.startswith('```'):
            result.append(f'<p>{stripped}</p>')
else:
result.append(line)
# Join and restore newlines in code blocks
final_html = '\n'.join(result)
        final_html = final_html.replace(NEWLINE_PLACEHOLDER, '\n')
# Escape curly braces for f-string compatibility
final_html = final_html.replace('{', '{{').replace('}', '}}')
return final_html
except Exception as e:
return f"Error loading README: {e}
"
readme_html = load_readme_as_html()
def file_to_base64(file_path: str) -> str:
"""Convert file path to base64 string"""
if not file_path:
return ""
with open(file_path, "rb") as f:
return base64.b64encode(f.read()).decode()
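# Usage sketch (hypothetical filename): encode a local clip before handing it to
# any of the MCP tools below.
#
#   audio_b64 = file_to_base64("sample.wav")
#   print(json.loads(analyze_acoustics(audio_b64)))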
# ============================================================================
# MCP Tools (all accept base64 directly)
# ============================================================================
def extract_embedding(audio_base64: str) -> str:
"""
Extract voice embedding using Wav2Vec2.
Returns a 768-dimensional vector representing voice characteristics.
Useful for voice comparison, speaker identification, etc.
Args:
audio_base64: Audio file as base64 encoded string
Returns:
embedding (768-dim list), model, dim
"""
if not modal_available:
return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
if not audio_base64:
return json.dumps({"error": "No audio provided"})
try:
result = analyzer.extract_embedding.remote(audio_base64)
if "embedding" in result:
result["embedding_preview"] = result["embedding"][:5] + ["..."]
result["embedding_length"] = len(result["embedding"])
del result["embedding"]
return json.dumps(result, ensure_ascii=False, indent=2)
except Exception as e:
return json.dumps({"error": str(e)})
def match_voice(audio1_base64: str, audio2_base64: str) -> str:
"""
Compare similarity between two voices.
Extracts Wav2Vec2 embeddings and calculates cosine similarity.
    Useful for checking whether two clips come from the same speaker or share a similar tone.
Args:
audio1_base64: First audio as base64 encoded string
audio2_base64: Second audio as base64 encoded string
Returns:
similarity (0-1), tone_score (0-100)
"""
if not modal_available:
return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
if not audio1_base64 or not audio2_base64:
return json.dumps({"error": "Both audio files required"})
try:
result = analyzer.compare_voices.remote(audio1_base64, audio2_base64)
return json.dumps(result, ensure_ascii=False, indent=2)
except Exception as e:
return json.dumps({"error": str(e)})
def analyze_acoustics(audio_base64: str) -> str:
"""
Analyze acoustic features of audio.
Extracts pitch, energy, rhythm, tempo, and spectral characteristics.
Useful for understanding voice expressiveness and characteristics.
Args:
audio_base64: Audio file as base64 encoded string
Returns:
pitch, energy, rhythm, tempo, spectral information
"""
if not modal_available:
return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
if not audio_base64:
return json.dumps({"error": "No audio provided"})
try:
result = analyzer.analyze_acoustic_features.remote(audio_base64)
return json.dumps(result, ensure_ascii=False, indent=2)
except Exception as e:
return json.dumps({"error": str(e)})
def transcribe_audio(audio_base64: str, language: str = "en") -> str:
"""
Convert audio to text (Speech-to-Text).
Uses ElevenLabs Scribe v1 model for high-quality speech recognition.
Supports various languages.
Args:
audio_base64: Audio file as base64 encoded string
language: Language code (e.g., "en", "ko", "ja"). Default is "en"
Returns:
text, language, model
"""
if not modal_available:
return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
if not audio_base64:
return json.dumps({"error": "No audio provided"})
try:
result = analyzer.transcribe_audio.remote(audio_base64, language)
return json.dumps(result, ensure_ascii=False, indent=2)
except Exception as e:
return json.dumps({"error": str(e)})
def isolate_voice(audio_base64: str) -> str:
"""
Remove background music (BGM) and extract voice only.
Uses ElevenLabs Voice Isolator to remove music, noise, etc.
Useful for memes, songs, and other audio with background sounds.
Args:
audio_base64: Audio file as base64 encoded string
Returns:
isolated_audio_base64, metadata (bgm_detected, sizes, duration)
"""
if not modal_available:
return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
if not audio_base64:
return json.dumps({"error": "No audio provided"})
try:
result = analyzer.isolate_voice.remote(audio_base64)
return json.dumps(result, ensure_ascii=False, indent=2)
except Exception as e:
return json.dumps({"error": str(e)})
def grade_voice(
user_audio_base64: str,
reference_audio_base64: str,
reference_text: str = "",
category: str = "meme"
) -> str:
"""
Comprehensively compare and analyze user voice with reference voice.
Evaluates with 5 metrics:
- pronunciation: Pronunciation accuracy (STT-based)
- tone: Voice timbre similarity (Wav2Vec2 embedding)
- pitch: Pitch matching
- rhythm: Rhythm sense
- energy: Energy expressiveness
Args:
user_audio_base64: User audio as base64 encoded string
reference_audio_base64: Reference audio as base64 encoded string
reference_text: Reference text (optional, enables pronunciation scoring)
category: Category (meme, song, movie) - determines weights
Returns:
overall_score, metrics, weak_points, strong_points, feedback
"""
if not modal_available:
return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
if not user_audio_base64 or not reference_audio_base64:
return json.dumps({"error": "Both user and reference audio required"})
try:
result = analyzer.analyze_audio.remote(
user_audio_base64=user_audio_base64,
reference_audio_base64=reference_audio_base64,
reference_text=reference_text if reference_text else None,
challenge_id="mcp_analysis",
category=category,
)
# Simplify output for backend/API use
metrics = result.get("metrics", {})
simple_result = {
"pitch": metrics.get("pitch", 0),
"rhythm": metrics.get("rhythm", 0),
"energy": metrics.get("energy", 0),
"pronunciation": metrics.get("pronunciation", 0),
"transcript": metrics.get("transcript", 0),
"overall": result.get("overall_score", 0),
"user_text": result.get("user_text", "")
}
return json.dumps(simple_result, ensure_ascii=False, indent=2)
except Exception as e:
return json.dumps({"error": str(e)})
# ============================================================================
# Demo Functions for UI
# ============================================================================
def demo_acoustic_analysis(audio_file):
"""Acoustic Analysis - Analyze pitch, energy, rhythm, tempo"""
if not audio_file:
return create_acoustic_empty()
audio_b64 = file_to_base64(audio_file)
result_json = analyze_acoustics(audio_b64)
try:
result = json.loads(result_json)
if "error" in result:
            return f'''
            <div style="padding:20px;color:#f87171;font-size:13px;">
                <strong>Error in result:</strong><br>
                {result.get("error", "Unknown error")}
            </div>
            '''
return create_acoustic_visualization(result)
except Exception as e:
        return f'''
        <div style="padding:20px;color:#f87171;font-size:13px;">
            <strong>Parsing Error:</strong> {str(e)}<br><br>
            <strong>Raw Result (first 500 chars):</strong><br>
            <code>{result_json[:500]}</code>
        </div>
        '''
def demo_transcribe_audio(audio_file, language):
"""Audio Transcription"""
if not audio_file:
return create_transcription_empty()
audio_b64 = file_to_base64(audio_file)
result_json = transcribe_audio(audio_b64, language)
try:
result = json.loads(result_json)
if "error" in result:
return create_transcription_empty()
text = result.get("text", "")
return create_transcription_visualization(text)
    except Exception:
return create_transcription_empty()
def demo_clean_extraction(audio_file):
"""Clean Audio Extraction - returns audio file only"""
if not audio_file:
return None
audio_b64 = file_to_base64(audio_file)
result_json = isolate_voice(audio_b64)
try:
result = json.loads(result_json)
if "error" in result:
return None
# Convert isolated audio base64 back to file
isolated_audio_bytes = base64.b64decode(result["isolated_audio_base64"])
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
tmp.write(isolated_audio_bytes)
isolated_audio_path = tmp.name
return isolated_audio_path
    except Exception:
return None
def demo_extract_embedding(audio_file):
"""Extract Embedding - extract voice fingerprint"""
if not audio_file:
return create_embedding_empty()
audio_b64 = file_to_base64(audio_file)
result_json = extract_embedding(audio_b64)
try:
result = json.loads(result_json)
if "error" in result:
            return f'''
            <div style="padding:20px;color:#f87171;font-size:13px;">
                <strong>Error in result:</strong><br>
                {result.get("error", "Unknown error")}
            </div>
            '''
return create_embedding_visualization(result)
except Exception as e:
        return f'''
        <div style="padding:20px;color:#f87171;font-size:13px;">
            <strong>Parsing Error:</strong> {str(e)}<br><br>
            <strong>Raw Result (first 500 chars):</strong><br>
            <code>{result_json[:500]}</code>
        </div>
        '''
def demo_match_voice(audio1, audio2):
"""Compare Voices - compare two voice similarities"""
if not audio1 or not audio2:
return create_compare_empty()
audio1_b64 = file_to_base64(audio1)
audio2_b64 = file_to_base64(audio2)
result_json = match_voice(audio1_b64, audio2_b64)
try:
result = json.loads(result_json)
if "error" in result:
return create_compare_empty()
return create_compare_visualization(result)
    except Exception:
return create_compare_empty()
def demo_voice_similarity(user_audio, ref_audio):
"""Voice Similarity - comprehensive 5-metric analysis"""
if not user_audio or not ref_audio:
return create_similarity_empty()
user_b64 = file_to_base64(user_audio)
ref_b64 = file_to_base64(ref_audio)
result_json = grade_voice(user_b64, ref_b64, "", "meme")
try:
result = json.loads(result_json)
if "error" in result:
return create_similarity_empty()
return create_similarity_visualization(result)
    except Exception:
return create_similarity_empty()
# ============================================================================
# Visualization Functions
# ============================================================================
def create_acoustic_empty():
"""Empty state for acoustic analysis"""
return """
Upload audio to analyze acoustic features
"""
def create_acoustic_visualization(result):
"""Acoustic analysis visualization with radar chart"""
pitch = result.get("pitch", {})
energy = result.get("energy", {})
rhythm = result.get("rhythm", {})
tempo = result.get("tempo", 0)
spectral = result.get("spectral", {})
# Use pre-calculated scores from Modal backend (already 0-100)
pitch_norm = pitch.get("score", 0)
energy_norm = energy.get("score", 0)
rhythm_norm = rhythm.get("score", 0)
spectral_norm = spectral.get("score", 0)
# Tempo: normalize BPM to 0-100 (60-180 BPM range)
tempo_bpm = tempo
tempo_norm = min(100, max(0, (tempo_bpm - 60) / 120 * 100)) if tempo_bpm > 0 else 0
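    # e.g. 120 BPM -> (120 - 60) / 120 * 100 = 50; BPM outside 60-180 clamps to 0-100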
# Radar chart calculation
center_x, center_y = 150, 150
radius = 110
# 5 metrics in order: Pitch(top), Energy(top-right), Rhythm(bottom-right), Tempo(bottom-left), Spectral(top-left)
metrics = [
("Pitch", pitch_norm, -90), # 0° - 90° = -90° (top)
("Energy", energy_norm, -18), # 72° - 90° = -18° (top-right)
("Rhythm", rhythm_norm, 54), # 144° - 90° = 54° (bottom-right)
("Tempo", tempo_norm, 126), # 216° - 90° = 126° (bottom-left)
("Spectral", spectral_norm, 198) # 288° - 90° = 198° (top-left)
]
# Calculate polygon points for data
data_points = []
for _, value, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
point_radius = (value / 100) * radius
x = center_x + point_radius * math.cos(angle_rad)
y = center_y + point_radius * math.sin(angle_rad)
data_points.append(f"{x:.2f},{y:.2f}")
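    # Worked example: a score of 50 on the top axis (-90 deg) gives point_radius 55,
    # so that vertex lands at (150 + 55*cos(-90), 150 + 55*sin(-90)) = (150.00, 95.00)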
# Background concentric pentagons (20, 40, 60, 80, 100)
def create_pentagon_points(scale):
points = []
for _, _, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
r = radius * scale
x = center_x + r * math.cos(angle_rad)
y = center_y + r * math.sin(angle_rad)
points.append(f"{x:.2f},{y:.2f}")
return " ".join(points)
background_pentagons = ""
for scale in [0.2, 0.4, 0.6, 0.8, 1.0]:
        background_pentagons += f'<polygon points="{create_pentagon_points(scale)}" fill="none" stroke="#2d2d4a" stroke-width="1"/>'
# Axis lines from center to vertices
axis_lines = ""
for _, _, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
x = center_x + radius * math.cos(angle_rad)
y = center_y + radius * math.sin(angle_rad)
        axis_lines += f'<line x1="{center_x}" y1="{center_y}" x2="{x:.2f}" y2="{y:.2f}" stroke="#2d2d4a" stroke-width="1"/>'
# Labels at vertices
labels = ""
for label, value, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
# Position label outside the pentagon
label_radius = radius + 25
x = center_x + label_radius * math.cos(angle_rad)
y = center_y + label_radius * math.sin(angle_rad)
        labels += f'''
        <text x="{x:.2f}" y="{y:.2f}" text-anchor="middle" fill="#a5b4fc" font-size="12" font-weight="700">{label}</text>
        <text x="{x:.2f}" y="{y + 16:.2f}" text-anchor="middle" fill="#e0e7ff" font-size="12">{int(value)}</text>
        '''
return f"""
"""
def create_mimicry_empty():
"""Empty state for voice mimicry game"""
return """
Upload reference and your voice to see similarity scores
"""
def create_mimicry_visualization(result):
"""Voice mimicry score visualization with progress bars"""
pronunciation = result.get("pronunciation", 0)
tone = result.get("transcript", 0) # Tone score
pitch = result.get("pitch", 0)
rhythm = result.get("rhythm", 0)
energy = result.get("energy", 0)
    def create_progress_bar(label, value):
        # Minimal bar markup (original styling lost)
        return f"""
        <div style="margin:10px 0;">
            <div style="display:flex;justify-content:space-between;font-size:11px;color:#a5b4fc;">
                <span>{label}</span><span>{value}</span>
            </div>
            <div style="background:#1a1a35;border-radius:6px;height:8px;overflow:hidden;">
                <div style="width:{value}%;height:100%;background:linear-gradient(90deg,#7c3aed,#6366f1);"></div>
            </div>
        </div>
        """
    return f"""
    <div style="padding:12px;">
        <div style="font-size:13px;font-weight:700;color:#a5b4fc;letter-spacing:1px;">CLAUDE</div>
        <div style="font-size:12px;color:#c7d2fe;margin:6px 0 12px 0;">
            Wow, analyzing that voice takes real skill, but I'll handle it!
        </div>
        {create_progress_bar("Pronunciation", pronunciation)}
        {create_progress_bar("Tone", tone)}
        {create_progress_bar("Pitch", pitch)}
        {create_progress_bar("Rhythm", rhythm)}
        {create_progress_bar("Energy", energy)}
    </div>
    """
def create_transcription_empty():
"""Empty state for transcription"""
return """
Upload audio to transcribe
"""
def create_transcription_visualization(text):
"""Simple text display for transcription result"""
return f"""
{text if text else "Transcription completed"}
"""
def create_embedding_empty():
"""Empty state for embedding extraction"""
return """
Upload audio to extract voice embedding
"""
def create_embedding_visualization(result):
"""Embedding extraction visualization"""
model = result.get("model", "Wav2Vec2")
dim = result.get("embedding_length", result.get("dim", 768))
preview = result.get("embedding_preview", [])
# Filter only numeric values to avoid format errors with strings like "..."
if preview:
numeric_preview = [v for v in preview if isinstance(v, (int, float))]
preview_str = ", ".join([f"{v:.4f}" for v in numeric_preview]) if numeric_preview else "..."
else:
preview_str = "..."
return f"""
Model
{model}
Dimensions
{dim}
Preview
"""
def create_compare_empty():
"""Empty state for voice comparison"""
return """
Upload two audio files to compare voices
"""
def create_compare_visualization(result):
"""Voice comparison visualization with similarity score"""
similarity = result.get("similarity", 0)
tone_score = result.get("tone_score", 0)
# Convert similarity to percentage
similarity_pct = int(similarity * 100)
# Color based on similarity - Purple theme matching VOICE SIMILARITY
if similarity_pct >= 80:
color = "#c084fc" # Light purple (high score)
elif similarity_pct >= 60:
color = "#a855f7" # Medium purple (medium score)
else:
color = "#7c3aed" # Dark purple (low score)
return f"""
{similarity_pct}
SIMILARITY
"""
def create_similarity_empty():
"""Empty state for voice similarity analysis"""
return """
Upload audio files for comprehensive similarity analysis
"""
def create_similarity_visualization(result):
"""Voice similarity visualization with radar chart"""
overall = result.get("overall", 0)
pronunciation = result.get("pronunciation", 0)
transcript = result.get("transcript", 0)
pitch = result.get("pitch", 0)
rhythm = result.get("rhythm", 0)
energy = result.get("energy", 0)
# Color based on overall score - Purple theme
if overall >= 80:
color = "#c084fc" # Light purple (high score)
elif overall >= 60:
color = "#a855f7" # Medium purple (medium score)
else:
color = "#7c3aed" # Dark purple (low score)
# Radar chart calculation
center_x, center_y = 150, 150
radius = 110
# 5 metrics in order: Pronunciation(top), Transcript(top-right), Pitch(bottom-right), Energy(bottom-left), Rhythm(top-left)
metrics = [
("Pronunciation", pronunciation, -90), # 0° - 90° = -90° (top)
("Transcript", transcript, -18), # 72° - 90° = -18° (top-right)
("Pitch", pitch, 54), # 144° - 90° = 54° (bottom-right)
("Energy", energy, 126), # 216° - 90° = 126° (bottom-left)
("Rhythm", rhythm, 198) # 288° - 90° = 198° (top-left)
]
# Calculate polygon points for data
data_points = []
for _, value, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
point_radius = (value / 100) * radius
x = center_x + point_radius * math.cos(angle_rad)
y = center_y + point_radius * math.sin(angle_rad)
data_points.append(f"{x:.2f},{y:.2f}")
# Background concentric pentagons (20, 40, 60, 80, 100)
def create_pentagon_points(scale):
points = []
for _, _, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
r = radius * scale
x = center_x + r * math.cos(angle_rad)
y = center_y + r * math.sin(angle_rad)
points.append(f"{x:.2f},{y:.2f}")
return " ".join(points)
background_pentagons = ""
for scale in [0.2, 0.4, 0.6, 0.8, 1.0]:
        background_pentagons += f'<polygon points="{create_pentagon_points(scale)}" fill="none" stroke="#2d2d4a" stroke-width="1"/>'
# Axis lines from center to vertices
axis_lines = ""
for _, _, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
x = center_x + radius * math.cos(angle_rad)
y = center_y + radius * math.sin(angle_rad)
        axis_lines += f'<line x1="{center_x}" y1="{center_y}" x2="{x:.2f}" y2="{y:.2f}" stroke="#2d2d4a" stroke-width="1"/>'
# Labels at vertices
labels = ""
for label, value, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
# Position label outside the pentagon
label_radius = radius + 25
x = center_x + label_radius * math.cos(angle_rad)
y = center_y + label_radius * math.sin(angle_rad)
        labels += f'''
        <text x="{x:.2f}" y="{y:.2f}" text-anchor="middle" fill="#a5b4fc" font-size="12" font-weight="700">{label}</text>
        <text x="{x:.2f}" y="{y + 16:.2f}" text-anchor="middle" fill="#e0e7ff" font-size="12">{value}</text>
        '''
return f"""
{overall}
OVERALL
"""
# Clean audio functions removed - using gr.Audio component directly
# ============================================================================
# Gradio Interface with Dark Theme
# ============================================================================
custom_css = """
/* ===== DARK THEME STYLING (CSS-ONLY) ===== */
/* This CSS forces dark mode appearance regardless of system/Gradio theme */
/* All colors are SOLID (not rgba/transparent) to ensure consistent appearance */
:root {
color-scheme: dark !important;
--body-background-fill: #0a0a1a !important;
--background-fill-primary: #0d0d1a !important;
--background-fill-secondary: #12122a !important;
--block-background-fill: #0d0d1a !important;
--input-background-fill: #1a1a35 !important;
--body-text-color: #e0e7ff !important;
--block-title-text-color: #a5b4fc !important;
--block-label-text-color: #a5b4fc !important;
--input-text-color: #e0e7ff !important;
--neutral-50: #0a0a1a !important;
--neutral-100: #0d0d1a !important;
--neutral-200: #12122a !important;
--neutral-300: #1a1a35 !important;
--neutral-400: #2d2d4a !important;
--neutral-500: #4a4a6a !important;
--neutral-600: #7c7c9a !important;
--neutral-700: #a5b4fc !important;
--neutral-800: #c7d2fe !important;
--neutral-900: #e0e7ff !important;
--neutral-950: #ffffff !important;
}
/* Force dark mode on html and body */
html, body {
background: #0a0a1a !important;
background-color: #0a0a1a !important;
color: #e0e7ff !important;
}
/* ===== GLOBAL STYLES ===== */
body {
background: linear-gradient(180deg, #0a0a1a 0%, #0f0f23 100%) !important;
background-color: #0a0a1a !important;
color: #ffffff !important;
font-family: system-ui, -apple-system, sans-serif;
}
/* Override Gradio's light mode backgrounds AND text colors */
.dark, .light, [data-theme="light"], [data-theme="dark"],
html[data-theme="light"], html[data-theme="dark"],
body.light, body.dark {
--body-background-fill: #0a0a1a !important;
--background-fill-primary: #0d0d1a !important;
--background-fill-secondary: #12122a !important;
--block-background-fill: #0d0d1a !important;
--input-background-fill: #1a1a35 !important;
--body-text-color: #e0e7ff !important;
--block-title-text-color: #a5b4fc !important;
--block-label-text-color: #a5b4fc !important;
--input-text-color: #e0e7ff !important;
--neutral-50: #0a0a1a !important;
--neutral-100: #0d0d1a !important;
--neutral-200: #12122a !important;
--neutral-300: #1a1a35 !important;
--neutral-400: #2d2d4a !important;
--neutral-500: #4a4a6a !important;
--neutral-600: #7c7c9a !important;
--neutral-700: #a5b4fc !important;
--neutral-800: #c7d2fe !important;
--neutral-900: #e0e7ff !important;
--neutral-950: #ffffff !important;
color: #e0e7ff !important;
background: #0a0a1a !important;
background-color: #0a0a1a !important;
}
.gradio-container {
max-width: 100% !important;
width: 100% !important;
padding: 0px 16px 20px 16px !important;
background: #0a0a1a !important;
background-color: #0a0a1a !important;
margin: 0 !important;
}
.gradio-container > .main,
.gradio-container .main,
.main {
max-width: 100% !important;
width: 100% !important;
padding-left: 0 !important;
padding-right: 0 !important;
margin: 0 auto !important;
}
.contain {
max-width: 100% !important;
padding: 0 !important;
}
/* Force full width on all Gradio internal containers */
.gradio-container > div,
.gradio-container > div > div,
#component-0,
.wrap,
.app,
.contain,
footer,
.gradio-row,
.gradio-column,
.svelte-1gfkn6j,
[class*="svelte-"] {
max-width: 100% !important;
}
.gradio-row {
max-width: 100% !important;
width: 100% !important;
margin: 0 !important;
padding: 0 !important;
}
/* ===== HEADER (FLOATING, NO CARD) ===== */
.header-main {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 0;
padding: 0;
}
.header-left {
display: flex;
align-items: center;
gap: 16px;
}
.header-icon {
font-size: 48px;
filter: drop-shadow(0 4px 12px rgba(99, 102, 241, 0.6));
}
.header-title {
font-size: 42px;
font-weight: 900;
color: #e0e7ff;
margin: 0;
letter-spacing: -0.5px;
}
.header-subtitle {
color: #c7d2fe;
font-size: 20px;
font-weight: 700;
margin-left: 6px;
}
/* ===== DOCS BUTTON ===== */
.docs-button {
display: flex;
align-items: center;
gap: 8px;
padding: 10px 20px;
background: linear-gradient(135deg, rgba(124, 58, 237, 0.3), rgba(99, 102, 241, 0.3));
border: 1px solid rgba(124, 58, 237, 0.5);
border-radius: 12px;
color: #e0e7ff;
font-size: 14px;
font-weight: 600;
cursor: pointer;
transition: all 0.3s ease;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.docs-button:hover {
background: linear-gradient(135deg, rgba(124, 58, 237, 0.5), rgba(99, 102, 241, 0.5));
border-color: rgba(124, 58, 237, 0.8);
transform: translateY(-2px);
box-shadow: 0 4px 20px rgba(124, 58, 237, 0.4);
}
.docs-button svg {
width: 18px;
height: 18px;
}
/* ===== DOCS MODAL ===== */
.docs-modal-overlay {
display: none;
position: fixed !important;
inset: 0 !important;
width: 100vw !important;
height: 100vh !important;
background: rgba(0, 0, 0, 0.85) !important;
backdrop-filter: blur(10px) !important;
z-index: 99999 !important;
justify-content: center !important;
align-items: flex-start !important;
padding-top: 60px !important;
box-sizing: border-box !important;
/* Modal positioned near top of viewport */
overflow: hidden !important;
}
.docs-modal-overlay.active {
display: flex !important;
}
.docs-modal {
position: relative !important;
background: #0d0d1a !important;
border: 2px solid #7c3aed !important;
border-radius: 20px !important;
width: calc(100vw - 80px) !important;
max-width: 1200px !important;
max-height: 55vh !important;
overflow: hidden !important;
box-shadow: 0 25px 80px rgba(0, 0, 0, 0.9) !important;
/* Remove margin that could affect centering */
margin: 0 !important;
/* Prevent any transform inheritance issues */
transform: none !important;
}
.docs-modal-header {
display: flex !important;
justify-content: space-between !important;
align-items: center !important;
padding: 20px 24px !important;
border-bottom: 2px solid #7c3aed !important;
background: #1a1a2e !important;
}
.docs-modal-title {
font-size: 20px;
font-weight: 700;
color: #e0e7ff;
display: flex;
align-items: center;
gap: 10px;
}
.docs-modal-close {
background: rgba(124, 58, 237, 0.3);
border: 2px solid rgba(124, 58, 237, 0.5);
border-radius: 12px;
color: #e0e7ff;
font-size: 28px;
font-weight: 300;
cursor: pointer;
padding: 4px 14px;
line-height: 1;
transition: all 0.2s;
}
.docs-modal-close:hover {
background: rgba(124, 58, 237, 0.4);
border-color: rgba(124, 58, 237, 0.6);
}
.docs-modal-content {
padding: 24px !important;
overflow-y: auto !important;
max-height: calc(55vh - 80px) !important;
color: #c7d2fe !important;
font-size: 15px !important;
line-height: 1.7 !important;
background: #0d0d1a !important;
}
.docs-modal-content h1 { font-size: 28px; color: #e0e7ff; margin: 0 0 16px 0; padding-bottom: 12px; border-bottom: 2px solid rgba(124, 58, 237, 0.3); }
.docs-modal-content h2 { font-size: 22px; color: #e0e7ff; margin: 24px 0 12px 0; }
.docs-modal-content h3 { font-size: 18px; color: #a5b4fc; margin: 20px 0 10px 0; }
.docs-modal-content p { margin: 12px 0; }
.docs-modal-content ul, .docs-modal-content ol { margin: 12px 0; padding-left: 24px; }
.docs-modal-content li { margin: 6px 0; }
.docs-modal-content code { background: rgba(124, 58, 237, 0.2); padding: 2px 6px; border-radius: 4px; font-family: 'SF Mono', 'Monaco', 'Consolas', monospace; font-size: 13px; color: #c4b5fd; }
.docs-modal-content pre { background: rgba(0, 0, 0, 0.4); border: 1px solid rgba(124, 58, 237, 0.2); border-radius: 12px; padding: 16px; overflow-x: auto; margin: 16px 0; white-space: pre; }
.docs-modal-content pre code { background: transparent; padding: 0; color: #a5b4fc; white-space: pre; display: block; }
.docs-modal-content table { width: 100%; border-collapse: collapse; margin: 16px 0; }
.docs-modal-content th, .docs-modal-content td { padding: 10px 12px; text-align: left; border: 1px solid rgba(124, 58, 237, 0.2); }
.docs-modal-content th { background: rgba(124, 58, 237, 0.15); color: #e0e7ff; font-weight: 600; }
.docs-modal-content td { color: #c7d2fe; }
.docs-modal-content a { color: #a78bfa; text-decoration: none; }
.docs-modal-content a:hover { text-decoration: underline; }
.docs-modal-content strong { color: #e0e7ff; }
.docs-modal-content img { max-width: 100%; max-height: 400px; height: auto; border-radius: 8px; margin: 12px 0; object-fit: contain; }
/* ===== CARD STYLES ===== */
.card {
background: #0f0f23 !important;
background-color: #0f0f23 !important;
border: 1px solid #3d2a6b !important;
border-radius: 20px;
padding: 30px;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.4);
transition: all 0.3s ease;
height: 100%;
display: flex;
flex-direction: column;
}
.card:hover {
border-color: #5b3d99 !important;
box-shadow: 0 12px 48px rgba(124, 58, 237, 0.3);
}
/* Ensure columns in top row have equal height */
.gradio-row:first-of-type .gradio-column {
display: flex !important;
flex-direction: column !important;
}
.gradio-row:first-of-type .gradio-column > div {
flex: 1 !important;
display: flex !important;
flex-direction: column !important;
}
/* Set minimum height for top row cards */
.gradio-row:first-of-type .card {
min-height: 550px;
}
.card-title {
font-size: 16px;
font-weight: 700;
color: #a5b4fc;
text-transform: uppercase;
letter-spacing: 1px;
margin-bottom: 20px;
display: flex;
align-items: center;
}
/* ===== ROW SPACING ===== */
.gradio-row {
gap: 24px !important;
}
/* ===== QUICK START - CODE BLOCK (TERMINAL/IDE STYLE) ===== */
.terminal-window {
background: #1a1b26;
border: 1px solid rgba(124, 58, 237, 0.3);
border-radius: 12px;
overflow: hidden;
margin-bottom: 16px;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.6);
}
.terminal-header {
background: #16161e;
padding: 12px 16px;
display: flex;
align-items: center;
justify-content: space-between;
border-bottom: 1px solid rgba(124, 58, 237, 0.2);
}
.terminal-dots {
display: flex;
gap: 8px;
}
.terminal-dot {
width: 12px;
height: 12px;
border-radius: 50%;
}
.terminal-dot.red {
background: #ff5f56 !important;
box-shadow: 0 0 8px rgba(255, 95, 86, 0.8) !important;
}
.terminal-dot.yellow {
background: #ffbd2e !important;
box-shadow: 0 0 8px rgba(255, 189, 46, 0.8) !important;
}
.terminal-dot.green {
background: #27c93f !important;
box-shadow: 0 0 8px rgba(39, 201, 63, 0.8) !important;
}
.terminal-title {
font-size: 12px;
color: #6b7280;
font-family: 'SF Mono', 'Monaco', 'Consolas', monospace;
font-weight: 500;
}
.terminal-body {
background: #1a1b26;
padding: 0;
display: flex;
}
.line-numbers {
background: #16161e;
padding: 16px 12px;
border-right: 1px solid rgba(124, 58, 237, 0.15);
user-select: none;
text-align: right;
min-width: 48px;
}
.line-num {
display: block;
color: #4a5568;
font-family: 'SF Mono', 'Monaco', 'Consolas', monospace;
font-size: 14px;
line-height: 1.8;
}
.code-content {
flex: 1;
padding: 16px 20px;
overflow-x: auto;
}
.code-line {
display: block;
white-space: pre;
font-family: 'SF Mono', 'Monaco', 'Consolas', monospace;
font-size: 14px;
line-height: 1.8;
color: #a9b1d6;
}
.json-key {
color: #7dcfff;
font-weight: 500;
}
.json-string {
color: #9ece6a;
}
.json-bracket {
color: #bb9af7;
font-weight: 600;
}
.json-colon {
color: #c0caf5;
}
.json-comma {
color: #c0caf5;
}
.copy-button {
width: 100%;
background: linear-gradient(135deg, #7c3aed, #6366f1) !important;
border: none !important;
border-radius: 12px !important;
padding: 14px 24px !important;
font-weight: 700 !important;
font-size: 13px !important;
color: white !important;
text-transform: uppercase;
letter-spacing: 1px;
cursor: pointer;
box-shadow: 0 4px 16px rgba(124, 58, 237, 0.4) !important;
transition: all 0.3s ease !important;
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
}
.copy-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 6px 24px rgba(124, 58, 237, 0.6) !important;
}
/* ===== TOOLS TABLE ===== */
.tools-table,
table.tools-table,
.light .tools-table,
.dark .tools-table,
[data-theme="light"] .tools-table,
[data-theme="dark"] .tools-table {
width: 100%;
border-collapse: separate;
border-spacing: 0;
background: #0d0d1f !important;
background-color: #0d0d1f !important;
border-radius: 12px;
overflow: hidden;
border: 1px solid #3d2a6b !important;
margin-bottom: 0;
flex: 1;
color: #cbd5e1 !important;
}
.tools-table th,
table.tools-table th,
.light .tools-table th,
.dark .tools-table th,
[data-theme="light"] .tools-table th,
[data-theme="dark"] .tools-table th {
background: #1f1545 !important;
background-color: #1f1545 !important;
color: #a5b4fc !important;
font-weight: 700;
font-size: 16px;
text-transform: uppercase;
letter-spacing: 1.5px;
padding: 20px 14px;
text-align: left;
border-bottom: 1px solid #3d2a6b !important;
}
.tools-table td,
table.tools-table td,
.light .tools-table td,
.dark .tools-table td,
[data-theme="light"] .tools-table td,
[data-theme="dark"] .tools-table td {
padding: 20px 14px;
color: #cbd5e1 !important;
background: #0d0d1f !important;
background-color: #0d0d1f !important;
font-size: 16px;
border-bottom: 1px solid #1a1535 !important;
}
.tools-table tr:last-child td {
border-bottom: none;
}
.tools-table tr:hover,
.tools-table tr:hover td {
background: #1a1540 !important;
background-color: #1a1540 !important;
}
.tool-name,
.light .tool-name,
.dark .tool-name,
[data-theme="light"] .tool-name,
[data-theme="dark"] .tool-name {
color: #22d3ee !important;
font-family: 'SF Mono', 'Monaco', 'Consolas', monospace;
font-weight: 600;
font-size: 13px;
vertical-align: middle;
}
/* ===== COMPOSITE SECTION ===== */
.composite-section,
.light .composite-section,
.dark .composite-section,
[data-theme="light"] .composite-section,
[data-theme="dark"] .composite-section {
background: #0d0d1f !important;
background-color: #0d0d1f !important;
border: 1px solid #3d2a6b !important;
border-radius: 12px;
padding: 20px;
color: #cbd5e1 !important;
}
.composite-header,
.light .composite-header,
.dark .composite-header,
[data-theme="light"] .composite-header,
[data-theme="dark"] .composite-header {
font-size: 11px;
font-weight: 700;
color: #a5b4fc !important;
text-transform: uppercase;
letter-spacing: 1.5px;
margin-bottom: 12px;
}
.composite-content,
.light .composite-content,
.dark .composite-content,
[data-theme="light"] .composite-content,
[data-theme="dark"] .composite-content {
color: #cbd5e1 !important;
font-size: 12px;
line-height: 1.6;
margin-bottom: 16px;
}
.try-demo-button {
width: 100%;
background: transparent !important;
border: 2px solid #7c3aed !important;
border-radius: 12px !important;
padding: 12px 24px !important;
font-weight: 700 !important;
font-size: 12px !important;
color: #7c3aed !important;
text-transform: uppercase;
letter-spacing: 1px;
cursor: pointer;
transition: all 0.3s ease !important;
}
.try-demo-button:hover {
background: rgba(124, 58, 237, 0.1) !important;
border-color: #7c3aed !important;
color: #8b5cf6 !important;
}
/* ===== BUTTONS ===== */
button[variant="primary"] {
background: linear-gradient(135deg, #7c3aed, #6366f1) !important;
border: none !important;
border-radius: 12px !important;
padding: 14px 32px !important;
font-weight: 700 !important;
font-size: 14px !important;
color: white !important;
box-shadow: 0 4px 20px rgba(124, 58, 237, 0.4) !important;
transition: all 0.3s ease !important;
}
button[variant="primary"]:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 32px rgba(124, 58, 237, 0.6) !important;
}
/* ===== AUDIO COMPONENT ===== */
.gradio-audio {
background: rgba(30, 27, 75, 0.6) !important;
border: 1px solid rgba(124, 58, 237, 0.3) !important;
border-radius: 12px !important;
}
/* ===== TEXTBOX ===== */
textarea {
background: rgba(30, 27, 75, 0.6) !important;
border: 1px solid rgba(124, 58, 237, 0.3) !important;
border-radius: 12px !important;
color: #e0e7ff !important;
font-size: 13px !important;
}
/* ===== DROPDOWN ===== */
select {
background: rgba(30, 27, 75, 0.6) !important;
border: 1px solid rgba(124, 58, 237, 0.3) !important;
border-radius: 12px !important;
color: #e0e7ff !important;
}
/* ===== LABELS ===== */
label {
color: #a5b4fc !important;
font-weight: 600 !important;
font-size: 12px !important;
text-transform: uppercase;
letter-spacing: 0.5px;
}
/* ===== HTML OUTPUT ===== */
.gradio-html {
background: transparent !important;
border: none !important;
}
/* ===== DEMO ROW LAYOUT ===== */
.demo-row {
display: flex !important;
gap: 24px !important;
align-items: stretch !important;
}
/* Only apply card style to the outer column (demo-card-column) */
.demo-card-column {
display: flex !important;
flex-direction: column !important;
height: 700px !important;
min-height: 700px !important;
max-height: 700px !important;
background: rgba(15, 15, 35, 0.8) !important;
backdrop-filter: blur(20px) !important;
border: 1px solid rgba(124, 58, 237, 0.3) !important;
border-radius: 20px !important;
padding: 4px 4px 2px 4px !important;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.4) !important;
transition: all 0.3s ease !important;
gap: 2px !important;
overflow-y: auto !important;
}
.demo-card-column:hover {
border-color: rgba(124, 58, 237, 0.5) !important;
box-shadow: 0 12px 48px rgba(124, 58, 237, 0.3) !important;
}
/* Remove any border/background from inner elements */
.demo-card-column > div,
.demo-card-column > div > div,
.demo-row > div > div {
background: transparent !important;
border: none !important;
box-shadow: none !important;
padding: 0 !important;
border-radius: 0 !important;
}
/* Remove card background from inner HTML - we use column background instead */
.demo-row .card {
background: transparent !important;
backdrop-filter: none !important;
border: none !important;
border-radius: 0 !important;
padding: 0 !important;
box-shadow: none !important;
margin-bottom: 12px !important;
}
.demo-row .card:hover {
border: none !important;
box-shadow: none !important;
}
/* Ensure all inner components have transparent background */
.demo-row .gradio-audio,
.demo-row .gradio-dropdown,
.demo-row .gradio-textbox,
.demo-row .gradio-button {
background: transparent !important;
}
/* Create a wrapper for input elements (flex container) */
.demo-card-column > div:not(:last-child) {
flex: 0 0 auto !important;
}
/* Adjust spacing for input elements in demo cards */
.demo-row .gradio-audio {
margin-top: 6px !important;
margin-bottom: 0px !important;
max-height: 50px !important;
min-height: 40px !important;
height: 45px !important;
}
/* Target all child elements of audio component */
.demo-row .gradio-audio > div,
.demo-row .gradio-audio .wrap,
.demo-row .gradio-audio .upload-container,
.demo-row .gradio-audio .record-container,
.demo-row .gradio-audio * {
max-height: 50px !important;
}
/* Audio player specific height reduction */
.demo-row .gradio-audio audio {
height: 26px !important;
max-height: 26px !important;
min-height: 26px !important;
}
/* Upload/record button container height */
.demo-row .gradio-audio .upload-container,
.demo-row .gradio-audio .record-container {
min-height: 38px !important;
max-height: 38px !important;
padding: 4px !important;
}
/* Audio component buttons */
.demo-row .gradio-audio button {
height: 28px !important;
min-height: 28px !important;
max-height: 28px !important;
padding: 4px 10px !important;
font-size: 10px !important;
}
/* Hide text nodes in audio upload area - keep icons */
.demo-row .gradio-audio .upload-text {
display: none !important;
}
.demo-row .gradio-audio .placeholder {
display: none !important;
}
.demo-row .gradio-audio span:not(:has(svg)) {
font-size: 0 !important;
}
.demo-row .gradio-audio p {
display: none !important;
}
/* Hide "Drop Audio Here", "- or -", "Click to Upload" text */
.demo-row .gradio-audio .upload-container span,
.demo-row .gradio-audio .upload-container p {
font-size: 0 !important;
line-height: 0 !important;
}
/* Keep SVG icons visible */
.demo-row .gradio-audio svg {
font-size: initial !important;
}
/* ADDITIONAL METHODS: Hide all text in audio upload area */
.demo-row .gradio-audio label {
font-size: 0 !important;
}
.demo-row .gradio-audio label span:not(:has(svg)) {
display: none !important;
}
.demo-row .gradio-audio .file-preview {
font-size: 0 !important;
}
.demo-row .gradio-audio .file-preview span {
font-size: 0 !important;
display: none !important;
}
.demo-row .gradio-audio [data-testid="upload-text"],
.demo-row .gradio-audio [data-testid="file-preview-text"],
.demo-row .gradio-audio .upload-text,
.demo-row .gradio-audio .file-preview-text {
display: none !important;
visibility: hidden !important;
font-size: 0 !important;
}
/* Target all text nodes (more aggressive) */
.demo-row .gradio-audio *:not(svg):not(path):not(circle):not(rect):not(line) {
color: transparent !important;
}
.demo-row .gradio-audio button {
color: white !important;
}
/* Ensure icons remain visible */
.demo-row .gradio-audio svg,
.demo-row .gradio-audio svg * {
color: initial !important;
fill: currentColor !important;
stroke: currentColor !important;
}
/* NUCLEAR OPTION: Hide everything in label, then show only necessary elements */
.demo-row .gradio-audio label > div > div {
display: none !important;
}
.demo-row .gradio-audio label::before {
content: '' !important;
}
.demo-row .gradio-audio label * {
visibility: hidden !important;
}
.demo-row .gradio-audio label svg {
visibility: visible !important;
}
.demo-row .gradio-audio label button {
visibility: visible !important;
}
.demo-row .gradio-audio label audio {
visibility: visible !important;
}
/* Force hide any text content */
.demo-row .gradio-audio label > div::after,
.demo-row .gradio-audio label > div::before {
content: '' !important;
display: none !important;
}
/* Additional override for upload text elements */
.demo-row .gradio-audio [class*="upload"],
.demo-row .gradio-audio [class*="placeholder"],
.demo-row .gradio-audio [class*="text"] {
font-size: 0 !important;
line-height: 0 !important;
width: 0 !important;
height: 0 !important;
opacity: 0 !important;
visibility: hidden !important;
position: absolute !important;
left: -9999px !important;
}
/* NUCLEAR OPTION 2: Complete removal of label content */
.demo-row .gradio-audio label.block {
display: none !important;
}
.demo-row .gradio-audio .file-upload {
display: none !important;
}
/* Hide all direct text children */
.demo-row .gradio-audio label > span:not(:has(button)):not(:has(audio)):not(:has(svg)) {
display: none !important;
}
/* Gradio 6.0 specific selectors - upload area */
.demo-row .gradio-audio [data-testid="upload-button"],
.demo-row .gradio-audio [data-testid="file-upload"],
.demo-row .gradio-audio .upload-area {
display: none !important;
}
/* Hide all paragraph elements in audio component */
.demo-row .gradio-audio label p,
.demo-row .gradio-audio label span.text,
.demo-row .gradio-audio label div.text {
display: none !important;
}
/* More aggressive text hiding - target by content */
.demo-row .gradio-audio *::before,
.demo-row .gradio-audio *::after {
content: '' !important;
display: none !important;
}
/* Make sure only buttons and audio players are visible */
.demo-row .gradio-audio > label > div > div:not(:has(button)):not(:has(audio)) {
display: none !important;
}
/* Gradio Blocks specific - Hide wrapper divs that contain text */
.demo-row .gradio-audio .wrap > div:not(:has(button)):not(:has(audio)):not(:has(svg)) {
display: none !important;
}
/* Override for Gradio 6.x structure */
.demo-row .gradio-audio [class*="svelte-"] span:not(:has(svg)):not(:has(button)) {
display: none !important;
}
.demo-row .gradio-dropdown,
.demo-row .gradio-textbox {
margin-bottom: 2px !important;
}
.demo-row .gradio-row {
margin-bottom: 2px !important;
}
/* IMPORTANT: Button alignment - push buttons to bottom with margin-top: auto */
.demo-row .gradio-button {
margin-top: auto !important;
margin-bottom: 0px !important;
flex-shrink: 0 !important;
}
/* Output area should not push button down - set flex: 1 */
.demo-row .gradio-html {
flex: 1 !important;
margin-bottom: 0 !important;
display: flex !important;
flex-direction: column !important;
max-height: 300px !important;
overflow-y: auto !important;
}
/* Output audio component (clean_audio_output) height limit */
.demo-row .gradio-audio[data-testid="audio-output"],
.demo-row > div:last-child .gradio-audio {
max-height: 120px !important;
min-height: 60px !important;
height: auto !important;
margin-bottom: 0px !important;
}
/* ===== CUSTOM ACTION BUTTONS (DEMO CARDS) ===== */
.custom-action-btn,
.custom-action-btn button,
.custom-action-btn button[data-testid="button"],
button.custom-action-btn,
.demo-row .custom-action-btn,
.demo-row .custom-action-btn button {
width: 100% !important;
min-width: 100% !important;
max-width: 100% !important;
background: linear-gradient(135deg, #6366f1, #7c3aed) !important;
border: none !important;
border-radius: 12px !important;
padding: 8px 16px !important;
height: 38px !important;
min-height: 38px !important;
max-height: 38px !important;
font-weight: 700 !important;
font-size: 16px !important;
letter-spacing: 1.5px !important;
text-transform: uppercase !important;
color: white !important;
box-shadow: 0 4px 20px rgba(124, 58, 237, 0.4) !important;
transition: all 0.3s ease !important;
}
.custom-action-btn:hover,
.custom-action-btn button:hover,
.custom-action-btn button[data-testid="button"]:hover,
button.custom-action-btn:hover,
.demo-row .custom-action-btn:hover,
.demo-row .custom-action-btn button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 32px rgba(124, 58, 237, 0.6) !important;
background: linear-gradient(135deg, #6366f1, #7c3aed) !important;
}
/* ===== DECORATIVE ELEMENTS ===== */
.diamond-decoration {
position: fixed;
bottom: 40px;
right: 40px;
width: 80px;
height: 80px;
border: 2px solid rgba(124, 58, 237, 0.2);
transform: rotate(45deg);
pointer-events: none;
z-index: 1;
}
.star-decoration {
display: none;
}
"""
with gr.Blocks() as demo:
# Inject custom CSS and decorative elements (positioned fixed, no DOM space)
gr.HTML(f"""
""")
# ==================== HEADER (FLOATING) ====================
gr.HTML(f"""
VoiceKit
MCP Server
""")
# ==================== TOP ROW: QUICK START + AVAILABLE TOOLS ====================
with gr.Row(equal_height=True):
# QUICK START CARD
with gr.Column(scale=1):
gr.HTML("""
QUICK START
claude_desktop_config.json
1
2
3
4
5
6
7
8
9
10
11
12
{
"mcpServers": {
"voicekit": {
"command": "npx",
"args": [
"-y",
"mcp-remote",
"https://mcp-1st-birthday-voicekit.hf.space/gradio_api/mcp/sse"
]
}
}
}
""")
# AVAILABLE TOOLS CARD
with gr.Column(scale=1):
gr.HTML("""
AVAILABLE TOOLS
TOOL
PURPOSE
INPUT
OUTPUT
extract_embedding
Extract 768-dim voice fingerprint
audio_base64
embedding, model, dim
match_voice
Compare two voice similarities
audio1_base64, audio2_base64
similarity, tone_score
analyze_acoustics
Analyze pitch, energy, rhythm, tempo
audio_base64
pitch, energy, rhythm, tempo
transcribe_audio
Convert speech to text
audio_base64, language
text, language, model
isolate_voice
Remove background music/noise
audio_base64
isolated_audio_base64, metadata
grade_voice
5-metric comprehensive analysis
user_audio, reference_audio, text, category
overall, metrics, feedback
""")
# ==================== FIRST ROW: 3 DEMO CARDS ====================
with gr.Row(equal_height=True, elem_classes="demo-row"):
# EXTRACT EMBEDDING
with gr.Column(scale=1, elem_classes="demo-card-column"):
gr.HTML("""
EXTRACT EMBEDDING
""")
embedding_audio = gr.Audio(
type="filepath",
label="Audio Input",
show_label=False,
format="wav"
)
embedding_btn = gr.Button("EXTRACT", variant="primary", size="lg", elem_classes="custom-action-btn")
embedding_output = gr.HTML(value=create_embedding_empty())
embedding_btn.click(
demo_extract_embedding,
inputs=[embedding_audio],
outputs=[embedding_output],
api_visibility="private"
)
# COMPARE VOICES
with gr.Column(scale=1, elem_classes="demo-card-column"):
gr.HTML("""
MATCH VOICE
""")
with gr.Row():
compare_audio1 = gr.Audio(
type="filepath",
label="Audio 1",
show_label=False,
format="wav"
)
compare_audio2 = gr.Audio(
type="filepath",
label="Audio 2",
show_label=False,
format="wav"
)
compare_btn = gr.Button("COMPARE", variant="primary", size="lg", elem_classes="custom-action-btn")
compare_output = gr.HTML(value=create_compare_empty())
compare_btn.click(
demo_match_voice,
inputs=[compare_audio1, compare_audio2],
outputs=[compare_output],
api_visibility="private"
)
# ACOUSTIC ANALYSIS
with gr.Column(scale=1, elem_classes="demo-card-column"):
gr.HTML("""
ANALYZE ACOUSTICS
""")
acoustic_audio = gr.Audio(
type="filepath",
label="Audio Input",
show_label=False,
format="wav"
)
acoustic_btn = gr.Button("ANALYZE", variant="primary", size="lg", elem_classes="custom-action-btn")
acoustic_output = gr.HTML(value=create_acoustic_empty())
acoustic_btn.click(
demo_acoustic_analysis,
inputs=[acoustic_audio],
outputs=[acoustic_output],
api_visibility="private"
)
# ==================== SECOND ROW: 3 MORE DEMO CARDS ====================
with gr.Row(equal_height=True, elem_classes="demo-row"):
# AUDIO TRANSCRIPTION
with gr.Column(scale=1, elem_classes="demo-card-column"):
gr.HTML("""
TRANSCRIBE AUDIO
""")
transcribe_audio_input = gr.Audio(
type="filepath",
label="Audio Input",
show_label=False,
format="wav"
)
transcribe_btn = gr.Button("TRANSCRIBE", variant="primary", size="lg", elem_classes="custom-action-btn")
transcribe_output = gr.HTML(value=create_transcription_empty())
transcribe_btn.click(
lambda audio: demo_transcribe_audio(audio, "en"),
inputs=[transcribe_audio_input],
outputs=[transcribe_output],
api_visibility="private"
)
# CLEAN AUDIO EXTRACTION
with gr.Column(scale=1, elem_classes="demo-card-column"):
gr.HTML("""
ISOLATE VOICE
""")
clean_audio_input = gr.Audio(
type="filepath",
label="Audio with Background",
show_label=False,
format="wav"
)
clean_btn = gr.Button("EXTRACT VOICE", variant="primary", size="lg", elem_classes="custom-action-btn")
clean_audio_output = gr.Audio(label="Clean Audio", type="filepath", visible=True)
clean_btn.click(
demo_clean_extraction,
inputs=[clean_audio_input],
outputs=[clean_audio_output],
api_visibility="private"
)
# VOICE SIMILARITY
with gr.Column(scale=1, elem_classes="demo-card-column"):
gr.HTML("""
GRADE VOICE
""")
with gr.Row():
similarity_user_audio = gr.Audio(
type="filepath",
label="User Audio",
show_label=False,
format="wav"
)
similarity_ref_audio = gr.Audio(
type="filepath",
label="Reference Audio",
show_label=False,
format="wav"
)
similarity_btn = gr.Button("ANALYZE", variant="primary", size="lg", elem_classes="custom-action-btn")
similarity_output = gr.HTML(value=create_similarity_empty())
similarity_btn.click(
demo_voice_similarity,
inputs=[similarity_user_audio, similarity_ref_audio],
outputs=[similarity_output],
api_visibility="private"
)
# ==================== MCP TOOL INTERFACES (HIDDEN, API ONLY) ====================
with gr.Row(visible=False):
# extract_embedding
mcp_emb_input = gr.Textbox()
mcp_emb_output = gr.Textbox()
mcp_emb_btn = gr.Button()
mcp_emb_btn.click(extract_embedding, inputs=[mcp_emb_input], outputs=[mcp_emb_output])
# match_voice
mcp_cmp_input1 = gr.Textbox()
mcp_cmp_input2 = gr.Textbox()
mcp_cmp_output = gr.Textbox()
mcp_cmp_btn = gr.Button()
mcp_cmp_btn.click(match_voice, inputs=[mcp_cmp_input1, mcp_cmp_input2], outputs=[mcp_cmp_output])
# analyze_acoustics
mcp_ac_input = gr.Textbox()
mcp_ac_output = gr.Textbox()
mcp_ac_btn = gr.Button()
mcp_ac_btn.click(analyze_acoustics, inputs=[mcp_ac_input], outputs=[mcp_ac_output])
# transcribe_audio
mcp_tr_input = gr.Textbox()
mcp_tr_lang = gr.Textbox(value="en")
mcp_tr_output = gr.Textbox()
mcp_tr_btn = gr.Button()
mcp_tr_btn.click(transcribe_audio, inputs=[mcp_tr_input, mcp_tr_lang], outputs=[mcp_tr_output])
# isolate_voice
mcp_iso_input = gr.Textbox()
mcp_iso_output = gr.Textbox()
mcp_iso_btn = gr.Button()
mcp_iso_btn.click(isolate_voice, inputs=[mcp_iso_input], outputs=[mcp_iso_output])
# grade_voice
mcp_sim_user = gr.Textbox()
mcp_sim_ref = gr.Textbox()
mcp_sim_text = gr.Textbox()
mcp_sim_cat = gr.Textbox(value="meme")
mcp_sim_output = gr.Textbox()
mcp_sim_btn = gr.Button()
mcp_sim_btn.click(grade_voice, inputs=[mcp_sim_user, mcp_sim_ref, mcp_sim_text, mcp_sim_cat], outputs=[mcp_sim_output])
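    # Smoke-test sketch (assumptions: gradio_client installed, the Space running,
    # and auto-generated endpoint names -- check the Space's "View API" page for
    # the real api_name values):
    #
    #   from gradio_client import Client
    #   client = Client("https://mcp-1st-birthday-voicekit.hf.space/")
    #   client.predict("<base64 audio>", api_name="/extract_embedding")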
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
mcp_server=True
)