"""
VoiceKit - MCP Server for Voice Analysis
6 MCP tools for voice processing (all accept base64 audio):
- Embedding extraction, voice comparison, acoustic analysis
- Speech-to-text, voice isolation, similarity analysis
MCP Endpoint: https://mcp-1st-birthday-voicekit.hf.space/gradio_api/mcp/sse
"""
import gradio as gr
import base64
import os
import json
import tempfile
import math
import re
# Set Gradio temp directory to current directory
GRADIO_TEMP_DIR = os.path.join(os.getcwd(), "gradio_temp")
os.makedirs(GRADIO_TEMP_DIR, exist_ok=True)
os.environ['GRADIO_TEMP_DIR'] = GRADIO_TEMP_DIR
tempfile.tempdir = GRADIO_TEMP_DIR
# Modal connection (requires MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets)
try:
import modal
AudioAnalyzer = modal.Cls.from_name("voice-semantle", "AudioAnalyzer")
analyzer = AudioAnalyzer()
modal_available = True
print("Modal connected!")
except Exception as e:
modal_available = False
analyzer = None
print(f"Modal not available: {e}")
# Load README.md and convert to HTML
def load_readme_as_html():
"""Load README.md and convert markdown to HTML"""
try:
with open("README.md", "r", encoding="utf-8") as f:
content = f.read()
# Remove YAML front matter
content = re.sub(r'^---\n.*?\n---\n', '', content, flags=re.DOTALL)
html = content
        # Headers
        html = re.sub(r'^### (.+)$', r'<h3>\1</h3>', html, flags=re.MULTILINE)
        html = re.sub(r'^## (.+)$', r'<h2>\1</h2>', html, flags=re.MULTILINE)
        html = re.sub(r'^# (.+)$', r'<h1>\1</h1>', html, flags=re.MULTILINE)
        # Code blocks
        html = re.sub(r'```(\w*)\n(.*?)```', r'<pre><code>\2</code></pre>', html, flags=re.DOTALL)
        # Inline code
        html = re.sub(r'`([^`]+)`', r'<code>\1</code>', html)
        # Bold
        html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)
        # Links
        html = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', html)
# Tables
lines = html.split('\n')
in_table = False
table_html = []
new_lines = []
for line in lines:
if '|' in line and line.strip().startswith('|'):
if not in_table:
in_table = True
                    table_html = ['<table>']
if re.match(r'^\|[\s\-:|]+\|$', line.strip()):
continue
cells = [c.strip() for c in line.strip().split('|')[1:-1]]
                if len(table_html) == 1:
                    # First row becomes the header
                    table_html.append('<tr>')
                    for cell in cells:
                        table_html.append(f'<th>{cell}</th>')
                    table_html.append('</tr>')
                else:
                    table_html.append('<tr>')
                    for cell in cells:
                        table_html.append(f'<td>{cell}</td>')
                    table_html.append('</tr>')
else:
if in_table:
                    table_html.append('</table>')
new_lines.append('\n'.join(table_html))
table_html = []
in_table = False
new_lines.append(line)
if in_table:
            table_html.append('</table>')
new_lines.append('\n'.join(table_html))
html = '\n'.join(new_lines)
        # Lists
        html = re.sub(r'^- (.+)$', r'<li>\1</li>', html, flags=re.MULTILINE)
        html = re.sub(r'(<li>.*</li>\n?)+', r'<ul>\g<0></ul>', html)
        # Paragraphs
        lines = html.split('\n')
        result = []
        for line in lines:
            stripped = line.strip()
            if stripped and not stripped.startswith('<') and not stripped.startswith('```'):
                result.append(f'<p>{line}</p>')
            else:
                result.append(line)
        return '\n'.join(result)
    except Exception as e:
        return f"<p>Could not load README.md: {e}</p>"
readme_html = load_readme_as_html()
def file_to_base64(file_path: str) -> str:
"""Convert file path to base64 string"""
if not file_path:
return ""
with open(file_path, "rb") as f:
return base64.b64encode(f.read()).decode()
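# Typical round trip (illustrative): encode a local clip, call a tool, parse
# the JSON string it returns.
#
#   audio_b64 = file_to_base64("sample.wav")
#   info = json.loads(extract_embedding(audio_b64))
#   print(info["model"], info["embedding_length"])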
# ============================================================================
# MCP Tools (all accept base64 directly)
# ============================================================================
def extract_embedding(audio_base64: str) -> str:
"""
Extract voice embedding using Wav2Vec2.
Returns a 768-dimensional vector representing voice characteristics.
Useful for voice comparison, speaker identification, etc.
Args:
audio_base64: Audio file as base64 encoded string
Returns:
embedding (768-dim list), model, dim
"""
if not modal_available:
return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
if not audio_base64:
return json.dumps({"error": "No audio provided"})
try:
result = analyzer.extract_embedding.remote(audio_base64)
if "embedding" in result:
result["embedding_preview"] = result["embedding"][:5] + ["..."]
result["embedding_length"] = len(result["embedding"])
del result["embedding"]
return json.dumps(result, ensure_ascii=False, indent=2)
except Exception as e:
return json.dumps({"error": str(e)})
def match_voice(audio1_base64: str, audio2_base64: str) -> str:
"""
Compare similarity between two voices.
Extracts Wav2Vec2 embeddings and calculates cosine similarity.
    Useful for checking whether two clips come from the same speaker or share a similar tone.
Args:
audio1_base64: First audio as base64 encoded string
audio2_base64: Second audio as base64 encoded string
Returns:
similarity (0-1), tone_score (0-100)
"""
if not modal_available:
return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
if not audio1_base64 or not audio2_base64:
return json.dumps({"error": "Both audio files required"})
try:
result = analyzer.compare_voices.remote(audio1_base64, audio2_base64)
return json.dumps(result, ensure_ascii=False, indent=2)
except Exception as e:
return json.dumps({"error": str(e)})
def analyze_acoustics(audio_base64: str) -> str:
"""
Analyze acoustic features of audio.
Extracts pitch, energy, rhythm, tempo, and spectral characteristics.
Useful for understanding voice expressiveness and characteristics.
Args:
audio_base64: Audio file as base64 encoded string
Returns:
pitch, energy, rhythm, tempo, spectral information
"""
if not modal_available:
return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
if not audio_base64:
return json.dumps({"error": "No audio provided"})
try:
result = analyzer.analyze_acoustic_features.remote(audio_base64)
return json.dumps(result, ensure_ascii=False, indent=2)
except Exception as e:
return json.dumps({"error": str(e)})
def transcribe_audio(audio_base64: str, language: str = "en") -> str:
"""
Convert audio to text (Speech-to-Text).
Uses ElevenLabs Scribe v1 model for high-quality speech recognition.
Supports various languages.
Args:
audio_base64: Audio file as base64 encoded string
language: Language code (e.g., "en", "ko", "ja"). Default is "en"
Returns:
text, language, model
"""
if not modal_available:
return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
if not audio_base64:
return json.dumps({"error": "No audio provided"})
try:
result = analyzer.transcribe_audio.remote(audio_base64, language)
return json.dumps(result, ensure_ascii=False, indent=2)
except Exception as e:
return json.dumps({"error": str(e)})
def isolate_voice(audio_base64: str) -> str:
"""
Remove background music (BGM) and extract voice only.
Uses ElevenLabs Voice Isolator to remove music, noise, etc.
Useful for memes, songs, and other audio with background sounds.
Args:
audio_base64: Audio file as base64 encoded string
Returns:
isolated_audio_base64, metadata (bgm_detected, sizes, duration)
"""
if not modal_available:
return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
if not audio_base64:
return json.dumps({"error": "No audio provided"})
try:
result = analyzer.isolate_voice.remote(audio_base64)
return json.dumps(result, ensure_ascii=False, indent=2)
except Exception as e:
return json.dumps({"error": str(e)})
def grade_voice(
user_audio_base64: str,
reference_audio_base64: str,
reference_text: str = "",
category: str = "meme"
) -> str:
"""
    Comprehensively compare and analyze a user's voice against a reference voice.
Evaluates with 5 metrics:
- pronunciation: Pronunciation accuracy (STT-based)
- tone: Voice timbre similarity (Wav2Vec2 embedding)
- pitch: Pitch matching
- rhythm: Rhythm sense
- energy: Energy expressiveness
Args:
user_audio_base64: User audio as base64 encoded string
reference_audio_base64: Reference audio as base64 encoded string
reference_text: Reference text (optional, enables pronunciation scoring)
category: Category (meme, song, movie) - determines weights
Returns:
overall_score, metrics, weak_points, strong_points, feedback
"""
if not modal_available:
return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
if not user_audio_base64 or not reference_audio_base64:
return json.dumps({"error": "Both user and reference audio required"})
try:
result = analyzer.analyze_audio.remote(
user_audio_base64=user_audio_base64,
reference_audio_base64=reference_audio_base64,
reference_text=reference_text if reference_text else None,
challenge_id="mcp_analysis",
category=category,
)
# Simplify output for backend/API use
metrics = result.get("metrics", {})
simple_result = {
"pitch": metrics.get("pitch", 0),
"rhythm": metrics.get("rhythm", 0),
"energy": metrics.get("energy", 0),
"pronunciation": metrics.get("pronunciation", 0),
"transcript": metrics.get("transcript", 0),
"overall": result.get("overall_score", 0),
"user_text": result.get("user_text", "")
}
return json.dumps(simple_result, ensure_ascii=False, indent=2)
except Exception as e:
return json.dumps({"error": str(e)})
# ============================================================================
# Demo Functions for UI
# ============================================================================
def demo_acoustic_analysis(audio_file):
"""Acoustic Analysis - Analyze pitch, energy, rhythm, tempo"""
if not audio_file:
return create_acoustic_empty()
audio_b64 = file_to_base64(audio_file)
result_json = analyze_acoustics(audio_b64)
try:
result = json.loads(result_json)
if "error" in result:
            return f'''<div style="color: #ef4444; text-align: center; padding: 40px;">
                Error in result: {result.get("error", "Unknown error")}
            </div>'''
return create_acoustic_visualization(result)
except Exception as e:
        return f'''<div style="color: #ef4444; padding: 20px;">
            <div>Parsing Error: {str(e)}</div>
            <div style="font-family: monospace; font-size: 12px; margin-top: 8px;">Raw Result (first 500 chars): {result_json[:500]}</div>
        </div>'''
def demo_transcribe_audio(audio_file, language):
"""Audio Transcription"""
if not audio_file:
return create_transcription_empty()
audio_b64 = file_to_base64(audio_file)
result_json = transcribe_audio(audio_b64, language)
try:
result = json.loads(result_json)
if "error" in result:
return create_transcription_empty()
text = result.get("text", "")
return create_transcription_visualization(text)
    except Exception:
return create_transcription_empty()
def demo_clean_extraction(audio_file):
"""Clean Audio Extraction - returns audio file only"""
if not audio_file:
return None
audio_b64 = file_to_base64(audio_file)
result_json = isolate_voice(audio_b64)
try:
result = json.loads(result_json)
if "error" in result:
return None
        # Convert isolated audio base64 back to file (tempfile is already imported at module level)
isolated_audio_bytes = base64.b64decode(result["isolated_audio_base64"])
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
tmp.write(isolated_audio_bytes)
isolated_audio_path = tmp.name
return isolated_audio_path
    except Exception:
return None
def demo_extract_embedding(audio_file):
"""Extract Embedding - extract voice fingerprint"""
if not audio_file:
return create_embedding_empty()
audio_b64 = file_to_base64(audio_file)
result_json = extract_embedding(audio_b64)
try:
result = json.loads(result_json)
if "error" in result:
            return f'''<div style="color: #ef4444; text-align: center; padding: 40px;">
                Error in result: {result.get("error", "Unknown error")}
            </div>'''
return create_embedding_visualization(result)
except Exception as e:
        return f'''<div style="color: #ef4444; padding: 20px;">
            <div>Parsing Error: {str(e)}</div>
            <div style="font-family: monospace; font-size: 12px; margin-top: 8px;">Raw Result (first 500 chars): {result_json[:500]}</div>
        </div>'''
def demo_match_voice(audio1, audio2):
"""Compare Voices - compare two voice similarities"""
if not audio1 or not audio2:
return create_compare_empty()
audio1_b64 = file_to_base64(audio1)
audio2_b64 = file_to_base64(audio2)
result_json = match_voice(audio1_b64, audio2_b64)
try:
result = json.loads(result_json)
if "error" in result:
return create_compare_empty()
return create_compare_visualization(result)
    except Exception:
return create_compare_empty()
def demo_voice_similarity(user_audio, ref_audio):
"""Voice Similarity - comprehensive 5-metric analysis"""
if not user_audio or not ref_audio:
return create_similarity_empty()
user_b64 = file_to_base64(user_audio)
ref_b64 = file_to_base64(ref_audio)
result_json = grade_voice(user_b64, ref_b64, "", "meme")
try:
result = json.loads(result_json)
if "error" in result:
return create_similarity_empty()
return create_similarity_visualization(result)
    except Exception:
return create_similarity_empty()
# ============================================================================
# Visualization Functions
# ============================================================================
def create_acoustic_empty():
"""Empty state for acoustic analysis"""
return """
Upload audio to analyze acoustic features
"""
def create_acoustic_visualization(result):
"""Acoustic analysis visualization with radar chart"""
pitch = result.get("pitch", {})
energy = result.get("energy", {})
rhythm = result.get("rhythm", {})
tempo = result.get("tempo", 0)
spectral = result.get("spectral", {})
# Use pre-calculated scores from Modal backend (already 0-100)
pitch_norm = pitch.get("score", 0)
energy_norm = energy.get("score", 0)
rhythm_norm = rhythm.get("score", 0)
spectral_norm = spectral.get("score", 0)
# Tempo: normalize BPM to 0-100 (60-180 BPM range)
tempo_bpm = tempo
tempo_norm = min(100, max(0, (tempo_bpm - 60) / 120 * 100)) if tempo_bpm > 0 else 0
# Radar chart calculation
center_x, center_y = 150, 150
radius = 110
# 5 metrics in order: Pitch(top), Energy(top-right), Rhythm(bottom-right), Tempo(bottom-left), Spectral(top-left)
metrics = [
("Pitch", pitch_norm, -90), # 0° - 90° = -90° (top)
("Energy", energy_norm, -18), # 72° - 90° = -18° (top-right)
("Rhythm", rhythm_norm, 54), # 144° - 90° = 54° (bottom-right)
("Tempo", tempo_norm, 126), # 216° - 90° = 126° (bottom-left)
("Spectral", spectral_norm, 198) # 288° - 90° = 198° (top-left)
]
# Calculate polygon points for data
data_points = []
for _, value, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
point_radius = (value / 100) * radius
x = center_x + point_radius * math.cos(angle_rad)
y = center_y + point_radius * math.sin(angle_rad)
data_points.append(f"{x:.2f},{y:.2f}")
# Background concentric pentagons (20, 40, 60, 80, 100)
def create_pentagon_points(scale):
points = []
for _, _, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
r = radius * scale
x = center_x + r * math.cos(angle_rad)
y = center_y + r * math.sin(angle_rad)
points.append(f"{x:.2f},{y:.2f}")
return " ".join(points)
background_pentagons = ""
for scale in [0.2, 0.4, 0.6, 0.8, 1.0]:
        background_pentagons += f'<polygon points="{create_pentagon_points(scale)}" fill="none" stroke="#3f3f46" stroke-width="1"/>'
# Axis lines from center to vertices
axis_lines = ""
for _, _, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
x = center_x + radius * math.cos(angle_rad)
y = center_y + radius * math.sin(angle_rad)
        axis_lines += f'<line x1="{center_x}" y1="{center_y}" x2="{x:.2f}" y2="{y:.2f}" stroke="#3f3f46" stroke-width="1"/>'
# Labels at vertices
labels = ""
for label, value, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
# Position label outside the pentagon
label_radius = radius + 25
x = center_x + label_radius * math.cos(angle_rad)
y = center_y + label_radius * math.sin(angle_rad)
        labels += f'''
        <text x="{x:.2f}" y="{y:.2f}" text-anchor="middle" fill="#a1a1aa" font-size="12">{label}</text>
        <text x="{x:.2f}" y="{y + 14:.2f}" text-anchor="middle" fill="#c084fc" font-size="13" font-weight="bold">{int(value)}</text>'''
return f"""
"""
def create_mimicry_empty():
"""Empty state for voice mimicry game"""
return """
Upload reference and your voice to see similarity scores
"""
def create_mimicry_visualization(result):
"""Voice mimicry score visualization with progress bars"""
pronunciation = result.get("pronunciation", 0)
    tone = result.get("transcript", 0)  # tone score (exposed under the "transcript" key)
pitch = result.get("pitch", 0)
rhythm = result.get("rhythm", 0)
energy = result.get("energy", 0)
def create_progress_bar(label, value):
return f"""
{label}
{value}
"""
return f"""
CLAUDE
Wow, that voice input, takes analytical skills of course but I'll handle it
"""
def create_transcription_empty():
"""Empty state for transcription"""
return """
Upload audio to transcribe
"""
def create_transcription_visualization(text):
"""Simple text display for transcription result"""
return f"""
{text if text else "Transcription completed"}
"""
def create_embedding_empty():
"""Empty state for embedding extraction"""
return """
Upload audio to extract voice embedding
"""
def create_embedding_visualization(result):
"""Embedding extraction visualization"""
model = result.get("model", "Wav2Vec2")
dim = result.get("embedding_length", result.get("dim", 768))
preview = result.get("embedding_preview", [])
# Filter only numeric values to avoid format errors with strings like "..."
if preview:
numeric_preview = [v for v in preview if isinstance(v, (int, float))]
preview_str = ", ".join([f"{v:.4f}" for v in numeric_preview]) if numeric_preview else "..."
else:
preview_str = "..."
return f"""
Model
{model}
Dimensions
{dim}
Preview
[{preview_str}]
"""
def create_compare_empty():
"""Empty state for voice comparison"""
return """
Upload two audio files to compare voices
"""
def create_compare_visualization(result):
"""Voice comparison visualization with similarity score"""
similarity = result.get("similarity", 0)
tone_score = result.get("tone_score", 0)
# Convert similarity to percentage
similarity_pct = int(similarity * 100)
# Color based on similarity - Purple theme matching VOICE SIMILARITY
if similarity_pct >= 80:
color = "#c084fc" # Light purple (high score)
elif similarity_pct >= 60:
color = "#a855f7" # Medium purple (medium score)
else:
color = "#7c3aed" # Dark purple (low score)
return f"""
{similarity_pct}SIMILARITY
"""
def create_similarity_empty():
"""Empty state for voice similarity analysis"""
return """
Upload audio files for comprehensive similarity analysis
"""
def create_similarity_visualization(result):
"""Voice similarity visualization with radar chart"""
overall = result.get("overall", 0)
pronunciation = result.get("pronunciation", 0)
transcript = result.get("transcript", 0)
pitch = result.get("pitch", 0)
rhythm = result.get("rhythm", 0)
energy = result.get("energy", 0)
# Color based on overall score - Purple theme
if overall >= 80:
color = "#c084fc" # Light purple (high score)
elif overall >= 60:
color = "#a855f7" # Medium purple (medium score)
else:
color = "#7c3aed" # Dark purple (low score)
# Radar chart calculation
center_x, center_y = 150, 150
radius = 110
# 5 metrics in order: Pronunciation(top), Transcript(top-right), Pitch(bottom-right), Energy(bottom-left), Rhythm(top-left)
metrics = [
("Pronunciation", pronunciation, -90), # 0° - 90° = -90° (top)
("Transcript", transcript, -18), # 72° - 90° = -18° (top-right)
("Pitch", pitch, 54), # 144° - 90° = 54° (bottom-right)
("Energy", energy, 126), # 216° - 90° = 126° (bottom-left)
("Rhythm", rhythm, 198) # 288° - 90° = 198° (top-left)
]
# Calculate polygon points for data
data_points = []
for _, value, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
point_radius = (value / 100) * radius
x = center_x + point_radius * math.cos(angle_rad)
y = center_y + point_radius * math.sin(angle_rad)
data_points.append(f"{x:.2f},{y:.2f}")
# Background concentric pentagons (20, 40, 60, 80, 100)
def create_pentagon_points(scale):
points = []
for _, _, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
r = radius * scale
x = center_x + r * math.cos(angle_rad)
y = center_y + r * math.sin(angle_rad)
points.append(f"{x:.2f},{y:.2f}")
return " ".join(points)
background_pentagons = ""
for scale in [0.2, 0.4, 0.6, 0.8, 1.0]:
        background_pentagons += f'<polygon points="{create_pentagon_points(scale)}" fill="none" stroke="#3f3f46" stroke-width="1"/>'
# Axis lines from center to vertices
axis_lines = ""
for _, _, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
x = center_x + radius * math.cos(angle_rad)
y = center_y + radius * math.sin(angle_rad)
        axis_lines += f'<line x1="{center_x}" y1="{center_y}" x2="{x:.2f}" y2="{y:.2f}" stroke="#3f3f46" stroke-width="1"/>'
# Labels at vertices
labels = ""
for label, value, angle_deg in metrics:
angle_rad = math.radians(angle_deg)
# Position label outside the pentagon
label_radius = radius + 25
x = center_x + label_radius * math.cos(angle_rad)
y = center_y + label_radius * math.sin(angle_rad)
        labels += f'''
        <text x="{x:.2f}" y="{y:.2f}" text-anchor="middle" fill="#a1a1aa" font-size="12">{label}</text>
        <text x="{x:.2f}" y="{y + 14:.2f}" text-anchor="middle" fill="#c084fc" font-size="13" font-weight="bold">{value}</text>'''
return f"""
""")
# ==================== TOP ROW: QUICK START + AVAILABLE TOOLS ====================
with gr.Row(equal_height=True):
# QUICK START CARD
with gr.Column(scale=1):
gr.HTML("""