"""
Main Gradio application with MCP server functionality.

This module provides the main entry point for the hf-eda-mcp server,
creating Gradio interfaces for EDA tools and enabling MCP server functionality.
"""

import gradio as gr
import sys
from typing import Optional

from hf_eda_mcp.tools.metadata import get_dataset_metadata
from hf_eda_mcp.tools.sampling import get_dataset_sample
from hf_eda_mcp.tools.analysis import analyze_dataset_features
from hf_eda_mcp.tools.search import search_text_in_dataset
from hf_eda_mcp.config import ServerConfig, setup_logging, validate_config, set_config


def create_gradio_app(config: ServerConfig) -> gr.Blocks:
    """Create and configure the main Gradio application with MCP server."""

    # Create main app with MCP tool interfaces
    with gr.Blocks(
        title="HF EDA MCP Server",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        """,
    ) as app:
        gr.Markdown(
            """
            # πŸ“Š HuggingFace EDA MCP Server

            **MCP server for exploratory data analysis of HuggingFace datasets**

            This server provides four tools for dataset exploration that are automatically exposed as MCP tools.
            """
        )
        with gr.Row():
            gr.HTML(
                """
                <div style="display: flex; gap: 8px; justify-content: center; flex-wrap: wrap;">
                    <a href="https://www.youtube.com/watch?v=XdP7zGSb81k" target="_blank">
                        <img src="https://img.shields.io/badge/▢️_Demo_Video-FF0000?style=for-the-badge&logo=youtube&logoColor=white" alt="Demo Video">
                    </a>
                    <a href="https://www.linkedin.com/posts/khalil-guetari-00a61415a_mcp-server-for-huggingface-datasets-discovery-activity-7400587711838842880-2K8p" target="_blank">
                        <img src="https://img.shields.io/badge/LinkedIn_Post-0A66C2?style=for-the-badge&logo=linkedin&logoColor=white" alt="LinkedIn Post">
                    </a>
                    <a href="https://huggingface.co/spaces/MCP-1st-Birthday/hf-eda-mcp/blob/main/README.md" target="_blank">
                        <img src="https://img.shields.io/badge/πŸ“–_README-FFD21E?style=for-the-badge" alt="README">
                    </a>
                </div>
                """
            )

        # Create interfaces for each EDA tool - these will be automatically exposed as MCP tools
        with gr.Tab("πŸ“Š Dataset Metadata"):
            gr.Interface(
                fn=get_dataset_metadata,
                inputs=[
                    gr.Textbox(
                        label="dataset_id",
                        placeholder="e.g., imdb, squad, glue",
                        info="HuggingFace dataset identifier",
                    ),
                    gr.Textbox(
                        label="config_name",
                        placeholder="e.g., cola, sst2 (optional)",
                        info="Configuration name for multi-config datasets",
                    ),
                ],
                outputs=gr.JSON(label="Dataset Metadata"),
                title="Get Dataset Metadata",
                description="Retrieve comprehensive metadata for a HuggingFace dataset including size, features, splits, and configuration details.",
                examples=[
                    ["imdb", ""],
                    ["glue", "cola"],
                    ["squad", ""],
                    ["wikitext", "wikitext-2-raw-v1"],
                ],
            )

        with gr.Tab("πŸ” Dataset Sampling"):
            gr.Interface(
                fn=get_dataset_sample,
                inputs=[
                    gr.Textbox(
                        label="dataset_id",
                        placeholder="e.g., imdb, squad, glue",
                        info="HuggingFace dataset identifier",
                    ),
                    gr.Dropdown(
                        choices=["train", "validation", "test", "dev", "val"],
                        value="train",
                        label="split",
                        info="Dataset split to sample from",
                        allow_custom_value=True,
                    ),
                    gr.Slider(
                        minimum=1,
                        maximum=1000,
                        value=10,
                        step=1,
                        label="num_samples",
                        info="Number of samples to retrieve (max: 10000 for MCP)",
                    ),
                    gr.Textbox(
                        label="config_name",
                        placeholder="e.g., cola, sst2 (optional)",
                        info="Configuration name for multi-config datasets",
                    ),
                ],
                outputs=gr.JSON(label="Dataset Sample"),
                title="Get Dataset Sample",
                description="Retrieve a sample of rows from a HuggingFace dataset with support for different splits and configurable sample sizes.",
                examples=[
                    ["imdb", "train", 5, ""],
                    ["glue", "validation", 3, "cola"],
                    ["squad", "train", 2, ""],
                    ["wikitext", "test", 1, "wikitext-2-raw-v1"],
                ],
            )

        with gr.Tab("πŸ“ˆ Feature Analysis"):
            gr.Interface(
                fn=analyze_dataset_features,
                inputs=[
                    gr.Textbox(
                        label="dataset_id",
                        placeholder="e.g., imdb, squad, glue",
                        info="HuggingFace dataset identifier",
                    ),
                    gr.Dropdown(
                        choices=["train", "validation", "test", "dev", "val"],
                        value="train",
                        label="split",
                        info="Dataset split to analyze",
                        allow_custom_value=True,
                    ),
                    gr.Slider(
                        minimum=100,
                        maximum=10000,
                        value=1000,
                        step=100,
                        label="sample_size",
                        info="Number of samples to use for analysis (max: 50000 for MCP)",
                    ),
                    gr.Textbox(
                        label="config_name",
                        placeholder="e.g., cola, sst2 (optional)",
                        info="Configuration name for multi-config datasets",
                    ),
                ],
                outputs=gr.JSON(label="Analysis Results"),
                title="Analyze Dataset Features",
                description="Perform basic exploratory analysis on dataset features including statistics, missing values, and data quality assessment.",
                examples=[
                    ["imdb", "train", 1000, ""],
                    ["glue", "train", 500, "cola"],
                    ["squad", "validation", 800, ""],
                    ["wikitext", "train", 1200, "wikitext-2-raw-v1"],
                ],
            )

        with gr.Tab("πŸ”Ž Text Search"):
            gr.Interface(
                fn=search_text_in_dataset,
                inputs=[
                    gr.Textbox(
                        label="dataset_id",
                        placeholder="e.g., imdb, squad, glue",
                        info="HuggingFace dataset identifier",
                    ),
                    gr.Textbox(
                        label="config_name",
                        placeholder="e.g., cola, sst2",
                        info="Configuration name (required for search)",
                    ),
                    gr.Dropdown(
                        choices=["train", "validation", "test", "dev", "val"],
                        value="train",
                        label="split",
                        info="Dataset split to search in",
                        allow_custom_value=True,
                    ),
                    gr.Textbox(
                        label="query",
                        placeholder="Enter search query...",
                        info="Text to search for in the dataset",
                    ),
                    gr.Slider(
                        minimum=0,
                        maximum=1000,
                        value=0,
                        step=10,
                        label="offset",
                        info="Offset for pagination",
                    ),
                    gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=10,
                        step=1,
                        label="length",
                        info="Number of results to return",
                    ),
                ],
                outputs=gr.JSON(label="Search Results"),
                title="Search Text in Dataset",
                description="Search for text in text columns of a dataset. Only text columns are searched and only parquet datasets are supported.",
                examples=[
                    ["stanfordnlp/imdb", "plain_text", "train", "great movie", 0, 10],
                    ["rajpurkar/squad", "plain_text", "train", "president", 0, 5],
                    ["nyu-mll/glue", "cola", "train", "friends", 0, 10],
                ],
            )

        with gr.Tab("ℹ️ About"):
            gr.Markdown(
                f"""
                ## About HF EDA MCP Server

                This server implements the Model Context Protocol (MCP) to provide AI assistants
                with tools for exploring and analyzing HuggingFace datasets.

                ### Available MCP Tools

                1. **get_dataset_metadata**: Retrieve comprehensive dataset information
                2. **get_dataset_sample**: Sample data from datasets with configurable parameters
                3. **analyze_dataset_features**: Perform exploratory data analysis
                4. **search_text_in_dataset**: Search for text in dataset columns

                ### MCP Server Configuration

                MCP clients connect through Gradio's built-in MCP server. See the
                [README](https://huggingface.co/spaces/MCP-1st-Birthday/hf-eda-mcp/blob/main/README.md)
                for client configuration examples; the tool schema is served at `/gradio_api/mcp/schema`.

                ### Server Status

                - **MCP Tools**: 4 tools available
                - **Authentication**: To explore private or gated datasets, set `hf-api-token` in MCP configuration headers
                - **MCP Schema**: Available at `/gradio_api/mcp/schema`
                - **Cache Directory**: {config.cache_dir or "Default system cache"}
                - **Max Sample Size**: {config.max_sample_size:,}
                - **Request Timeout**: {config.request_timeout}s

                ### Documentation

                For full documentation, MCP client configuration, and local development instructions, see the [README](https://huggingface.co/spaces/MCP-1st-Birthday/hf-eda-mcp/blob/main/README.md).
                """
            )

    return app


def launch_server(
    config: Optional[ServerConfig] = None,
    port: Optional[int] = None,
    mcp_server: Optional[bool] = None,
    share: Optional[bool] = None,
) -> None:
    """
    Launch the Gradio app with MCP server enabled.

    Args:
        config: Server configuration object. If None, loads from environment
        port: Port to run the server on (overrides config)
        mcp_server: Whether to enable MCP server functionality (overrides config)
        share: Whether to create a public shareable link (overrides config)
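
    Example:
        # A minimal usage sketch (illustrative values, not the project's
        # canonical entry point): start on port 7861 with MCP enabled and
        # let environment variables supply the remaining settings.
        launch_server(port=7861, mcp_server=True)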
    """
    # Load configuration
    if config is None:
        config = ServerConfig.from_env()

    # Override config with explicit parameters
    if port is not None:
        config.port = port
    if mcp_server is not None:
        config.mcp_server = mcp_server
    if share is not None:
        config.share = share

    # Set global configuration for tools to use
    set_config(config)
    
    # Set up logging
    logger = setup_logging(config)

    logger.info("=" * 60)
    logger.info("πŸš€ Starting HuggingFace EDA MCP Server")
    logger.info("=" * 60)

    # Validate configuration
    try:
        validate_config(config)
    except Exception as e:
        logger.error(f"Configuration validation failed: {e}")
        sys.exit(1)
    
    logger.info(f"HF Token is: {config.hf_token}")

    # Log configuration
    logger.info("Server configuration:")
    logger.info(f"  - Host: {config.host}")
    logger.info(f"  - Port: {config.port}")
    logger.info(f"  - MCP server enabled: {config.mcp_server}")
    logger.info(f"  - Share enabled: {config.share}")
    logger.info(f"  - Log level: {config.log_level}")
    logger.info(f"  - Cache directory: {config.cache_dir or 'Default system cache'}")
    logger.info(f"  - Max sample size: {config.max_sample_size:,}")
    logger.info(f"  - Request timeout: {config.request_timeout}s")
    logger.info(f"  - Max concurrent requests: {config.max_concurrent_requests}")

    # Create the Gradio app
    try:
        logger.info("Creating Gradio application with EDA tools...")
        app = create_gradio_app(config)
        logger.info("βœ… Gradio application created successfully")
    except Exception as e:
        logger.error(f"Failed to create Gradio application: {e}")
        logger.exception("Full traceback:")
        sys.exit(1)

    # Configure launch parameters
    launch_kwargs = {
        "server_name": config.host,
        "server_port": config.port,
        "share": config.share,
        "show_error": True,
        "quiet": False,
        "footer_links": ["api", "gradio", "settings"],
        "theme": gr.themes.Soft(),
        "css": """
        .gradio-container {
            max-width: 1200px !important;
        }
        """,
        "max_threads": config.max_concurrent_requests,
    }

    # Add additional Gradio settings from config
    launch_kwargs.update(config.gradio_settings)

    # Add MCP server configuration
    if config.mcp_server:
        launch_kwargs["mcp_server"] = True
        logger.info("πŸ”— MCP server functionality enabled")
        logger.info("πŸ“Š MCP tools available:")
        logger.info("  - get_dataset_metadata: Retrieve dataset information")
        logger.info("  - get_dataset_sample: Sample data from datasets")
        logger.info("  - analyze_dataset_features: Perform EDA analysis")
        logger.info("  - search_text_in_dataset: Search for text in datasets")
        logger.info(
            f"🌐 MCP schema available at: http://{config.host}:{config.port}/gradio_api/mcp/schema"
        )
    else:
        logger.info("🌐 Running in web-only mode (MCP disabled)")

    # Launch the server
    try:
        logger.info("πŸš€ Launching Gradio application...")
        logger.info(f"🌐 Web interface: http://{config.host}:{config.port}")
        if config.share:
            logger.info("🌍 Public sharing enabled - shareable link will be generated")

        logger.info("=" * 60)
        logger.info("Server is starting... Press Ctrl+C to stop")
        logger.info("=" * 60)

        app.launch(**launch_kwargs)

    except KeyboardInterrupt:
        logger.info("πŸ‘‹ Server stopped by user (Ctrl+C)")
        sys.exit(0)
    except OSError as e:
        if "Address already in use" in str(e):
            logger.error(f"❌ Port {config.port} is already in use")
            logger.info(
                "πŸ’‘ Try using a different port with --port or HF_EDA_PORT environment variable"
            )
        else:
            logger.error(f"❌ Network error: {e}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"❌ Failed to launch server: {e}")
        logger.exception("Full traceback:")
        sys.exit(1)
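

# A minimal entry-point sketch, assuming this module may be run directly
# (e.g. as a Space's app file); the project may instead expose a console
# script that calls launch_server(), so treat this as illustrative.
if __name__ == "__main__":
    launch_server()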