Update app.py
app.py CHANGED
@@ -123,7 +123,12 @@ def submit(
     compute_selector,
     min_node_selector,
     max_node_selector,
-    security_selector
+    security_selector,
+    custom_kernel,
+    max_input_length,
+    max_tokens,
+    max_batch_prefill_token,
+    max_batch_total_token
 ):
     compute_resources = compute_selector.split("·")
     accelerator = compute_resources[0][:3].strip()
@@ -148,7 +153,19 @@ def submit(
         "model": {
             "framework": framework_selector.lower(),
             "image": {
-                "
+                "custom": {
+                    "health_route": "/health",
+                    "env": {
+                        "DISABLE_CUSTOM_KERNELS": "false" if custom_kernel == "Enabled" else "true",
+                        "MAX_BATCH_PREFILL_TOKENS": str(max_batch_prefill_token),
+                        "MAX_BATCH_TOTAL_TOKENS": str(max_batch_total_token),
+                        "MAX_INPUT_LENGTH": str(max_input_length),
+                        "MAX_TOTAL_TOKENS": str(max_tokens),
+                        "MODEL_ID": repository_selector.lower(),
+                        # QUANTIZE: "bitsandbytes" | "gptq"
+                    },
+                    "url": "ghcr.io/huggingface/text-generation-inference:1.0.1",
+                }
             },
             "repository": repository_selector.lower(),
             "revision": "main",
@@ -322,7 +339,7 @@ Name for your new endpoint""")
                 gr.Markdown("""### Custom Cuda Kernels
 
 TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
-                gr.Dropdown(
+                custom_kernel = gr.Dropdown(
                     value="Enabled",
                     choices=["Enabled", "Disabled"],
                     interactive=True,
@@ -347,7 +364,7 @@ Name for your new endpoint""")
                 gr.Markdown("""### Max Input Length (per Query)
 
 Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
-                gr.Number(
+                max_input_length = gr.Number(
                     value=1024,
                     interactive=True,
                     show_label=False,
@@ -358,7 +375,7 @@ Name for your new endpoint""")
                 gr.Markdown("""### Max Number of Tokens (per Query)
 
 The larger this value, the more memory each request will consume and the less effective batching can be.""")
-                gr.Number(
+                max_tokens = gr.Number(
                     value=1512,
                     interactive=True,
                     show_label=False,
@@ -370,7 +387,7 @@ Name for your new endpoint""")
                 gr.Markdown("""### Max Batch Prefill Tokens
 
 Number of prefill tokens used during continuous batching. It can be useful to adjust this number since the prefill operation is memory-intensive and compute-bound.""")
-                gr.Number(
+                max_batch_prefill_token = gr.Number(
                     value=2048,
                     interactive=True,
                     show_label=False,
@@ -381,7 +398,7 @@ Name for your new endpoint""")
                 gr.Markdown("""### Max Batch Total Tokens
 
 Number of tokens that can be passed before forcing waiting queries to be put on the batch. A value of 1000 can fit 10 queries of 100 tokens or a single query of 1000 tokens.""")
-                gr.Number(
+                max_batch_total_token = gr.Number(
                     value=None,
                     interactive=True,
                     show_label=False,
@@ -416,7 +433,12 @@ Name for your new endpoint""")
                 compute_selector,
                 min_node_selector,
                 max_node_selector,
-                security_selector],
+                security_selector,
+                custom_kernel,
+                max_input_length,
+                max_tokens,
+                max_batch_prefill_token,
+                max_batch_total_token],
             outputs=status_txt)
 
         with gr.Tab("AWS", elem_classes=["no-border"]):
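
Below is a minimal, self-contained sketch of the custom-image payload that the updated submit() assembles, with hypothetical stand-in values for the new Gradio inputs (custom_kernel, max_input_length, max_tokens, max_batch_prefill_token, max_batch_total_token) and for repository_selector. The field names and the TGI image tag are taken from the diff above, not from an authoritative Inference Endpoints schema.

# Sketch only: hypothetical values standing in for the Gradio components.
custom_kernel = "Enabled"            # Custom Cuda Kernels dropdown
max_input_length = 1024              # Max Input Length (per Query)
max_tokens = 1512                    # Max Number of Tokens (per Query)
max_batch_prefill_token = 2048       # Max Batch Prefill Tokens
max_batch_total_token = 16000        # Max Batch Total Tokens (assumed concrete value)
repository = "tiiuae/falcon-7b-instruct"  # hypothetical repository_selector value

image = {
    "custom": {
        "health_route": "/health",
        "env": {
            # TGI reads these as environment variables at container start-up.
            "DISABLE_CUSTOM_KERNELS": "false" if custom_kernel == "Enabled" else "true",
            "MAX_BATCH_PREFILL_TOKENS": str(max_batch_prefill_token),
            "MAX_BATCH_TOTAL_TOKENS": str(max_batch_total_token),
            "MAX_INPUT_LENGTH": str(max_input_length),
            "MAX_TOTAL_TOKENS": str(max_tokens),
            "MODEL_ID": repository.lower(),
        },
        "url": "ghcr.io/huggingface/text-generation-inference:1.0.1",
    }
}
print(image)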
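
And a minimal sketch of the Gradio wiring pattern the diff applies: each advanced setting is assigned to a variable so it can appear in the inputs= list of the submit button's click handler, in the same order as the parameters of submit(). The layout and the submit stub here are illustrative, not the Space's actual code.

import gradio as gr

def submit(custom_kernel, max_input_length, max_tokens,
           max_batch_prefill_token, max_batch_total_token):
    # Stand-in for the real submit(): just echo the values it would receive.
    return (f"kernels={custom_kernel}, input_len={max_input_length}, "
            f"total={max_tokens}, prefill={max_batch_prefill_token}, "
            f"batch_total={max_batch_total_token}")

with gr.Blocks() as demo:
    custom_kernel = gr.Dropdown(value="Enabled", choices=["Enabled", "Disabled"],
                                interactive=True, show_label=False)
    max_input_length = gr.Number(value=1024, interactive=True, show_label=False)
    max_tokens = gr.Number(value=1512, interactive=True, show_label=False)
    max_batch_prefill_token = gr.Number(value=2048, interactive=True, show_label=False)
    max_batch_total_token = gr.Number(value=None, interactive=True, show_label=False)
    status_txt = gr.Textbox(interactive=False, show_label=False)
    submit_button = gr.Button("Create Endpoint")
    # The components must be listed in the same order as submit()'s parameters.
    submit_button.click(fn=submit,
                        inputs=[custom_kernel, max_input_length, max_tokens,
                                max_batch_prefill_token, max_batch_total_token],
                        outputs=status_txt)

if __name__ == "__main__":
    demo.launch()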