Update app.py
app.py CHANGED
@@ -123,7 +123,12 @@ def submit(
     compute_selector,
     min_node_selector,
     max_node_selector,
-    security_selector
+    security_selector,
+    custom_kernel,
+    max_input_length,
+    max_tokens,
+    max_batch_prefill_token,
+    max_batch_total_token
 ):
     compute_resources = compute_selector.split("·")
     accelerator = compute_resources[0][:3].strip()
@@ -148,7 +153,19 @@ def submit(
         "model": {
             "framework": framework_selector.lower(),
             "image": {
-                "
+                "custom": {
+                    "health_route": "/health",
+                    "env": {
+                        "DISABLE_CUSTOM_KERNELS": "false" if custom_kernel == "Enabled" else "true",
+                        "MAX_BATCH_PREFILL_TOKENS": str(max_batch_prefill_token),
+                        "MAX_BATCH_TOTAL_TOKENS": str(max_batch_total_token),
+                        "MAX_INPUT_LENGTH": str(max_input_length),
+                        "MAX_TOTAL_TOKENS": str(max_tokens),
+                        "MODEL_ID": repository_selector.lower(),
+                        # QUANTIZE: "bitsandbytes" | "gptq"
+                    },
+                    "url": "ghcr.io/huggingface/text-generation-inference:1.0.1",
+                }
             },
             "repository": repository_selector.lower(),
             "revision": "main",
@@ -322,7 +339,7 @@ Name for your new endpoint""")
                 gr.Markdown("""### Custom Cuda Kernels
 
 TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
-                gr.Dropdown(
+                custom_kernel = gr.Dropdown(
                     value="Enabled",
                     choices=["Enabled", "Disabled"],
                     interactive=True,
@@ -347,7 +364,7 @@ Name for your new endpoint""")
                 gr.Markdown("""### Max Input Length (per Query)
 
 Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
-                gr.Number(
+                max_input_length = gr.Number(
                     value=1024,
                     interactive=True,
                     show_label=False,
@@ -358,7 +375,7 @@ Name for your new endpoint""")
                 gr.Markdown("""### Max Number of Tokens (per Query)
 
 The larger this value, the more memory each request will consume and the less effective batching can be.""")
-                gr.Number(
+                max_tokens = gr.Number(
                     value=1512,
                     interactive=True,
                     show_label=False,
@@ -370,7 +387,7 @@ Name for your new endpoint""")
                 gr.Markdown("""### Max Batch Prefill Tokens
 
 Number of prefill tokens used during continuous batching. It can be useful to adjust this number since the prefill operation is memory-intensive and compute-bound.""")
-                gr.Number(
+                max_batch_prefill_token = gr.Number(
                     value=2048,
                     interactive=True,
                     show_label=False,
@@ -381,7 +398,7 @@ Name for your new endpoint""")
                 gr.Markdown("""### Max Batch Total Tokens
 
 Number of tokens that can be passed before forcing waiting queries to be put on the batch. A value of 1000 can fit 10 queries of 100 tokens or a single query of 1000 tokens.""")
-                gr.Number(
+                max_batch_total_token = gr.Number(
                     value=None,
                     interactive=True,
                     show_label=False,
@@ -416,7 +433,12 @@ Name for your new endpoint""")
                 compute_selector,
                 min_node_selector,
                 max_node_selector,
-                security_selector],
+                security_selector,
+                custom_kernel,
+                max_input_length,
+                max_tokens,
+                max_batch_prefill_token,
+                max_batch_total_token],
             outputs=status_txt)
 
         with gr.Tab("AWS", elem_classes=["no-border"]):
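
Below is a minimal, self-contained sketch of the custom-image payload that the updated submit() assembles, with hypothetical stand-in values for the new Gradio inputs (custom_kernel, max_input_length, max_tokens, max_batch_prefill_token, max_batch_total_token) and for repository_selector. The field names and the TGI image tag are taken from the diff above, not from an authoritative Inference Endpoints schema.

# Sketch only: hypothetical values standing in for the Gradio components.
custom_kernel = "Enabled"            # Custom Cuda Kernels dropdown
max_input_length = 1024              # Max Input Length (per Query)
max_tokens = 1512                    # Max Number of Tokens (per Query)
max_batch_prefill_token = 2048       # Max Batch Prefill Tokens
max_batch_total_token = 16000        # Max Batch Total Tokens (assumed concrete value)
repository = "tiiuae/falcon-7b-instruct"  # hypothetical repository_selector value

image = {
    "custom": {
        "health_route": "/health",
        "env": {
            # TGI reads these as environment variables at container start-up.
            "DISABLE_CUSTOM_KERNELS": "false" if custom_kernel == "Enabled" else "true",
            "MAX_BATCH_PREFILL_TOKENS": str(max_batch_prefill_token),
            "MAX_BATCH_TOTAL_TOKENS": str(max_batch_total_token),
            "MAX_INPUT_LENGTH": str(max_input_length),
            "MAX_TOTAL_TOKENS": str(max_tokens),
            "MODEL_ID": repository.lower(),
        },
        "url": "ghcr.io/huggingface/text-generation-inference:1.0.1",
    }
}
print(image)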
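
And a minimal sketch of the Gradio wiring pattern the diff applies: each advanced setting is assigned to a variable so it can appear in the inputs= list of the submit button's click handler, in the same order as the parameters of submit(). The layout and the submit stub here are illustrative, not the Space's actual code.

import gradio as gr

def submit(custom_kernel, max_input_length, max_tokens,
           max_batch_prefill_token, max_batch_total_token):
    # Stand-in for the real submit(): just echo the values it would receive.
    return (f"kernels={custom_kernel}, input_len={max_input_length}, "
            f"total={max_tokens}, prefill={max_batch_prefill_token}, "
            f"batch_total={max_batch_total_token}")

with gr.Blocks() as demo:
    custom_kernel = gr.Dropdown(value="Enabled", choices=["Enabled", "Disabled"],
                                interactive=True, show_label=False)
    max_input_length = gr.Number(value=1024, interactive=True, show_label=False)
    max_tokens = gr.Number(value=1512, interactive=True, show_label=False)
    max_batch_prefill_token = gr.Number(value=2048, interactive=True, show_label=False)
    max_batch_total_token = gr.Number(value=None, interactive=True, show_label=False)
    status_txt = gr.Textbox(interactive=False, show_label=False)
    submit_button = gr.Button("Create Endpoint")
    # The components must be listed in the same order as submit()'s parameters.
    submit_button.click(fn=submit,
                        inputs=[custom_kernel, max_input_length, max_tokens,
                                max_batch_prefill_token, max_batch_total_token],
                        outputs=status_txt)

if __name__ == "__main__":
    demo.launch()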