Rainbowdesign committed · Commit 6f2bef2 · verified · 1 Parent(s): e774293

Update app.py

Files changed (1):
  1. app.py  +112 -79

app.py CHANGED
@@ -1,5 +1,7 @@
 import gradio as gr
-from huggingface_hub import InferenceClient, HfApi
+from huggingface_hub import HfApi
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
 import json
 import os
 
@@ -150,12 +152,14 @@ def add_model_box(
 
 
 # -----------------------------
-# Helper: check model access
+# Helper: check model access (repo visibility)
 # -----------------------------
 def check_model_access(model_id, hf_token):
     """
     Try to get model info; return (ok: bool, message: str).
     This helps distinguish auth/gating vs other issues.
+    For local loading, this is not strictly required, but we keep
+    it to give clearer messages for private/gated models.
     """
     try:
         api = HfApi(token=hf_token.token if hf_token else None)
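The diff context above cuts off before check_model_access actually queries the Hub, so the exact body is not shown here. As a point of reference only, a minimal sketch of such a probe, assuming huggingface_hub's HfApi.model_info call and its GatedRepoError / RepositoryNotFoundError exceptions (the helper name below is hypothetical, not the commit's code), could look like this:

from huggingface_hub import HfApi
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError

def check_model_access_sketch(model_id, token=None):
    """Hypothetical sketch: return (ok, message) for a repo-visibility probe."""
    api = HfApi(token=token)
    try:
        api.model_info(model_id)  # raises if the repo is missing, private, or gated
        return True, "Model is accessible."
    except GatedRepoError:
        return False, "Model is gated; request access on the model page."
    except RepositoryNotFoundError:
        return False, "Model not found, or private and this token has no access."
    except Exception as e:
        return False, f"Unexpected error while checking access: {e}"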
@@ -169,7 +173,46 @@ def check_model_access(model_id, hf_token):
 
 
 # -----------------------------
-# Chat function (robust)
+# Local model cache
+# -----------------------------
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+
+LOCAL_MODEL_CACHE = {}
+
+def load_local_model(model_id):
+    """
+    Load a model + tokenizer locally and cache them.
+    This makes the Space behave like a dedicated model Space:
+    models are executed inside the container, not via Inference API.
+    """
+    if model_id in LOCAL_MODEL_CACHE:
+        debug(f"Using cached model: {model_id}")
+        return LOCAL_MODEL_CACHE[model_id]
+
+    debug(f"Loading model locally: {model_id}")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+    except Exception as e:
+        debug(f"ERROR loading tokenizer for {model_id}: {e}")
+        raise
+
+    try:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device_map="auto"
+        )
+    except Exception as e:
+        debug(f"ERROR loading model weights for {model_id}: {e}")
+        raise
+
+    LOCAL_MODEL_CACHE[model_id] = (tokenizer, model)
+    return tokenizer, model
+
+
+# -----------------------------
+# Chat function (local models)
 # -----------------------------
 def respond(
     message,
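LOCAL_MODEL_CACHE in the hunk above is a plain module-level dict keyed by model id, so each checkpoint is loaded at most once per running Space process; note that the commit's device_map="auto" additionally requires the accelerate package at runtime. Below is a minimal standalone sketch of the same caching pattern, assuming transformers and torch are installed, omitting device_map for simplicity, and using "sshleifer/tiny-gpt2" purely as an illustrative model id:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

_CACHE = {}  # same idea as LOCAL_MODEL_CACHE above

def get_cached_model(model_id):
    """Load tokenizer + model once per process and reuse them afterwards."""
    if model_id not in _CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        )
        _CACHE[model_id] = (tokenizer, model)
    return _CACHE[model_id]

tok_a, model_a = get_cached_model("sshleifer/tiny-gpt2")
tok_b, model_b = get_cached_model("sshleifer/tiny-gpt2")
assert model_a is model_b  # the second call reuses the cached instance, no reload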
@@ -197,19 +240,13 @@ def respond(
     meta, _, _, _ = meta_tuple
     model_id = meta["id"]
 
-    debug(f"Chat using model: {model_id}")
+    debug(f"Chat using local model: {model_id}")
 
-    # Check token presence
-    if hf_token is None or hf_token.token is None:
-        debug("No HF token available from Login.")
-        yield "No Hugging Face token detected. Please click Login in the sidebar and try again."
-        return
-
-    # Check access to the model
+    # Optional: check repo access (for private/gated models)
     ok, msg = check_model_access(model_id, hf_token)
     if not ok:
         yield (
-            f"Could not access model `{model_id}`.\n\n"
+            f"Could not access model `{model_id}` on Hugging Face.\n\n"
             f"This is usually because:\n"
             f"- The repo is private or gated and this token has no access\n"
             f"- Or the token is invalid/expired\n\n"
@@ -218,68 +255,63 @@ def respond(
         )
         return
 
-    # Build messages
-    messages = [{"role": "system", "content": system_message}]
-    messages.extend(history or [])
-    messages.append({"role": "user", "content": message})
+    # Load model locally
+    try:
+        tokenizer, model = load_local_model(model_id)
+    except Exception:
+        import traceback
+        tb = traceback.format_exc()
+        debug(f"ERROR in load_local_model for {model_id}:\n{tb}")
+        yield (
+            f"Failed to load model `{model_id}` locally inside the Space.\n"
+            f"Check the Debug Log for details (likely out of memory or missing files)."
+        )
+        return
+
+    # Build chat-style prompt from history + current message
+    prompt = system_message.strip() + "\n\n"
+    for turn in history or []:
+        role = turn.get("role", "user")
+        content = turn.get("content", "")
+        if role == "user":
+            prompt += f"User: {content}\n"
+        else:
+            prompt += f"Assistant: {content}\n"
+    prompt += f"User: {message}\nAssistant:"
+
+    debug(f"Prompt length (chars): {len(prompt)}")
 
     try:
-        client = InferenceClient(token=hf_token.token, model=model_id)
-
-        # Try chat_completion first
-        response = ""
-        try:
-            for msg_obj in client.chat_completion(
-                messages,
-                max_tokens=max_tokens,
-                stream=True,
-                temperature=temperature,
-                top_p=top_p,
-            ):
-                # Defensive handling of streaming structure
-                choice_list = getattr(msg_obj, "choices", [])
-                if not choice_list:
-                    continue
-                delta = getattr(choice_list[0], "delta", None)
-                if delta is None:
-                    continue
-                chunk = getattr(delta, "content", "") or ""
-                response += chunk
-                yield response
-
-            # If we streamed something, we're done
-            if response:
-                return
-
-        except Exception:
-            # Fall back to text_generation if chat_completion fails or model isn't chat-style
-            import traceback
-            tb = traceback.format_exc()
-            debug(f"chat_completion failed, falling back to text_generation:\n{tb}")
-
-            prompt = (
-                system_message
-                + "\n\n"
-                + "\n".join([f"User: {h['content']}" if h["role"] == "user" else f"Assistant: {h['content']}" for h in (history or [])])
-                + f"\nUser: {message}\nAssistant:"
-            )
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+        # Generate text locally
+        output_ids = model.generate(
+            **inputs,
+            max_new_tokens=int(max_tokens),
+            do_sample=True,
+            temperature=float(temperature),
+            top_p=float(top_p),
+            pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else None,
+        )
 
-            text = client.text_generation(
-                prompt,
-                max_new_tokens=max_tokens,
-                temperature=temperature,
-                top_p=top_p,
-                stream=False,
-            )
-            yield text
+        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+        # Extract only the assistant's final answer
+        if "Assistant:" in output_text:
+            answer = output_text.split("Assistant:")[-1].strip()
+        else:
+            answer = output_text.strip()
+
+        yield answer
 
     except Exception:
         import traceback
         tb = traceback.format_exc()
-        debug(f"ERROR in respond:\n{tb}")
+        debug(f"ERROR during local generation for {model_id}:\n{tb}")
         yield (
-            "An unexpected error occurred while talking to the model.\n"
-            "Please check the Debug Log for more details."
+            "An error occurred during local text generation.\n"
+            "This is often due to running out of memory for large models.\n"
+            "Try a smaller model, fewer max tokens, or check the Debug Log."
        )
 
 
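The rewritten respond() builds a hand-rolled "User:/Assistant:" prompt and yields the full answer once, so the token-by-token streaming of the old chat_completion path is gone. If streaming were wanted with local models, transformers offers TextIteratorStreamer, and chat-tuned checkpoints usually ship a chat template. The sketch below shows that alternative under those assumptions; it is not what this commit does, and "Qwen/Qwen2.5-0.5B-Instruct" is only an illustrative model id:

from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # illustrative model id only
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
# Use the model's own chat template instead of a hand-rolled User:/Assistant: prompt
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate in a background thread and stream decoded text pieces as they arrive
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
thread = Thread(target=model.generate, kwargs=dict(**inputs, max_new_tokens=64, streamer=streamer))
thread.start()

partial = ""
for chunk in streamer:
    partial += chunk  # a Gradio handler would `yield partial` here for live updates
thread.join()
print(partial)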
 
@@ -339,21 +371,21 @@ def build_model_tree(
 
         # Model accordion
         with gr.Accordion(f"{emoji} {model_name}", open=False):
-            info_text = (
-                f"**Model ID:** `{meta['id']}` \n"
-                f"**Description:** {meta['description']} \n"
-                f"[Model card]({meta['link']})"
-            )
-            gr.Markdown(info_text)
+            info_text = (
+                f"**Model ID:** `{meta['id']}` \n"
+                f"**Description:** {meta['description']} \n"
+                f"[Model card]({meta['link']})"
+            )
+            gr.Markdown(info_text)
 
-            use_btn = gr.Button("Use this model", size="sm")
+            use_btn = gr.Button("Use this model", size="sm")
 
-            # Wire button -> use_model
-            use_btn.click(
-                use_model,
-                inputs=[gr.State(full_key), active_model_state],
-                outputs=[active_model_state, current_model_label],
-            )
+            # Wire button -> use_model
+            use_btn.click(
+                use_model,
+                inputs=[gr.State(full_key), active_model_state],
+                outputs=[active_model_state, current_model_label],
+            )
 
 
 # -----------------------------
@@ -417,6 +449,7 @@ with gr.Blocks() as demo:
         )
 
         # Debug Log box (separate accordion)
+        with gr.Accordion("Debug Log", open=False):
         debug_log = gr.Textbox(
             label="System Debug Output",
             value="",
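In Gradio Blocks, a component is attached to a container only when its constructor runs inside that container's with block, so the accordion added in the last hunk wraps the textbox only if debug_log is created (indented) under it. A minimal standalone sketch of that pattern, with names chosen to mirror the commit but otherwise hypothetical:

import gradio as gr

with gr.Blocks() as demo:
    # The textbox is constructed inside the accordion's `with` block,
    # so it renders (and collapses) inside the "Debug Log" accordion.
    with gr.Accordion("Debug Log", open=False):
        debug_log = gr.Textbox(label="System Debug Output", value="", lines=10)

demo.launch()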