eot-visualization

Running on Zero

App Files Files Community

theomonnom commited on Sep 16

Commit

8d2d3a6

verified ·

1 Parent(s): 7c74905

Update app.py

Browse files

Files changed (1) hide show

app.py +252 -150

app.py CHANGED Viewed

@@ -1,47 +1,26 @@
-import random
 import numpy as np
 import pandas as pd
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
-import unicodedata
-import re
 import gradio as gr
-from pprint import pprint
-MODEL_ID = "livekit/turn-detector"
-REVISION_ID = "v0.3.0-intl"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, revision=REVISION_ID)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    revision=REVISION_ID,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-)
-model.eval()
-EN_THRESHOLD = 0.0049
-START_TOKEN_ID = tokenizer.convert_tokens_to_ids('<|im_start|>')
-EOU_TOKEN_ID = tokenizer.convert_tokens_to_ids("<|im_end|>")
-NEWLINE_TOKEN_ID = tokenizer.convert_tokens_to_ids('\n')
-USER_TOKEN_IDS = (
-    tokenizer.convert_tokens_to_ids('user'),
-    tokenizer.convert_tokens_to_ids('<|user|>')
-)
-SPECIAL_TOKENS = set([
-    NEWLINE_TOKEN_ID,
-    START_TOKEN_ID,
-    tokenizer.convert_tokens_to_ids('user'),
-    tokenizer.convert_tokens_to_ids('assistant'),
-])
-CONTROL_TOKS = ['<|im_start|>', '<|im_end|>', 'user', 'assistant', '\n']
-def normalize_text(text):
     text = unicodedata.normalize("NFKC", text.lower())
     text = ''.join(
         ch for ch in text
@@ -51,145 +30,268 @@ def normalize_text(text):
     return text
-def format_input(text):
-    if '<|im_start|>' not in text:
-        # assume single user turn
-        text = {'role': 'user', 'content': normalize_text(text)}
-        text = tokenizer.apply_chat_template(
-            [text],
-            tokenize=False,
-            add_generation_prompt=True
-        )
-    return text
-def log_odds(p, eps=0):
-    return np.log(p /(1 - p + eps))
-def make_pred_mask(input_ids):
-    user_mask = [False] * len(input_ids)
-    is_user_role = False
-    for i in range(len(input_ids)):
-        tok = input_ids[i]
-        if tok == START_TOKEN_ID and i + 1 < len(input_ids):
-            is_user_role = input_ids[i + 1] in USER_TOKEN_IDS
-            user_mask[i] = False
-            continue
-        user_mask[i] = is_user_role and (tok not in SPECIAL_TOKENS)
-    return user_mask
-def predict_eou(text):
-    text = format_input(text)
-    with torch.no_grad():
-        with torch.amp.autocast(model.device.type):
-            inputs = tokenizer.encode(
                 text,
                 add_special_tokens=False,
                 return_tensors="pt"
-            ).to(model.device)
-            outputs = model(inputs)
-    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
-    probs = probs.cpu().float().numpy()[:, :, EOU_TOKEN_ID].flatten()
-    input_ids = inputs.cpu().int().flatten().numpy()
-    mask = np.array(make_pred_mask(input_ids))
-    probs[~mask] = np.nan
-    tokens = [tokenizer.decode(id) for id in input_ids]
-    res = {'token':tokens,'pred':probs}
-    return pd.DataFrame(res)
-def make_styled_df(df, thresh=EN_THRESHOLD, cmap="coolwarm"):
-    EPS = 1e-12
-    df = df.copy()
-    df = df[~df.token.isin(CONTROL_TOKS)]
-    df.token = df.token.replace({"\n": "⏎"," ": "␠",})
-    df['log_odds'] = (
-        df.pred.fillna(thresh)
-        .add(EPS)
-        .apply(log_odds).sub(log_odds(thresh))
-        .mask(df.pred.isna())
-    )
-    df['Prob(EoT) as %'] = df.pred.mul(100).fillna(0).astype(int)
-    vmin, vmax = df.log_odds.min(), df.log_odds.max()
-    vmax_abs = max(abs(vmin), abs(vmax)) * 1.5
-    fmt = (
-        df.drop(columns=['pred'])
-        .style
-        .bar(
-            subset=['log_odds'],
-            align="zero",
-            vmin=-vmax_abs,
-            vmax=vmax_abs,
-            cmap=cmap,
-            height=70,
-            width=100,
         )
-        .text_gradient(subset=['log_odds'], cmap=cmap, vmin=-vmax_abs, vmax=vmax_abs)
-        .format(na_rep='', precision=1, subset=['log_odds'])
-        .format("{:3d}", subset=['Prob(EoT) as %'])
-        .hide(axis='index')
-    )
-    return fmt.to_html()
-def generate_highlighted_text(text, threshold=EN_THRESHOLD):
-    eps = 1e-12
-    if not text:
-        return []
-    df = predict_eou(text)
-    df.token = df.token.replace({"user": "\nUSER:", "assistant": "\nAGENT:"})
-    df = df[~df.token.isin(CONTROL_TOKS)]
-    df['score'] = (
-        df.pred.fillna(threshold)
-        .add(eps)
-        .apply(log_odds).sub(log_odds(threshold))
-        .mask(df.pred.isna() | df.pred.round(2) == 0)
-    )
-    max_abs_score = df['score'].abs().max() * 1.5
-    if max_abs_score > 0:
-        df.score = df.score / max_abs_score
-    styled_df = make_styled_df(df[['token', 'pred']])
-    return list(zip(df.token, df.score)), styled_df
-convo_text = """<|im_start|>assistant
 what is your phone number<|im_end|>
 <|im_start|>user
 555 410 0423<|im_end|>"""
-demo = gr.Interface(
-    fn=generate_highlighted_text,
-    theme="soft",
-    inputs=gr.Textbox(
-        label="Input Text",
-        info="If <|im_start|> is present it will treat input as formatted convo. if not it will format it as convo with 1 user message.",
-        # value="can you help me order some pizza",
-        value=convo_text,
-        lines=2
-    ),
-    outputs=[
-        gr.HighlightedText(
-            label="EoT Predictions",
-            color_map="coolwarm",
-            scale=1.5,
-        ),
-        gr.HTML(label="Raw scores",)
-    ],
-    title="Turn Detector Debugger",
-    description="Visualize predicted turn endings. The coloring is based on log-odds, centered on the threshold.\n Red means agent should reply; Blue means agent should wait",
-    allow_flagging="never"
-)
 demo.launch()

+import os
+import re
+import unicodedata
+from functools import lru_cache
 import numpy as np
 import pandas as pd
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import gradio as gr
+# ===== Defaults (you can change from the UI too) =====
+DEFAULT_MODEL_A_ID = "livekit/turn-detector"
+DEFAULT_MODEL_A_REV = "v0.3.0-intl"
+DEFAULT_MODEL_B_ID = "livekit/eou-experiment"
+DEFAULT_MODEL_B_REV = "main"   # adjust if there's a specific revision
+# ===== Utilities =====
+def normalize_text(text: str) -> str:
     text = unicodedata.normalize("NFKC", text.lower())
     text = ''.join(
         ch for ch in text
     return text
+def log_odds(p, eps=0.0):
+    return np.log(p / (1 - p + eps))
+# ===== Per-model runner (keeps tokenizer/model and token ids) =====
+class ModelRunner:
+    def __init__(self, model_id: str, revision: str | None = None, dtype=torch.bfloat16):
+        self.model_id = model_id
+        self.revision = revision
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            revision=revision,
+            torch_dtype=dtype,
+            device_map="auto",
+        )
+        self.model.eval()
+        # Pull commonly used tokens, falling back gracefully if not present
+        self.START_TOKEN_ID = self._tok_id("<|im_start|>")
+        self.EOU_TOKEN_ID   = self._tok_id("<|im_end|>")
+        self.NEWLINE_TOKEN_ID = self._tok_id("\n")
+        # Common role tokens; include both legacy and chat-template variants
+        self.USER_TOKEN_IDS = tuple(
+            tid for tid in [
+                self._tok_id("user"),
+                self._tok_id("<|user|>")
+            ] if tid is not None
+        )
+        # Tokens we do not want to score on (specials / scaffolding)
+        self.SPECIAL_TOKENS = set(
+            tid for tid in [
+                self.NEWLINE_TOKEN_ID,
+                self.START_TOKEN_ID,
+                self._tok_id("user"),
+                self._tok_id("assistant"),
+            ] if tid is not None
+        )
+        # For filtering in display
+        self.CONTROL_TOKS = set([
+            "<|im_start|>", "<|im_end|>", "user", "assistant", "\n"
+        ])
+    def _tok_id(self, tok: str) -> int | None:
+        tid = self.tokenizer.convert_tokens_to_ids(tok)
+        # convert_tokens_to_ids returns None or 0/-1 if unknown depending on tokenizer
+        if tid is None or tid < 0:
+            return None
+        return tid
+    def format_input(self, text: str) -> str:
+        # If not a chat-formatted string, wrap as a single user message via chat template
+        if "<|im_start|>" not in text:
+            msg = {"role": "user", "content": normalize_text(text)}
+            text = self.tokenizer.apply_chat_template(
+                [msg],
+                tokenize=False,
+                add_generation_prompt=True
+            )
+        return text
+    def make_pred_mask(self, input_ids: np.ndarray) -> np.ndarray:
+        """Return boolean mask: True where we should compute EoT prob (user tokens only)."""
+        if self.START_TOKEN_ID is None or not self.USER_TOKEN_IDS:
+            # Fallback: score all non-special tokens if start/user not available
+            return np.array([tok not in self.SPECIAL_TOKENS for tok in input_ids], dtype=bool)
+        user_mask = [False] * len(input_ids)
+        is_user_role = False
+        for i in range(len(input_ids)):
+            tok = input_ids[i]
+            if (self.START_TOKEN_ID is not None) and (tok == self.START_TOKEN_ID) and i + 1 < len(input_ids):
+                is_user_role = input_ids[i + 1] in self.USER_TOKEN_IDS
+                user_mask[i] = False
+                continue
+            user_mask[i] = is_user_role and (tok not in self.SPECIAL_TOKENS)
+        return np.array(user_mask, dtype=bool)
+    @torch.no_grad()
+    def predict_eou(self, text: str) -> pd.DataFrame:
+        text = self.format_input(text)
+        with torch.amp.autocast(self.model.device.type):
+            inputs = self.tokenizer.encode(
                 text,
                 add_special_tokens=False,
                 return_tensors="pt"
+            ).to(self.model.device)
+            outputs = self.model(inputs)
+        # probs over vocab for each position; then take the probability of EOU token
+        logits = outputs.logits
+        probs = torch.nn.functional.softmax(logits, dim=-1)
+        if self.EOU_TOKEN_ID is None:
+            # If the model/tokenizer doesn't have <|im_end|>, use newline as a proxy (last resort)
+            fallback_id = self.NEWLINE_TOKEN_ID if self.NEWLINE_TOKEN_ID is not None else 0
+            eou_probs = probs[..., fallback_id]
+        else:
+            eou_probs = probs[..., self.EOU_TOKEN_ID]
+        eou_probs = eou_probs.squeeze(0).float().cpu().numpy()
+        input_ids = inputs.squeeze(0).int().cpu().numpy()
+        mask = self.make_pred_mask(input_ids)
+        # set masked positions to NaN (not scored)
+        eou_probs_masked = eou_probs.copy()
+        eou_probs_masked[~mask] = np.nan
+        tokens = [self.tokenizer.decode(i) for i in input_ids]
+        return pd.DataFrame({"token": tokens, "pred": eou_probs_masked})
+    def make_styled_df(self, df: pd.DataFrame, thresh: float, cmap="coolwarm") -> str:
+        EPS = 1e-12
+        _df = df.copy()
+        _df = _df[~_df.token.isin(self.CONTROL_TOKS)]
+        _df.token = _df.token.replace({"\n": "⏎", " ": "␠"})
+        _df["log_odds"] = (
+            _df.pred.fillna(thresh)
+            .add(EPS)
+            .apply(log_odds).sub(log_odds(thresh))
+            .mask(_df.pred.isna())
         )
+        _df["Prob(EoT) as %"] = _df.pred.mul(100).fillna(0).astype(int)
+        vmin, vmax = _df.log_odds.min(), _df.log_odds.max()
+        vmax_abs = max(abs(vmin), abs(vmax)) * 1.5 if pd.notna(vmin) and pd.notna(vmax) else 1.0
+        fmt = (
+            _df.drop(columns=["pred"])
+            .style
+            .bar(
+                subset=["log_odds"],
+                align="zero",
+                vmin=-vmax_abs,
+                vmax=vmax_abs,
+                cmap=cmap,
+                height=70,
+                width=100,
+            )
+            .text_gradient(subset=["log_odds"], cmap=cmap, vmin=-vmax_abs, vmax=vmax_abs)
+            .format(na_rep="", precision=1, subset=["log_odds"])
+            .format("{:3d}", subset=["Prob(EoT) as %"])
+            .hide(axis="index")
+        )
+        return fmt.to_html()
+    def generate_highlighted_text(self, text: str, threshold: float):
+        """Returns: (highlighted_list, styled_html) for Gradio"""
+        eps = 1e-12
+        if not text:
+            return [], "<div>No input.</div>"
+        df = self.predict_eou(text)
+        df.token = df.token.replace({"user": "\nUSER:", "assistant": "\nAGENT:"})
+        df = df[~df.token.isin(self.CONTROL_TOKS)]
+        df["score"] = (
+            df.pred.fillna(threshold)
+            .add(eps)
+            .apply(log_odds).sub(log_odds(threshold))
+            .mask(df.pred.isna() | df.pred.round(2).eq(0))
+        )
+        max_abs_score = df["score"].abs().max()
+        if pd.notna(max_abs_score) and max_abs_score > 0:
+            df.score = df.score / (max_abs_score * 1.5)
+        styled_df = self.make_styled_df(df[["token", "pred"]], thresh=threshold)
+        return list(zip(df.token, df.score)), styled_df
+# ===== Cached loaders so switching models in the UI is fast =====
+@lru_cache(maxsize=4)
+def get_runner(model_id: str, revision: str | None):
+    return ModelRunner(model_id, revision)
+# ===== Gradio App =====
+EN_THRESHOLD = 0.0049
+def compare_models(
+    text: str,
+    model_a_id: str,
+    model_a_rev: str,
+    thresh_a: float,
+    model_b_id: str,
+    model_b_rev: str,
+    thresh_b: float,
+):
+    runner_a = get_runner(model_a_id, model_a_rev if model_a_rev else None)
+    runner_b = get_runner(model_b_id, model_b_rev if model_b_rev else None)
+    ht_a, html_a = runner_a.generate_highlighted_text(text, threshold=thresh_a)
+    ht_b, html_b = runner_b.generate_highlighted_text(text, threshold=thresh_b)
+    # Optional: prepend small headers indicating model names in the HTML blocks
+    html_a = f"<h4 style='margin:0 0 8px 0'>{model_a_id}@{model_a_rev or 'default'}</h4>" + html_a
+    html_b = f"<h4 style='margin:0 0 8px 0'>{model_b_id}@{model_b_rev or 'default'}</h4>" + html_b
+    return ht_a, html_a, ht_b, html_b
+EXAMPLE_CONVO = """<|im_start|>assistant
 what is your phone number<|im_end|>
 <|im_start|>user
 555 410 0423<|im_end|>"""
+with gr.Blocks(theme="soft", title="Turn Detector Debugger — Side by Side") as demo:
+    gr.Markdown(
+        """# Turn Detector Debugger — Side by Side
+Visualize predicted turn endings from **two models**.
+Red ⇒ agent should reply • Blue ⇒ agent should wait"""
+    )
+    with gr.Row():
+        text_in = gr.Textbox(
+            label="Input Text",
+            info="If <|im_start|> is present, input is treated as chat-formatted; otherwise it's wrapped as a single user turn.",
+            value=EXAMPLE_CONVO,
+            lines=4,
+        )
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### Model A")
+            model_a_id = gr.Textbox(value=DEFAULT_MODEL_A_ID, label="Model ID")
+            model_a_rev = gr.Textbox(value=DEFAULT_MODEL_A_REV, label="Revision (optional)")
+            thresh_a = gr.Slider(0.0001, 0.05, value=EN_THRESHOLD, step=0.0001, label="Threshold")
+        with gr.Column():
+            gr.Markdown("### Model B")
+            model_b_id = gr.Textbox(value=DEFAULT_MODEL_B_ID, label="Model ID")
+            model_b_rev = gr.Textbox(value=DEFAULT_MODEL_B_REV, label="Revision (optional)")
+            thresh_b = gr.Slider(0.0001, 0.05, value=EN_THRESHOLD, step=0.0001, label="Threshold")
+    run_btn = gr.Button("Run Comparison", variant="primary")
+    with gr.Row():
+        with gr.Column():
+            out_ht_a = gr.HighlightedText(
+                label="EoT Predictions (Model A)",
+                color_map="coolwarm",
+                scale=1.5,
+            )
+            out_html_a = gr.HTML(label="Raw scores (Model A)")
+        with gr.Column():
+            out_ht_b = gr.HighlightedText(
+                label="EoT Predictions (Model B)",
+                color_map="coolwarm",
+                scale=1.5,
+            )
+            out_html_b = gr.HTML(label="Raw scores (Model B)")
+    run_btn.click(
+        fn=compare_models,
+        inputs=[text_in, model_a_id, model_a_rev, thresh_a, model_b_id, model_b_rev, thresh_b],
+        outputs=[out_ht_a, out_html_a, out_ht_b, out_html_b]
+    )
 demo.launch()