zhimin-z
committed
Commit · f51e3c7
1 Parent(s): 1bae49b
add
README.md
CHANGED
|
@@ -28,6 +28,8 @@ If an assistant can consistently resolve issues and discussions across different
|
|
| 28 |
Key metrics from the last 180 days:
|
| 29 |
|
| 30 |
**Leaderboard Table**
|
|
|
|
|
|
|
| 31 |
- **Issue Resolved Rate (%)**: Percentage of closed issues successfully resolved
|
| 32 |
- **Discussion Resolved Rate (%)**: Percentage of discussions successfully resolved (answered or closed)
|
| 33 |
- **Total Issues**: Issues the assistant has been involved with (authored, assigned, or commented on)
|
|
@@ -51,7 +53,7 @@ We focus on 180 days to highlight current capabilities and active assistants.
|
|
| 51 |
**Data Collection**
|
| 52 |
We mine GitHub activity from [GHArchive](https://www.gharchive.org/), tracking three types of activities:
|
| 53 |
|
| 54 |
-
1. **Agent-Assigned Issues**:
|
| 55 |
- Issues opened or assigned to the assistant (`IssuesEvent`)
|
| 56 |
- Issue comments by the assistant (`IssueCommentEvent`)
|
| 57 |
|
|
@@ -78,7 +80,7 @@ Anyone can submit an assistant. We store metadata in `SWE-Arena/bot_metadata` an
|
|
| 78 |
- Searchable table (by assistant name or website)
|
| 79 |
- Filterable columns (by issue resolved rate, discussion resolved rate)
|
| 80 |
- Monthly charts (issue and discussion resolution trends and activity)
|
| 81 |
-
- View agent-assigned metrics, wanted issue resolutions, and discussion metrics
|
| 82 |
|
| 83 |
**Issues Wanted Tab**:
|
| 84 |
- Browse long-standing open issues (30+ days) from major open-source projects
|
|
|
|
| 28 |
Key metrics from the last 180 days:
|
| 29 |
|
| 30 |
**Leaderboard Table**
|
| 31 |
+
- **Assistant**: Display name of the assistant
|
| 32 |
+
- **Website**: Link to the assistant's homepage or documentation
|
| 33 |
- **Issue Resolved Rate (%)**: Percentage of closed issues successfully resolved
|
| 34 |
- **Discussion Resolved Rate (%)**: Percentage of discussions successfully resolved (answered or closed)
|
| 35 |
- **Total Issues**: Issues the assistant has been involved with (authored, assigned, or commented on)
|
|
|
|
| 53 |
**Data Collection**
|
| 54 |
We mine GitHub activity from [GHArchive](https://www.gharchive.org/), tracking three types of activities:
|
| 55 |
|
| 56 |
+
1. **Assistant-Assigned Issues**:
|
| 57 |
- Issues opened or assigned to the assistant (`IssuesEvent`)
|
| 58 |
- Issue comments by the assistant (`IssueCommentEvent`)
|
| 59 |
|
|
|
|
| 80 |
- Searchable table (by assistant name or website)
|
| 81 |
- Filterable columns (by issue resolved rate, discussion resolved rate)
|
| 82 |
- Monthly charts (issue and discussion resolution trends and activity)
|
| 83 |
+
- View assistant-assigned metrics, wanted issue resolutions, and discussion metrics
|
| 84 |
|
| 85 |
**Issues Wanted Tab**:
|
| 86 |
- Browse long-standing open issues (30+ days) from major open-source projects
|
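To make the README's metric definitions concrete, here is a minimal sketch of how the two resolved rates could be computed from mined metadata; the field names (`closed_at`, `state_reason`, `answered`) are assumptions for illustration, not the repo's confirmed schema.

```python
# Minimal sketch of the README's two rate metrics (field names are assumed).
def issue_resolved_rate(issues):
    """Percentage of closed issues that were successfully resolved."""
    closed = [i for i in issues if i.get("closed_at")]
    if not closed:
        return 0.0
    resolved = [i for i in closed if i.get("state_reason") == "completed"]
    return 100.0 * len(resolved) / len(closed)

def discussion_resolved_rate(discussions):
    """Percentage of discussions that were answered or closed."""
    if not discussions:
        return 0.0
    resolved = [d for d in discussions if d.get("answered") or d.get("closed_at")]
    return 100.0 * len(resolved) / len(discussions)
```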
app.py
CHANGED
|
@@ -24,7 +24,7 @@ load_dotenv()
|
|
| 24 |
# CONFIGURATION
|
| 25 |
# =============================================================================
|
| 26 |
|
| 27 |
-
AGENTS_REPO = "SWE-Arena/bot_metadata" # HuggingFace dataset for agent metadata
|
| 28 |
AGENTS_REPO_LOCAL_PATH = os.path.expanduser("~/bot_metadata") # Local git clone path
|
| 29 |
LEADERBOARD_FILENAME = f"{os.getenv('COMPOSE_PROJECT_NAME')}.json"
|
| 30 |
LEADERBOARD_REPO = "SWE-Arena/leaderboard_data" # HuggingFace dataset for leaderboard data
|
|
@@ -33,7 +33,7 @@ GIT_SYNC_TIMEOUT = 300 # 5 minutes timeout for git pull
|
|
| 33 |
MAX_RETRIES = 5
|
| 34 |
|
| 35 |
LEADERBOARD_COLUMNS = [
|
| 36 |
-
("
|
| 37 |
("Website", "string"),
|
| 38 |
("Total Issues", "number"),
|
| 39 |
("Total Discussions", "number"),
|
|
@@ -162,14 +162,14 @@ def sync_agents_repo():
|
|
| 162 |
|
| 163 |
def load_agents_from_hf():
|
| 164 |
"""
|
| 165 |
-
Load all agent metadata JSON files from local git repository.
|
| 166 |
ALWAYS syncs with remote first to ensure we have the latest bot data.
|
| 167 |
"""
|
| 168 |
# MANDATORY: Sync with remote first to get latest bot data
|
| 169 |
-
print(f" Syncing bot_metadata repository to get latest
|
| 170 |
sync_agents_repo() # Will raise exception if sync fails
|
| 171 |
|
| 172 |
-
agents = []
|
| 173 |
|
| 174 |
# Scan local directory for JSON files
|
| 175 |
if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
|
|
@@ -177,7 +177,7 @@ def load_agents_from_hf():
|
|
| 177 |
|
| 178 |
# Walk through the directory to find all JSON files
|
| 179 |
files_processed = 0
|
| 180 |
-
print(f" Loading
|
| 181 |
|
| 182 |
for root, dirs, files in os.walk(AGENTS_REPO_LOCAL_PATH):
|
| 183 |
# Skip .git directory
|
|
@@ -195,7 +195,7 @@ def load_agents_from_hf():
|
|
| 195 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 196 |
agent_data = json.load(f)
|
| 197 |
|
| 198 |
-
# Only include active agents
|
| 199 |
if agent_data.get('status') != 'active':
|
| 200 |
continue
|
| 201 |
|
|
@@ -203,14 +203,14 @@ def load_agents_from_hf():
|
|
| 203 |
github_identifier = filename.replace('.json', '')
|
| 204 |
agent_data['github_identifier'] = github_identifier
|
| 205 |
|
| 206 |
-
agents.append(agent_data)
|
| 207 |
|
| 208 |
except Exception as e:
|
| 209 |
print(f" Warning Error loading {filename}: {str(e)}")
|
| 210 |
continue
|
| 211 |
|
| 212 |
-
print(f" Success Loaded {len(
|
| 213 |
-
return agents
|
| 214 |
|
| 215 |
|
| 216 |
def get_hf_token():
|
|
@@ -265,7 +265,7 @@ def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, to
|
|
| 265 |
|
| 266 |
|
| 267 |
def save_agent_to_hf(data):
|
| 268 |
-
"""Save a new
|
| 269 |
try:
|
| 270 |
api = HfApi()
|
| 271 |
token = get_hf_token()
|
|
@@ -290,7 +290,7 @@ def save_agent_to_hf(data):
|
|
| 290 |
repo_type="dataset",
|
| 291 |
token=token
|
| 292 |
)
|
| 293 |
-
print(f"Saved
|
| 294 |
return True
|
| 295 |
finally:
|
| 296 |
# Always clean up local file, even if upload fails
|
|
@@ -298,7 +298,7 @@ def save_agent_to_hf(data):
|
|
| 298 |
os.remove(filename)
|
| 299 |
|
| 300 |
except Exception as e:
|
| 301 |
-
print(f"Error saving
|
| 302 |
return False
|
| 303 |
|
| 304 |
|
|
@@ -345,10 +345,10 @@ def create_monthly_metrics_plot(top_n=5):
|
|
| 345 |
- Left y-axis: Resolved Rate (%) as line curves
|
| 346 |
- Right y-axis: Total Issues created as bar charts
|
| 347 |
|
| 348 |
-
Each agent gets a unique color for both their line and bars.
|
| 349 |
|
| 350 |
Args:
|
| 351 |
-
top_n: Number of top agents to show (default: 5)
|
| 352 |
"""
|
| 353 |
# Load from saved dataset
|
| 354 |
saved_data = load_leaderboard_data_from_hf()
|
|
@@ -373,10 +373,10 @@ def create_monthly_metrics_plot(top_n=5):
|
|
| 373 |
print(f"Loaded monthly metrics from saved dataset")
|
| 374 |
|
| 375 |
# Apply top_n filter if specified
|
| 376 |
-
if top_n is not None and top_n > 0 and metrics.get('agents'):
|
| 377 |
-
# Calculate total issues for each agent
|
| 378 |
agent_totals = []
|
| 379 |
-
for agent_name in metrics['agents']:
|
| 380 |
agent_data = metrics['data'].get(agent_name, {})
|
| 381 |
total_issues = sum(agent_data.get('total_issues', []))
|
| 382 |
agent_totals.append((agent_name, total_issues))
|
|
@@ -385,14 +385,14 @@ def create_monthly_metrics_plot(top_n=5):
|
|
| 385 |
agent_totals.sort(key=lambda x: x[1], reverse=True)
|
| 386 |
top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
|
| 387 |
|
| 388 |
-
# Filter metrics to only include top agents
|
| 389 |
metrics = {
|
| 390 |
-
'agents': top_agents,
|
| 391 |
'months': metrics['months'],
|
| 392 |
-
'data': {agent: metrics['data'][agent] for agent in top_agents if agent in metrics['data']}
|
| 393 |
}
|
| 394 |
|
| 395 |
-
if not metrics['agents'] or not metrics['months']:
|
| 396 |
# Return an empty figure with a message
|
| 397 |
fig = go.Figure()
|
| 398 |
fig.add_annotation(
|
|
@@ -411,7 +411,7 @@ def create_monthly_metrics_plot(top_n=5):
|
|
| 411 |
# Create figure with secondary y-axis
|
| 412 |
fig = make_subplots(specs=[[{"secondary_y": True}]])
|
| 413 |
|
| 414 |
-
# Generate unique colors for many agents using HSL color space
|
| 415 |
def generate_color(index, total):
|
| 416 |
"""Generate distinct colors using HSL color space for better distribution"""
|
| 417 |
hue = (index * 360 / total) % 360
|
|
@@ -419,15 +419,15 @@ def create_monthly_metrics_plot(top_n=5):
|
|
| 419 |
lightness = 45 + (index % 2) * 10 # Vary lightness slightly
|
| 420 |
return f'hsl({hue}, {saturation}%, {lightness}%)'
|
| 421 |
|
| 422 |
-
agents = metrics['agents']
|
| 423 |
months = metrics['months']
|
| 424 |
data = metrics['data']
|
| 425 |
|
| 426 |
-
# Generate colors for all agents
|
| 427 |
-
agent_colors = {agent: generate_color(idx, len(agents)) for idx, agent in enumerate(agents)}
|
| 428 |
|
| 429 |
-
# Add traces for each agent
|
| 430 |
-
for idx, agent_name in enumerate(agents):
|
| 431 |
color = agent_colors[agent_name]
|
| 432 |
agent_data = data[agent_name]
|
| 433 |
|
|
@@ -447,8 +447,8 @@ def create_monthly_metrics_plot(top_n=5):
|
|
| 447 |
line=dict(color=color, width=2),
|
| 448 |
marker=dict(size=8),
|
| 449 |
legendgroup=agent_name,
|
| 450 |
-
showlegend=(top_n is not None and top_n <= 10), # Show legend for top N agents
|
| 451 |
-
hovertemplate='<b>Agent: %{fullData.name}</b><br>' +
|
| 452 |
'Month: %{x}<br>' +
|
| 453 |
'Resolved Rate: %{y:.2f}%<br>' +
|
| 454 |
'<extra></extra>'
|
|
@@ -457,7 +457,7 @@ def create_monthly_metrics_plot(top_n=5):
|
|
| 457 |
)
|
| 458 |
|
| 459 |
# Add bar trace for total issues (right y-axis)
|
| 460 |
-
# Only show bars for months where agent has issues
|
| 461 |
x_bars = []
|
| 462 |
y_bars = []
|
| 463 |
for month, count in zip(months, agent_data['total_issues']):
|
|
@@ -474,11 +474,11 @@ def create_monthly_metrics_plot(top_n=5):
|
|
| 474 |
marker=dict(color=color, opacity=0.6),
|
| 475 |
legendgroup=agent_name,
|
| 476 |
showlegend=False, # Hide duplicate legend entry (already shown in Scatter)
|
| 477 |
-
hovertemplate='<b>Agent: %{fullData.name}</b><br>' +
|
| 478 |
'Month: %{x}<br>' +
|
| 479 |
'Total Issues: %{y}<br>' +
|
| 480 |
'<extra></extra>',
|
| 481 |
-
offsetgroup=agent_name # Group bars by agent for proper spacing
|
| 482 |
),
|
| 483 |
secondary_y=True
|
| 484 |
)
|
|
@@ -500,7 +500,7 @@ def create_monthly_metrics_plot(top_n=5):
|
|
| 500 |
show_legend = (top_n is not None and top_n <= 10)
|
| 501 |
fig.update_layout(
|
| 502 |
title=None,
|
| 503 |
-
hovermode='closest', # Show individual agent info on hover
|
| 504 |
barmode='group',
|
| 505 |
height=600,
|
| 506 |
showlegend=show_legend,
|
|
@@ -516,10 +516,10 @@ def create_discussion_monthly_metrics_plot(top_n=5):
|
|
| 516 |
- Left y-axis: Discussion Resolved Rate (%) as line curves
|
| 517 |
- Right y-axis: Total Discussions created as bar charts
|
| 518 |
|
| 519 |
-
Each agent gets a unique color for both their line and bars.
|
| 520 |
|
| 521 |
Args:
|
| 522 |
-
top_n: Number of top agents to show (default: 5)
|
| 523 |
"""
|
| 524 |
# Load from saved dataset
|
| 525 |
saved_data = load_leaderboard_data_from_hf()
|
|
@@ -544,10 +544,10 @@ def create_discussion_monthly_metrics_plot(top_n=5):
|
|
| 544 |
print(f"Loaded discussion monthly metrics from saved dataset")
|
| 545 |
|
| 546 |
# Apply top_n filter if specified
|
| 547 |
-
if top_n is not None and top_n > 0 and metrics.get('agents'):
|
| 548 |
-
# Calculate total discussions for each agent
|
| 549 |
agent_totals = []
|
| 550 |
-
for agent_name in metrics['agents']:
|
| 551 |
agent_data = metrics['data'].get(agent_name, {})
|
| 552 |
total_discussions = agent_data.get('total_discussions')
|
| 553 |
agent_totals.append((agent_name, total_discussions))
|
|
@@ -556,14 +556,14 @@ def create_discussion_monthly_metrics_plot(top_n=5):
|
|
| 556 |
agent_totals.sort(key=lambda x: x[1], reverse=True)
|
| 557 |
top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
|
| 558 |
|
| 559 |
-
# Filter metrics to only include top agents
|
| 560 |
metrics = {
|
| 561 |
-
'agents': top_agents,
|
| 562 |
'months': metrics['months'],
|
| 563 |
-
'data': {agent: metrics['data'][agent] for agent in top_agents if agent in metrics['data']}
|
| 564 |
}
|
| 565 |
|
| 566 |
-
if not metrics['agents'] or not metrics['months']:
|
| 567 |
# Return an empty figure with a message
|
| 568 |
fig = go.Figure()
|
| 569 |
fig.add_annotation(
|
|
@@ -582,7 +582,7 @@ def create_discussion_monthly_metrics_plot(top_n=5):
|
|
| 582 |
# Create figure with secondary y-axis
|
| 583 |
fig = make_subplots(specs=[[{"secondary_y": True}]])
|
| 584 |
|
| 585 |
-
# Generate unique colors for many agents using HSL color space
|
| 586 |
def generate_color(index, total):
|
| 587 |
"""Generate distinct colors using HSL color space for better distribution"""
|
| 588 |
hue = (index * 360 / total) % 360
|
|
@@ -590,15 +590,15 @@ def create_discussion_monthly_metrics_plot(top_n=5):
|
|
| 590 |
lightness = 45 + (index % 2) * 10 # Vary lightness slightly
|
| 591 |
return f'hsl({hue}, {saturation}%, {lightness}%)'
|
| 592 |
|
| 593 |
-
agents = metrics['agents']
|
| 594 |
months = metrics['months']
|
| 595 |
data = metrics['data']
|
| 596 |
|
| 597 |
-
# Generate colors for all agents
|
| 598 |
-
agent_colors = {agent: generate_color(idx, len(agents)) for idx, agent in enumerate(agents)}
|
| 599 |
|
| 600 |
-
# Add traces for each agent
|
| 601 |
-
for idx, agent_name in enumerate(agents):
|
| 602 |
color = agent_colors[agent_name]
|
| 603 |
agent_data = data[agent_name]
|
| 604 |
|
|
@@ -618,8 +618,8 @@ def create_discussion_monthly_metrics_plot(top_n=5):
|
|
| 618 |
line=dict(color=color, width=2),
|
| 619 |
marker=dict(size=8),
|
| 620 |
legendgroup=agent_name,
|
| 621 |
-
showlegend=(top_n is not None and top_n <= 10), # Show legend for top N agents
|
| 622 |
-
hovertemplate='<b>Agent: %{fullData.name}</b><br>' +
|
| 623 |
'Month: %{x}<br>' +
|
| 624 |
'Discussion Resolved Rate: %{y:.2f}%<br>' +
|
| 625 |
'<extra></extra>'
|
|
@@ -628,7 +628,7 @@ def create_discussion_monthly_metrics_plot(top_n=5):
|
|
| 628 |
)
|
| 629 |
|
| 630 |
# Add bar trace for total discussions (right y-axis)
|
| 631 |
-
# Only show bars for months where agent has discussions
|
| 632 |
x_bars = []
|
| 633 |
y_bars = []
|
| 634 |
for month, count in zip(months, agent_data['total_discussions']):
|
|
@@ -645,11 +645,11 @@ def create_discussion_monthly_metrics_plot(top_n=5):
|
|
| 645 |
marker=dict(color=color, opacity=0.6),
|
| 646 |
legendgroup=agent_name,
|
| 647 |
showlegend=False, # Hide duplicate legend entry (already shown in Scatter)
|
| 648 |
-
hovertemplate='<b>Agent: %{fullData.name}</b><br>' +
|
| 649 |
'Month: %{x}<br>' +
|
| 650 |
'Total Discussions: %{y}<br>' +
|
| 651 |
'<extra></extra>',
|
| 652 |
-
offsetgroup=agent_name # Group bars by agent for proper spacing
|
| 653 |
),
|
| 654 |
secondary_y=True
|
| 655 |
)
|
|
@@ -671,7 +671,7 @@ def create_discussion_monthly_metrics_plot(top_n=5):
|
|
| 671 |
show_legend = (top_n is not None and top_n <= 10)
|
| 672 |
fig.update_layout(
|
| 673 |
title=None,
|
| 674 |
-
hovermode='closest', # Show individual agent info on hover
|
| 675 |
barmode='group',
|
| 676 |
height=600,
|
| 677 |
showlegend=show_legend,
|
|
@@ -710,9 +710,9 @@ def get_leaderboard_dataframe():
|
|
| 710 |
filtered_count = 0
|
| 711 |
for identifier, data in cache_dict.items():
|
| 712 |
total_issues = data.get('total_issues', 0)
|
| 713 |
-
print(f"
|
| 714 |
|
| 715 |
-
# Filter out agents with zero total issues
|
| 716 |
if total_issues == 0:
|
| 717 |
filtered_count += 1
|
| 718 |
continue
|
|
@@ -730,8 +730,8 @@ def get_leaderboard_dataframe():
|
|
| 730 |
data.get('resolved_discussions', 0), # Resolved Discussions
|
| 731 |
])
|
| 732 |
|
| 733 |
-
print(f"Filtered out {filtered_count}
|
| 734 |
-
print(f"Leaderboard will show {len(rows)}
|
| 735 |
|
| 736 |
# Create DataFrame
|
| 737 |
column_names = [col[0] for col in LEADERBOARD_COLUMNS]
|
|
@@ -807,14 +807,14 @@ def get_wanted_issues_dataframe():
|
|
| 807 |
|
| 808 |
def submit_agent(identifier, agent_name, organization, website):
|
| 809 |
"""
|
| 810 |
-
Submit a new agent to the leaderboard.
|
| 811 |
Validates input and saves submission.
|
| 812 |
"""
|
| 813 |
# Validate required fields
|
| 814 |
if not identifier or not identifier.strip():
|
| 815 |
return "ERROR: GitHub identifier is required", gr.update()
|
| 816 |
if not agent_name or not agent_name.strip():
|
| 817 |
-
return "ERROR:
|
| 818 |
if not organization or not organization.strip():
|
| 819 |
return "ERROR: Organization name is required", gr.update()
|
| 820 |
if not website or not website.strip():
|
|
@@ -831,12 +831,12 @@ def submit_agent(identifier, agent_name, organization, website):
|
|
| 831 |
if not is_valid:
|
| 832 |
return f"ERROR: {message}", gr.update()
|
| 833 |
|
| 834 |
-
# Check for duplicates by loading agents from HuggingFace
|
| 835 |
-
agents = load_agents_from_hf()
|
| 836 |
-
if agents:
|
| 837 |
-
existing_names = {agent['github_identifier'] for agent in agents}
|
| 838 |
if identifier in existing_names:
|
| 839 |
-
return f"WARNING:
|
| 840 |
|
| 841 |
# Create submission
|
| 842 |
submission = {
|
|
@@ -873,7 +873,7 @@ def reload_leaderboard_data():
|
|
| 873 |
if data:
|
| 874 |
print(f"Successfully reloaded leaderboard data")
|
| 875 |
print(f" Last updated: {data.get('metadata', {}).get('last_updated', 'Unknown')}")
|
| 876 |
-
print(f"
|
| 877 |
else:
|
| 878 |
print(f"No data available")
|
| 879 |
except Exception as e:
|
|
@@ -886,7 +886,7 @@ def reload_leaderboard_data():
|
|
| 886 |
# GRADIO APPLICATION
|
| 887 |
# =============================================================================
|
| 888 |
|
| 889 |
-
print(f"\nStarting SWE
|
| 890 |
print(f" Data source: {LEADERBOARD_REPO}")
|
| 891 |
print(f" Reload frequency: Daily at 12:00 AM UTC\n")
|
| 892 |
|
|
@@ -907,19 +907,19 @@ print(f"On startup: Loads cached data from HuggingFace on demand")
|
|
| 907 |
print(f"{'='*80}\n")
|
| 908 |
|
| 909 |
# Create Gradio interface
|
| 910 |
-
with gr.Blocks(title="SWE Agent Issue & Discussion Leaderboard", theme=gr.themes.Soft()) as app:
|
| 911 |
-
gr.Markdown("# SWE
|
| 912 |
-
gr.Markdown(f"Track and compare GitHub issue and discussion resolution statistics for SWE
|
| 913 |
|
| 914 |
with gr.Tabs():
|
| 915 |
|
| 916 |
# Leaderboard Tab
|
| 917 |
with gr.Tab("Leaderboard"):
|
| 918 |
-
gr.Markdown("*Statistics are based on
|
| 919 |
leaderboard_table = Leaderboard(
|
| 920 |
value=pd.DataFrame(columns=[col[0] for col in LEADERBOARD_COLUMNS]), # Empty initially
|
| 921 |
datatype=LEADERBOARD_COLUMNS,
|
| 922 |
-
search_columns=["
|
| 923 |
filter_columns=[
|
| 924 |
ColumnFilter(
|
| 925 |
"Issue Resolved Rate (%)",
|
|
@@ -942,8 +942,8 @@ with gr.Blocks(title="SWE Agent Issue & Discussion Leaderboard", theme=gr.themes
|
|
| 942 |
# Monthly Metrics Section
|
| 943 |
gr.Markdown("---") # Divider
|
| 944 |
with gr.Group():
|
| 945 |
-
gr.Markdown("### Issue Monthly Performance - Top 5
|
| 946 |
-
gr.Markdown("*Shows issue resolution trends and volumes for the most active
|
| 947 |
monthly_metrics_plot = gr.Plot(label="Issue Monthly Metrics")
|
| 948 |
|
| 949 |
# Load monthly metrics when app starts
|
|
@@ -956,8 +956,8 @@ with gr.Blocks(title="SWE Agent Issue & Discussion Leaderboard", theme=gr.themes
|
|
| 956 |
# Discussion Monthly Metrics Section
|
| 957 |
gr.Markdown("---") # Divider
|
| 958 |
with gr.Group():
|
| 959 |
-
gr.Markdown("### Discussion Monthly Performance - Top 5
|
| 960 |
-
gr.Markdown("*Shows discussion resolution trends and volumes for the most active
|
| 961 |
discussion_metrics_plot = gr.Plot(label="Discussion Monthly Metrics")
|
| 962 |
|
| 963 |
# Load discussion monthly metrics when app starts
|
|
@@ -987,20 +987,20 @@ with gr.Blocks(title="SWE Agent Issue & Discussion Leaderboard", theme=gr.themes
|
|
| 987 |
)
|
| 988 |
|
| 989 |
|
| 990 |
-
# Submit Agent Tab
|
| 991 |
-
with gr.Tab("Submit Your Agent"):
|
| 992 |
|
| 993 |
-
gr.Markdown("Fill in the details below to add your
|
| 994 |
|
| 995 |
with gr.Row():
|
| 996 |
with gr.Column():
|
| 997 |
github_input = gr.Textbox(
|
| 998 |
label="GitHub Identifier*",
|
| 999 |
-
placeholder="Your
|
| 1000 |
)
|
| 1001 |
name_input = gr.Textbox(
|
| 1002 |
-
label="
|
| 1003 |
-
placeholder="Your
|
| 1004 |
)
|
| 1005 |
|
| 1006 |
with gr.Column():
|
|
@@ -1010,11 +1010,11 @@ with gr.Blocks(title="SWE Agent Issue & Discussion Leaderboard", theme=gr.themes
|
|
| 1010 |
)
|
| 1011 |
website_input = gr.Textbox(
|
| 1012 |
label="Website*",
|
| 1013 |
-
placeholder="https://your-
|
| 1014 |
)
|
| 1015 |
|
| 1016 |
submit_button = gr.Button(
|
| 1017 |
-
"Submit
|
| 1018 |
variant="primary"
|
| 1019 |
)
|
| 1020 |
submission_status = gr.Textbox(
|
|
|
|
| 24 |
# CONFIGURATION
|
| 25 |
# =============================================================================
|
| 26 |
|
| 27 |
+
AGENTS_REPO = "SWE-Arena/bot_metadata" # HuggingFace dataset for assistant metadata
|
| 28 |
AGENTS_REPO_LOCAL_PATH = os.path.expanduser("~/bot_metadata") # Local git clone path
|
| 29 |
LEADERBOARD_FILENAME = f"{os.getenv('COMPOSE_PROJECT_NAME')}.json"
|
| 30 |
LEADERBOARD_REPO = "SWE-Arena/leaderboard_data" # HuggingFace dataset for leaderboard data
|
|
|
|
| 33 |
MAX_RETRIES = 5
|
| 34 |
|
| 35 |
LEADERBOARD_COLUMNS = [
|
| 36 |
+
("Assistant", "string"),
|
| 37 |
("Website", "string"),
|
| 38 |
("Total Issues", "number"),
|
| 39 |
("Total Discussions", "number"),
|
|
|
|
| 162 |
|
| 163 |
def load_agents_from_hf():
|
| 164 |
"""
|
| 165 |
+
Load all assistant metadata JSON files from local git repository.
|
| 166 |
ALWAYS syncs with remote first to ensure we have the latest bot data.
|
| 167 |
"""
|
| 168 |
# MANDATORY: Sync with remote first to get latest bot data
|
| 169 |
+
print(f" Syncing bot_metadata repository to get latest assistants...")
|
| 170 |
sync_agents_repo() # Will raise exception if sync fails
|
| 171 |
|
| 172 |
+
assistants = []
|
| 173 |
|
| 174 |
# Scan local directory for JSON files
|
| 175 |
if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
|
|
|
|
| 177 |
|
| 178 |
# Walk through the directory to find all JSON files
|
| 179 |
files_processed = 0
|
| 180 |
+
print(f" Loading assistant metadata from {AGENTS_REPO_LOCAL_PATH}...")
|
| 181 |
|
| 182 |
for root, dirs, files in os.walk(AGENTS_REPO_LOCAL_PATH):
|
| 183 |
# Skip .git directory
|
|
|
|
| 195 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 196 |
agent_data = json.load(f)
|
| 197 |
|
| 198 |
+
# Only include active assistants
|
| 199 |
if agent_data.get('status') != 'active':
|
| 200 |
continue
|
| 201 |
|
|
|
|
| 203 |
github_identifier = filename.replace('.json', '')
|
| 204 |
agent_data['github_identifier'] = github_identifier
|
| 205 |
|
| 206 |
+
assistants.append(agent_data)
|
| 207 |
|
| 208 |
except Exception as e:
|
| 209 |
print(f" Warning Error loading {filename}: {str(e)}")
|
| 210 |
continue
|
| 211 |
|
| 212 |
+
print(f" Success Loaded {len(assistants)} active assistants (from {files_processed} total files)")
|
| 213 |
+
return assistants
|
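For context, the loader above treats each `{identifier}.json` in the repo as one assistant record, deriving `github_identifier` from the filename. A hypothetical example of such a file (only `status` and the filename-derived identifier are confirmed by this code; the other fields are assumptions):

```python
# Hypothetical contents of ~/bot_metadata/my-assistant[bot].json.
# 'status' must be 'active' to pass the filter above; 'name' and 'website'
# are assumed fields, inferred from how the leaderboard displays assistants.
example_record = {
    "name": "My Assistant",
    "website": "https://example.com",
    "status": "active",
}
```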
| 214 |
|
| 215 |
|
| 216 |
def get_hf_token():
|
|
|
|
| 265 |
|
| 266 |
|
| 267 |
def save_agent_to_hf(data):
|
| 268 |
+
"""Save a new assistant to HuggingFace dataset as {identifier}.json in root."""
|
| 269 |
try:
|
| 270 |
api = HfApi()
|
| 271 |
token = get_hf_token()
|
|
|
|
| 290 |
repo_type="dataset",
|
| 291 |
token=token
|
| 292 |
)
|
| 293 |
+
print(f"Saved assistant to HuggingFace: {filename}")
|
| 294 |
return True
|
| 295 |
finally:
|
| 296 |
# Always clean up local file, even if upload fails
|
|
|
|
| 298 |
os.remove(filename)
|
| 299 |
|
| 300 |
except Exception as e:
|
| 301 |
+
print(f"Error saving assistant: {str(e)}")
|
| 302 |
return False
|
| 303 |
|
| 304 |
|
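`save_agent_to_hf` delegates to the `upload_with_retry` helper visible in the hunk header above (`MAX_RETRIES = 5` in the config). A minimal sketch of what such a wrapper could look like, assuming exponential backoff over `HfApi.upload_file`:

```python
import time
from huggingface_hub import HfApi

def upload_with_retry(api: HfApi, path_or_fileobj, path_in_repo, repo_id, repo_type,
                      token, max_retries: int = 5):
    """Sketch: retry HfApi.upload_file with exponential backoff on transient errors."""
    for attempt in range(max_retries):
        try:
            return api.upload_file(
                path_or_fileobj=path_or_fileobj,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type=repo_type,
                token=token,
            )
        except Exception:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)  # back off 1s, 2s, 4s, ...
```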
|
|
|
| 345 |
- Left y-axis: Resolved Rate (%) as line curves
|
| 346 |
- Right y-axis: Total Issues created as bar charts
|
| 347 |
|
| 348 |
+
Each assistant gets a unique color for both their line and bars.
|
| 349 |
|
| 350 |
Args:
|
| 351 |
+
top_n: Number of top assistants to show (default: 5)
|
| 352 |
"""
|
| 353 |
# Load from saved dataset
|
| 354 |
saved_data = load_leaderboard_data_from_hf()
|
|
|
|
| 373 |
print(f"Loaded monthly metrics from saved dataset")
|
| 374 |
|
| 375 |
# Apply top_n filter if specified
|
| 376 |
+
if top_n is not None and top_n > 0 and metrics.get('assistants'):
|
| 377 |
+
# Calculate total issues for each assistant
|
| 378 |
agent_totals = []
|
| 379 |
+
for agent_name in metrics['assistants']:
|
| 380 |
agent_data = metrics['data'].get(agent_name, {})
|
| 381 |
total_issues = sum(agent_data.get('total_issues', []))
|
| 382 |
agent_totals.append((agent_name, total_issues))
|
|
|
|
| 385 |
agent_totals.sort(key=lambda x: x[1], reverse=True)
|
| 386 |
top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
|
| 387 |
|
| 388 |
+
# Filter metrics to only include top assistants
|
| 389 |
metrics = {
|
| 390 |
+
'assistants': top_agents,
|
| 391 |
'months': metrics['months'],
|
| 392 |
+
'data': {assistant: metrics['data'][assistant] for assistant in top_agents if assistant in metrics['data']}
|
| 393 |
}
|
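A toy run of the top-N filter above, using a made-up `metrics` dict in the shape this function builds:

```python
# Toy illustration of the top_n filtering (assistant names are made up).
metrics = {
    "assistants": ["bot-a", "bot-b", "bot-c"],
    "months": ["2025-01", "2025-02"],
    "data": {
        "bot-a": {"total_issues": [5, 7]},   # 12 issues total
        "bot-b": {"total_issues": [1, 0]},   # 1 issue total
        "bot-c": {"total_issues": [9, 2]},   # 11 issues total
    },
}
totals = sorted(
    ((name, sum(metrics["data"][name]["total_issues"])) for name in metrics["assistants"]),
    key=lambda t: t[1],
    reverse=True,
)
top_agents = [name for name, _ in totals[:2]]
assert top_agents == ["bot-a", "bot-c"]
```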
| 394 |
|
| 395 |
+
if not metrics['assistants'] or not metrics['months']:
|
| 396 |
# Return an empty figure with a message
|
| 397 |
fig = go.Figure()
|
| 398 |
fig.add_annotation(
|
|
|
|
| 411 |
# Create figure with secondary y-axis
|
| 412 |
fig = make_subplots(specs=[[{"secondary_y": True}]])
|
| 413 |
|
| 414 |
+
# Generate unique colors for many assistants using HSL color space
|
| 415 |
def generate_color(index, total):
|
| 416 |
"""Generate distinct colors using HSL color space for better distribution"""
|
| 417 |
hue = (index * 360 / total) % 360
|
|
|
|
| 419 |
lightness = 45 + (index % 2) * 10 # Vary lightness slightly
|
| 420 |
return f'hsl({hue}, {saturation}%, {lightness}%)'
|
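For intuition, `generate_color` spreads hues evenly around the HSL color wheel and alternates lightness, so consecutive assistants get visually distinct colors. The saturation line falls outside this hunk, so the value below is an assumption:

```python
def generate_color(index, total):
    """Evenly spaced HSL hues; lightness alternates between 45% and 55%."""
    hue = (index * 360 / total) % 360
    saturation = 70  # assumed; the actual constant is outside the diff context
    lightness = 45 + (index % 2) * 10
    return f'hsl({hue}, {saturation}%, {lightness}%)'

print([generate_color(i, 4) for i in range(4)])
# ['hsl(0.0, 70%, 45%)', 'hsl(90.0, 70%, 55%)', 'hsl(180.0, 70%, 45%)', 'hsl(270.0, 70%, 55%)']
```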
| 421 |
|
| 422 |
+
assistants = metrics['assistants']
|
| 423 |
months = metrics['months']
|
| 424 |
data = metrics['data']
|
| 425 |
|
| 426 |
+
# Generate colors for all assistants
|
| 427 |
+
agent_colors = {assistant: generate_color(idx, len(assistants)) for idx, assistant in enumerate(assistants)}
|
| 428 |
|
| 429 |
+
# Add traces for each assistant
|
| 430 |
+
for idx, agent_name in enumerate(assistants):
|
| 431 |
color = agent_colors[agent_name]
|
| 432 |
agent_data = data[agent_name]
|
| 433 |
|
|
|
|
| 447 |
line=dict(color=color, width=2),
|
| 448 |
marker=dict(size=8),
|
| 449 |
legendgroup=agent_name,
|
| 450 |
+
showlegend=(top_n is not None and top_n <= 10), # Show legend for top N assistants
|
| 451 |
+
hovertemplate='<b>Assistant: %{fullData.name}</b><br>' +
|
| 452 |
'Month: %{x}<br>' +
|
| 453 |
'Resolved Rate: %{y:.2f}%<br>' +
|
| 454 |
'<extra></extra>'
|
|
|
|
| 457 |
)
|
| 458 |
|
| 459 |
# Add bar trace for total issues (right y-axis)
|
| 460 |
+
# Only show bars for months where assistant has issues
|
| 461 |
x_bars = []
|
| 462 |
y_bars = []
|
| 463 |
for month, count in zip(months, agent_data['total_issues']):
|
|
|
|
| 474 |
marker=dict(color=color, opacity=0.6),
|
| 475 |
legendgroup=agent_name,
|
| 476 |
showlegend=False, # Hide duplicate legend entry (already shown in Scatter)
|
| 477 |
+
hovertemplate='<b>Assistant: %{fullData.name}</b><br>' +
|
| 478 |
'Month: %{x}<br>' +
|
| 479 |
'Total Issues: %{y}<br>' +
|
| 480 |
'<extra></extra>',
|
| 481 |
+
offsetgroup=agent_name # Group bars by assistant for proper spacing
|
| 482 |
),
|
| 483 |
secondary_y=True
|
| 484 |
)
|
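The Scatter/Bar pairing above implements a dual-axis chart: one line per assistant on the left axis, grouped bars on the right, tied together via `legendgroup` and `offsetgroup`. A self-contained sketch of the same pattern with toy data:

```python
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots(specs=[[{"secondary_y": True}]])
months = ["2025-01", "2025-02", "2025-03"]
# Line: resolved rate on the left y-axis.
fig.add_trace(go.Scatter(x=months, y=[40.0, 55.0, 62.5], name="bot-a",
                         legendgroup="bot-a"), secondary_y=False)
# Bars: issue volume on the right y-axis, sharing the line's legend entry.
fig.add_trace(go.Bar(x=months, y=[10, 20, 16], name="bot-a", legendgroup="bot-a",
                     showlegend=False, offsetgroup="bot-a"), secondary_y=True)
fig.update_yaxes(title_text="Resolved Rate (%)", secondary_y=False)
fig.update_yaxes(title_text="Total Issues", secondary_y=True)
fig.update_layout(barmode="group", hovermode="closest", height=600)
```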
|
|
|
| 500 |
show_legend = (top_n is not None and top_n <= 10)
|
| 501 |
fig.update_layout(
|
| 502 |
title=None,
|
| 503 |
+
hovermode='closest', # Show individual assistant info on hover
|
| 504 |
barmode='group',
|
| 505 |
height=600,
|
| 506 |
showlegend=show_legend,
|
|
|
|
| 516 |
- Left y-axis: Discussion Resolved Rate (%) as line curves
|
| 517 |
- Right y-axis: Total Discussions created as bar charts
|
| 518 |
|
| 519 |
+
Each assistant gets a unique color for both their line and bars.
|
| 520 |
|
| 521 |
Args:
|
| 522 |
+
top_n: Number of top assistants to show (default: 5)
|
| 523 |
"""
|
| 524 |
# Load from saved dataset
|
| 525 |
saved_data = load_leaderboard_data_from_hf()
|
|
|
|
| 544 |
print(f"Loaded discussion monthly metrics from saved dataset")
|
| 545 |
|
| 546 |
# Apply top_n filter if specified
|
| 547 |
+
if top_n is not None and top_n > 0 and metrics.get('assistants'):
|
| 548 |
+
# Calculate total discussions for each assistant
|
| 549 |
agent_totals = []
|
| 550 |
+
for agent_name in metrics['assistants']:
|
| 551 |
agent_data = metrics['data'].get(agent_name, {})
|
| 552 |
total_discussions = agent_data.get('total_discussions')
|
| 553 |
agent_totals.append((agent_name, total_discussions))
|
|
|
|
| 556 |
agent_totals.sort(key=lambda x: x[1], reverse=True)
|
| 557 |
top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
|
| 558 |
|
| 559 |
+
# Filter metrics to only include top assistants
|
| 560 |
metrics = {
|
| 561 |
+
'assistants': top_agents,
|
| 562 |
'months': metrics['months'],
|
| 563 |
+
'data': {assistant: metrics['data'][assistant] for assistant in top_agents if assistant in metrics['data']}
|
| 564 |
}
|
| 565 |
|
| 566 |
+
if not metrics['assistants'] or not metrics['months']:
|
| 567 |
# Return an empty figure with a message
|
| 568 |
fig = go.Figure()
|
| 569 |
fig.add_annotation(
|
|
|
|
| 582 |
# Create figure with secondary y-axis
|
| 583 |
fig = make_subplots(specs=[[{"secondary_y": True}]])
|
| 584 |
|
| 585 |
+
# Generate unique colors for many assistants using HSL color space
|
| 586 |
def generate_color(index, total):
|
| 587 |
"""Generate distinct colors using HSL color space for better distribution"""
|
| 588 |
hue = (index * 360 / total) % 360
|
|
|
|
| 590 |
lightness = 45 + (index % 2) * 10 # Vary lightness slightly
|
| 591 |
return f'hsl({hue}, {saturation}%, {lightness}%)'
|
| 592 |
|
| 593 |
+
assistants = metrics['assistants']
|
| 594 |
months = metrics['months']
|
| 595 |
data = metrics['data']
|
| 596 |
|
| 597 |
+
# Generate colors for all assistants
|
| 598 |
+
agent_colors = {assistant: generate_color(idx, len(assistants)) for idx, assistant in enumerate(assistants)}
|
| 599 |
|
| 600 |
+
# Add traces for each assistant
|
| 601 |
+
for idx, agent_name in enumerate(assistants):
|
| 602 |
color = agent_colors[agent_name]
|
| 603 |
agent_data = data[agent_name]
|
| 604 |
|
|
|
|
| 618 |
line=dict(color=color, width=2),
|
| 619 |
marker=dict(size=8),
|
| 620 |
legendgroup=agent_name,
|
| 621 |
+
showlegend=(top_n is not None and top_n <= 10), # Show legend for top N assistants
|
| 622 |
+
hovertemplate='<b>Assistant: %{fullData.name}</b><br>' +
|
| 623 |
'Month: %{x}<br>' +
|
| 624 |
'Discussion Resolved Rate: %{y:.2f}%<br>' +
|
| 625 |
'<extra></extra>'
|
|
|
|
| 628 |
)
|
| 629 |
|
| 630 |
# Add bar trace for total discussions (right y-axis)
|
| 631 |
+
# Only show bars for months where assistant has discussions
|
| 632 |
x_bars = []
|
| 633 |
y_bars = []
|
| 634 |
for month, count in zip(months, agent_data['total_discussions']):
|
|
|
|
| 645 |
marker=dict(color=color, opacity=0.6),
|
| 646 |
legendgroup=agent_name,
|
| 647 |
showlegend=False, # Hide duplicate legend entry (already shown in Scatter)
|
| 648 |
+
hovertemplate='<b>Assistant: %{fullData.name}</b><br>' +
|
| 649 |
'Month: %{x}<br>' +
|
| 650 |
'Total Discussions: %{y}<br>' +
|
| 651 |
'<extra></extra>',
|
| 652 |
+
offsetgroup=agent_name # Group bars by assistant for proper spacing
|
| 653 |
),
|
| 654 |
secondary_y=True
|
| 655 |
)
|
|
|
|
| 671 |
show_legend = (top_n is not None and top_n <= 10)
|
| 672 |
fig.update_layout(
|
| 673 |
title=None,
|
| 674 |
+
hovermode='closest', # Show individual assistant info on hover
|
| 675 |
barmode='group',
|
| 676 |
height=600,
|
| 677 |
showlegend=show_legend,
|
|
|
|
| 710 |
filtered_count = 0
|
| 711 |
for identifier, data in cache_dict.items():
|
| 712 |
total_issues = data.get('total_issues', 0)
|
| 713 |
+
print(f" Assistant '{identifier}': {total_issues} issues")
|
| 714 |
|
| 715 |
+
# Filter out assistants with zero total issues
|
| 716 |
if total_issues == 0:
|
| 717 |
filtered_count += 1
|
| 718 |
continue
|
|
|
|
| 730 |
data.get('resolved_discussions', 0), # Resolved Discussions
|
| 731 |
])
|
| 732 |
|
| 733 |
+
print(f"Filtered out {filtered_count} assistants with 0 issues")
|
| 734 |
+
print(f"Leaderboard will show {len(rows)} assistants")
|
| 735 |
|
| 736 |
# Create DataFrame
|
| 737 |
column_names = [col[0] for col in LEADERBOARD_COLUMNS]
|
|
|
|
| 807 |
|
| 808 |
def submit_agent(identifier, agent_name, organization, website):
|
| 809 |
"""
|
| 810 |
+
Submit a new assistant to the leaderboard.
|
| 811 |
Validates input and saves submission.
|
| 812 |
"""
|
| 813 |
# Validate required fields
|
| 814 |
if not identifier or not identifier.strip():
|
| 815 |
return "ERROR: GitHub identifier is required", gr.update()
|
| 816 |
if not agent_name or not agent_name.strip():
|
| 817 |
+
return "ERROR: Assistant name is required", gr.update()
|
| 818 |
if not organization or not organization.strip():
|
| 819 |
return "ERROR: Organization name is required", gr.update()
|
| 820 |
if not website or not website.strip():
|
|
|
|
| 831 |
if not is_valid:
|
| 832 |
return f"ERROR: {message}", gr.update()
|
| 833 |
|
| 834 |
+
# Check for duplicates by loading assistants from HuggingFace
|
| 835 |
+
assistants = load_agents_from_hf()
|
| 836 |
+
if assistants:
|
| 837 |
+
existing_names = {assistant['github_identifier'] for assistant in assistants}
|
| 838 |
if identifier in existing_names:
|
| 839 |
+
return f"WARNING: Assistant with identifier '{identifier}' already exists", gr.update()
|
| 840 |
|
| 841 |
# Create submission
|
| 842 |
submission = {
|
|
|
|
| 873 |
if data:
|
| 874 |
print(f"Successfully reloaded leaderboard data")
|
| 875 |
print(f" Last updated: {data.get('metadata', {}).get('last_updated', 'Unknown')}")
|
| 876 |
+
print(f" Assistants: {len(data.get('leaderboard', {}))}")
|
| 877 |
else:
|
| 878 |
print(f"No data available")
|
| 879 |
except Exception as e:
|
|
|
|
| 886 |
# GRADIO APPLICATION
|
| 887 |
# =============================================================================
|
| 888 |
|
| 889 |
+
print(f"\nStarting SWE Assistant Issue Leaderboard")
|
| 890 |
print(f" Data source: {LEADERBOARD_REPO}")
|
| 891 |
print(f" Reload frequency: Daily at 12:00 AM UTC\n")
|
| 892 |
|
|
|
|
| 907 |
print(f"{'='*80}\n")
|
| 908 |
|
| 909 |
# Create Gradio interface
|
| 910 |
+
with gr.Blocks(title="SWE Assistant Issue & Discussion Leaderboard", theme=gr.themes.Soft()) as app:
|
| 911 |
+
gr.Markdown("# SWE Assistant Issue & Discussion Leaderboard")
|
| 912 |
+
gr.Markdown(f"Track and compare GitHub issue and discussion resolution statistics for SWE assistants")
|
| 913 |
|
| 914 |
with gr.Tabs():
|
| 915 |
|
| 916 |
# Leaderboard Tab
|
| 917 |
with gr.Tab("Leaderboard"):
|
| 918 |
+
gr.Markdown("*Statistics are based on assistant issue resolution activity tracked by the system*")
|
| 919 |
leaderboard_table = Leaderboard(
|
| 920 |
value=pd.DataFrame(columns=[col[0] for col in LEADERBOARD_COLUMNS]), # Empty initially
|
| 921 |
datatype=LEADERBOARD_COLUMNS,
|
| 922 |
+
search_columns=["Assistant", "Website"],
|
| 923 |
filter_columns=[
|
| 924 |
ColumnFilter(
|
| 925 |
"Issue Resolved Rate (%)",
|
|
|
|
| 942 |
# Monthly Metrics Section
|
| 943 |
gr.Markdown("---") # Divider
|
| 944 |
with gr.Group():
|
| 945 |
+
gr.Markdown("### Issue Monthly Performance - Top 5 Assistants")
|
| 946 |
+
gr.Markdown("*Shows issue resolution trends and volumes for the most active assistants*")
|
| 947 |
monthly_metrics_plot = gr.Plot(label="Issue Monthly Metrics")
|
| 948 |
|
| 949 |
# Load monthly metrics when app starts
|
|
|
|
| 956 |
# Discussion Monthly Metrics Section
|
| 957 |
gr.Markdown("---") # Divider
|
| 958 |
with gr.Group():
|
| 959 |
+
gr.Markdown("### Discussion Monthly Performance - Top 5 Assistants")
|
| 960 |
+
gr.Markdown("*Shows discussion resolution trends and volumes for the most active assistants*")
|
| 961 |
discussion_metrics_plot = gr.Plot(label="Discussion Monthly Metrics")
|
| 962 |
|
| 963 |
# Load discussion monthly metrics when app starts
|
|
|
|
| 987 |
)
|
| 988 |
|
| 989 |
|
| 990 |
+
# Submit Assistant Tab
|
| 991 |
+
with gr.Tab("Submit Your Assistant"):
|
| 992 |
|
| 993 |
+
gr.Markdown("Fill in the details below to add your assistant to the leaderboard.")
|
| 994 |
|
| 995 |
with gr.Row():
|
| 996 |
with gr.Column():
|
| 997 |
github_input = gr.Textbox(
|
| 998 |
label="GitHub Identifier*",
|
| 999 |
+
placeholder="Your assistant username (e.g., my-assistant[bot])"
|
| 1000 |
)
|
| 1001 |
name_input = gr.Textbox(
|
| 1002 |
+
label="Assistant Name*",
|
| 1003 |
+
placeholder="Your assistant's display name"
|
| 1004 |
)
|
| 1005 |
|
| 1006 |
with gr.Column():
|
|
|
|
| 1010 |
)
|
| 1011 |
website_input = gr.Textbox(
|
| 1012 |
label="Website*",
|
| 1013 |
+
placeholder="https://your-assistant-website.com"
|
| 1014 |
)
|
| 1015 |
|
| 1016 |
submit_button = gr.Button(
|
| 1017 |
+
"Submit Assistant",
|
| 1018 |
variant="primary"
|
| 1019 |
)
|
| 1020 |
submission_status = gr.Textbox(
|
msr.py
CHANGED
|
@@ -361,14 +361,14 @@ def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_LO
|
|
| 361 |
def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
| 362 |
"""
|
| 363 |
UNIFIED QUERY: Fetches ALL metadata types in ONE query per batch:
|
| 364 |
-
- IssuesEvent, IssueCommentEvent (for agent-assigned issues AND wanted issues)
|
| 365 |
- PullRequestEvent (for wanted issue tracking)
|
| 366 |
- DiscussionEvent (for discussion tracking)
|
| 367 |
|
| 368 |
Then post-processes in Python to separate into:
|
| 369 |
-
1. Agent-assigned issues: Issues where agents are assigned to or commented on
|
| 370 |
-
2. Wanted issues: Long-standing issues from tracked orgs linked to merged PRs by agents
|
| 371 |
-
3. Discussions: GitHub discussions created by agents
|
| 372 |
|
| 373 |
This approach is more efficient than running separate queries for each category.
|
| 374 |
|
|
@@ -380,17 +380,17 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 380 |
|
| 381 |
Returns:
|
| 382 |
Dictionary with four keys:
|
| 383 |
-
- 'agent_issues': {agent_id: [issue_metadata]} for agent-assigned issues
|
| 384 |
- 'wanted_open': [open_wanted_issues] for long-standing open issues
|
| 385 |
- 'wanted_resolved': {agent_id: [resolved_wanted]} for resolved wanted issues
|
| 386 |
-
- 'agent_discussions': {agent_id: [discussion_metadata]} for agent discussions
|
| 387 |
"""
|
| 388 |
print(f" Fetching ALL metadata (issues, PRs, discussions) with unified query...")
|
| 389 |
identifier_set = set(identifiers)
|
| 390 |
identifier_list = ', '.join([f"'{id}'" for id in identifiers])
|
| 391 |
tracked_orgs_list = ', '.join([f"'{org}'" for org in TRACKED_ORGS])
|
| 392 |
|
| 393 |
-
# Storage for agent-assigned issues
|
| 394 |
agent_issues = defaultdict(list) # agent_id -> [issue_metadata]
|
| 395 |
agent_issue_urls = defaultdict(set) # agent_id -> set of issue URLs (for deduplication)
|
| 396 |
|
|
@@ -433,7 +433,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 433 |
|
| 434 |
try:
|
| 435 |
# UNIFIED QUERY: Fetch ALL event types in ONE query
|
| 436 |
-
# Post-process in Python to separate into agent-assigned issues, wanted issues, PRs, and discussions
|
| 437 |
unified_query = f"""
|
| 438 |
SELECT
|
| 439 |
type,
|
|
@@ -448,7 +448,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 448 |
json_extract(payload, '$.issue.labels') as issue_labels,
|
| 449 |
json_extract_string(payload, '$.issue.pull_request') as is_pull_request,
|
| 450 |
json_extract_string(payload, '$.issue.state_reason') as issue_state_reason,
|
| 451 |
-
-- Actor/assignee fields for agent assignment
|
| 452 |
json_extract_string(payload, '$.issue.user.login') as issue_creator,
|
| 453 |
json_extract_string(payload, '$.issue.assignee.login') as issue_assignee,
|
| 454 |
json_extract(payload, '$.issue.assignees') as issue_assignees,
|
|
@@ -481,7 +481,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 481 |
WHERE
|
| 482 |
type IN ('IssuesEvent', 'IssueCommentEvent', 'PullRequestEvent', 'DiscussionEvent')
|
| 483 |
AND (
|
| 484 |
-
-- Agent-assigned issues: agent is creator, assignee, or commenter
|
| 485 |
(type = 'IssuesEvent' AND (
|
| 486 |
json_extract_string(payload, '$.issue.user.login') IN ({identifier_list})
|
| 487 |
OR json_extract_string(payload, '$.issue.assignee.login') IN ({identifier_list})
|
|
@@ -491,17 +491,17 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 491 |
)
|
| 492 |
OR SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
|
| 493 |
))
|
| 494 |
-
-- Issue comments: agent is commenter OR tracked org
|
| 495 |
OR (type = 'IssueCommentEvent' AND (
|
| 496 |
json_extract_string(payload, '$.comment.user.login') IN ({identifier_list})
|
| 497 |
OR SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
|
| 498 |
))
|
| 499 |
-
-- PRs: agent is creator OR tracked org (for wanted issue tracking)
|
| 500 |
OR (type = 'PullRequestEvent' AND (
|
| 501 |
json_extract_string(payload, '$.pull_request.user.login') IN ({identifier_list})
|
| 502 |
OR SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
|
| 503 |
))
|
| 504 |
-
-- Discussions: agent is creator AND tracked org
|
| 505 |
OR (type = 'DiscussionEvent'
|
| 506 |
AND json_extract_string(payload, '$.discussion.user.login') IN ({identifier_list})
|
| 507 |
AND SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
|
|
@@ -522,7 +522,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 522 |
issue_events = [] # For wanted tracking
|
| 523 |
pr_events = [] # For wanted tracking
|
| 524 |
discussion_events = [] # For discussion tracking
|
| 525 |
-
agent_issue_events = [] # For agent-assigned issues
|
| 526 |
|
| 527 |
for row in all_results:
|
| 528 |
event_type = row[0]
|
|
@@ -530,7 +530,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 530 |
|
| 531 |
if event_type in ('IssuesEvent', 'IssueCommentEvent'):
|
| 532 |
if not is_pr: # It's an issue, not a PR
|
| 533 |
-
# Check if this is an agent-assigned issue
|
| 534 |
issue_creator = row[11]
|
| 535 |
issue_assignee = row[12]
|
| 536 |
issue_assignees_json = row[13]
|
|
@@ -564,7 +564,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 564 |
pass
|
| 565 |
|
| 566 |
elif event_type == 'IssueCommentEvent':
|
| 567 |
-
# Check if commenter is an agent
|
| 568 |
if commenter in identifier_set:
|
| 569 |
agent_identifier = commenter
|
| 570 |
|
|
@@ -584,7 +584,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 584 |
elif event_type == 'DiscussionEvent':
|
| 585 |
discussion_events.append(row)
|
| 586 |
|
| 587 |
-
# Process agent-assigned issues
|
| 588 |
for row, agent_identifier in agent_issue_events:
|
| 589 |
# Row indices: repo_url=2, issue_url=3, issue_created_at=6, issue_closed_at=7, issue_state_reason=10
|
| 590 |
repo_url = row[2]
|
|
@@ -612,7 +612,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 612 |
except:
|
| 613 |
continue
|
| 614 |
|
| 615 |
-
# Deduplicate: only add if we haven't seen this issue for this agent
|
| 616 |
if full_url in agent_issue_urls[agent_identifier]:
|
| 617 |
continue
|
| 618 |
|
|
@@ -753,8 +753,8 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 753 |
except:
|
| 754 |
continue
|
| 755 |
|
| 756 |
-
# Group by creator (agent identifier)
|
| 757 |
-
# Only track discussions from our agent identifiers
|
| 758 |
if discussion_creator not in identifier_set:
|
| 759 |
continue
|
| 760 |
|
|
@@ -776,12 +776,12 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 776 |
'state_reason': discussion_state_reason
|
| 777 |
}
|
| 778 |
|
| 779 |
-
# Group by agent
|
| 780 |
if discussion_creator not in discussions_by_agent:
|
| 781 |
discussions_by_agent[discussion_creator] = []
|
| 782 |
discussions_by_agent[discussion_creator].append(discussion_meta)
|
| 783 |
|
| 784 |
-
print(f"✓ {len(agent_issue_events)}
|
| 785 |
|
| 786 |
except Exception as e:
|
| 787 |
print(f"\n ✗ Batch {batch_num} error: {str(e)}")
|
|
@@ -790,7 +790,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 790 |
# Move to next batch
|
| 791 |
current_date = batch_end + timedelta(days=1)
|
| 792 |
|
| 793 |
-
# Post-processing: Filter issues and assign to agents
|
| 794 |
print(f"\n Post-processing {len(all_issues)} wanted issues...")
|
| 795 |
|
| 796 |
wanted_open = []
|
|
@@ -803,7 +803,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 803 |
if not linked_prs:
|
| 804 |
continue
|
| 805 |
|
| 806 |
-
# Check if any linked PR was merged AND created by an agent
|
| 807 |
resolved_by = None
|
| 808 |
for pr_url in linked_prs:
|
| 809 |
merged_at = pr_merged_at.get(pr_url)
|
|
@@ -862,10 +862,10 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 862 |
except:
|
| 863 |
pass
|
| 864 |
|
| 865 |
-
print(f" ✓ Found {sum(len(issues) for issues in agent_issues.values())}
|
| 866 |
print(f" ✓ Found {len(wanted_open)} long-standing open wanted issues")
|
| 867 |
-
print(f" ✓ Found {sum(len(issues) for issues in wanted_resolved.values())} resolved wanted issues across {len(wanted_resolved)}
|
| 868 |
-
print(f" ✓ Found {sum(len(discussions) for discussions in discussions_by_agent.values())} discussions across {len(discussions_by_agent)}
|
| 869 |
|
| 870 |
return {
|
| 871 |
'agent_issues': dict(agent_issues),
|
|
@@ -933,14 +933,14 @@ def sync_agents_repo():
|
|
| 933 |
|
| 934 |
def load_agents_from_hf():
|
| 935 |
"""
|
| 936 |
-
Load all agent metadata JSON files from local git repository.
|
| 937 |
ALWAYS syncs with remote first to ensure we have the latest bot data.
|
| 938 |
"""
|
| 939 |
# MANDATORY: Sync with remote first to get latest bot data
|
| 940 |
-
print(f" Syncing bot_metadata repository to get latest
|
| 941 |
sync_agents_repo() # Will raise exception if sync fails
|
| 942 |
|
| 943 |
-
agents = []
|
| 944 |
|
| 945 |
# Scan local directory for JSON files
|
| 946 |
if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
|
|
@@ -948,7 +948,7 @@ def load_agents_from_hf():
|
|
| 948 |
|
| 949 |
# Walk through the directory to find all JSON files
|
| 950 |
files_processed = 0
|
| 951 |
-
print(f" Loading
|
| 952 |
|
| 953 |
for root, dirs, files in os.walk(AGENTS_REPO_LOCAL_PATH):
|
| 954 |
# Skip .git directory
|
|
@@ -966,7 +966,7 @@ def load_agents_from_hf():
|
|
| 966 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 967 |
agent_data = json.load(f)
|
| 968 |
|
| 969 |
-
# Only include active agents
|
| 970 |
if agent_data.get('status') != 'active':
|
| 971 |
continue
|
| 972 |
|
|
@@ -974,14 +974,14 @@ def load_agents_from_hf():
|
|
| 974 |
github_identifier = filename.replace('.json', '')
|
| 975 |
agent_data['github_identifier'] = github_identifier
|
| 976 |
|
| 977 |
-
agents.append(agent_data)
|
| 978 |
|
| 979 |
except Exception as e:
|
| 980 |
print(f" ⚠ Error loading {filename}: {str(e)}")
|
| 981 |
continue
|
| 982 |
|
| 983 |
-
print(f" ✓ Loaded {len(
|
| 984 |
-
return agents
|
| 985 |
|
| 986 |
|
| 987 |
def calculate_issue_stats_from_metadata(metadata_list):
|
|
@@ -1002,12 +1002,12 @@ def calculate_issue_stats_from_metadata(metadata_list):
|
|
| 1002 |
}
|
| 1003 |
|
| 1004 |
|
| 1005 |
-
def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
|
| 1006 |
-
"""Calculate monthly metrics for all
|
| 1007 |
-
identifier_to_name = {agent.get('github_identifier'): agent.get('name') for agent in agents if agent.get('github_identifier')}
|
| 1008 |
|
| 1009 |
if not all_metadata_dict:
|
| 1010 |
-
return {'agents': [], 'months': [], 'data': {}}
|
| 1011 |
|
| 1012 |
agent_month_data = defaultdict(lambda: defaultdict(list))
|
| 1013 |
|
|
@@ -1065,7 +1065,7 @@ def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
|
|
| 1065 |
agents_list = sorted(list(agent_month_data.keys()))
|
| 1066 |
|
| 1067 |
return {
|
| 1068 |
-
'agents': agents_list,
|
| 1069 |
'months': months,
|
| 1070 |
'data': result_data
|
| 1071 |
}
|
|
@@ -1086,12 +1086,12 @@ def calculate_discussion_stats_from_metadata(metadata_list):
|
|
| 1086 |
}
|
| 1087 |
|
| 1088 |
|
| 1089 |
-
def calculate_monthly_metrics_by_agent_discussions(all_discussions_dict, agents):
|
| 1090 |
-
"""Calculate monthly metrics for discussions for all
|
| 1091 |
-
identifier_to_name = {agent.get('github_identifier'): agent.get('name') for agent in agents if agent.get('github_identifier')}
|
| 1092 |
|
| 1093 |
if not all_discussions_dict:
|
| 1094 |
-
return {'agents': [], 'months': [], 'data': {}}
|
| 1095 |
|
| 1096 |
agent_month_data = defaultdict(lambda: defaultdict(list))
|
| 1097 |
|
|
@@ -1145,23 +1145,23 @@ def calculate_monthly_metrics_by_agent_discussions(all_discussions_dict, agents)
|
|
| 1145 |
agents_list = sorted(list(agent_month_data.keys()))
|
| 1146 |
|
| 1147 |
return {
|
| 1148 |
-
'agents': agents_list,
|
| 1149 |
'months': months,
|
| 1150 |
'data': result_data
|
| 1151 |
}
|
| 1152 |
|
| 1153 |
|
| 1154 |
-
def construct_leaderboard_from_metadata(all_metadata_dict, agents, wanted_resolved_dict=None, discussions_dict=None):
|
| 1155 |
"""Construct leaderboard from in-memory issue metadata and discussion metadata.
|
| 1156 |
|
| 1157 |
Args:
|
| 1158 |
-
all_metadata_dict: Dictionary mapping agent identifiers to issue metadata lists
|
| 1159 |
-
agents: List of agent metadata dictionaries
|
| 1160 |
-
wanted_resolved_dict: Optional dictionary mapping agent identifiers to resolved wanted issues
|
| 1161 |
-
discussions_dict: Optional dictionary mapping agent identifiers to discussion metadata
|
| 1162 |
"""
|
| 1163 |
-
if not agents:
|
| 1164 |
-
print("Error: No
|
| 1165 |
return {}
|
| 1166 |
|
| 1167 |
if wanted_resolved_dict is None:
|
|
@@ -1172,9 +1172,9 @@ def construct_leaderboard_from_metadata(all_metadata_dict, agents, wanted_resolv
|
|
| 1172 |
|
| 1173 |
cache_dict = {}
|
| 1174 |
|
| 1175 |
-
for agent in agents:
|
| 1176 |
-
identifier = agent.get('github_identifier')
|
| 1177 |
-
agent_name = agent.get('name', identifier)
|
| 1178 |
|
| 1179 |
bot_metadata = all_metadata_dict.get(identifier, [])
|
| 1180 |
stats = calculate_issue_stats_from_metadata(bot_metadata)
|
|
@@ -1188,7 +1188,7 @@ def construct_leaderboard_from_metadata(all_metadata_dict, agents, wanted_resolv
|
|
| 1188 |
|
| 1189 |
cache_dict[identifier] = {
|
| 1190 |
'name': agent_name,
|
| 1191 |
-
'website': agent.get('website', ''),
|
| 1192 |
'github_identifier': identifier,
|
| 1193 |
**stats,
|
| 1194 |
'resolved_wanted_issues': resolved_wanted,
|
|
@@ -1211,7 +1211,7 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics, wanted_issues
|
|
| 1211 |
wanted_issues = []
|
| 1212 |
|
| 1213 |
if discussion_monthly_metrics is None:
|
| 1214 |
-
discussion_monthly_metrics = {'agents': [], 'months': [], 'data': {}}
|
| 1215 |
|
| 1216 |
combined_data = {
|
| 1217 |
'metadata': {
|
|
@@ -1255,7 +1255,7 @@ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics, wanted_issues
|
|
| 1255 |
|
| 1256 |
def mine_all_agents():
|
| 1257 |
"""
|
| 1258 |
-
Mine issue metadata for all agents.
|
| 1259 |
Downloads GHArchive data, then uses BATCH-based DuckDB queries.
|
| 1260 |
"""
|
| 1261 |
print(f"\n[1/4] Downloading GHArchive data...")
|
|
@@ -1263,19 +1263,19 @@ def mine_all_agents():
|
|
| 1263 |
if not download_all_gharchive_data():
|
| 1264 |
print("Warning: Download had errors, continuing with available data...")
|
| 1265 |
|
| 1266 |
-
print(f"\n[2/4] Loading
|
| 1267 |
|
| 1268 |
-
agents = load_agents_from_hf()
|
| 1269 |
-
if not agents:
|
| 1270 |
-
print("Error: No
|
| 1271 |
return
|
| 1272 |
|
| 1273 |
-
identifiers = [agent['github_identifier'] for agent in agents]
|
| 1274 |
if not identifiers:
|
| 1275 |
-
print("Error: No valid
|
| 1276 |
return
|
| 1277 |
|
| 1278 |
-
print(f"\n[3/4] Mining issue metadata ({len(identifiers)}
|
| 1279 |
|
| 1280 |
try:
|
| 1281 |
conn = get_duckdb_connection()
|
|
@@ -1309,11 +1309,11 @@ def mine_all_agents():
|
|
| 1309 |
|
| 1310 |
try:
|
| 1311 |
leaderboard_dict = construct_leaderboard_from_metadata(
|
| 1312 |
-
agent_issues,
|
| 1313 |
)
|
| 1314 |
-
monthly_metrics = calculate_monthly_metrics_by_agent(agent_issues, agents)
|
| 1315 |
discussion_monthly_metrics = calculate_monthly_metrics_by_agent_discussions(
|
| 1316 |
-
agent_discussions, agents
|
| 1317 |
)
|
| 1318 |
save_leaderboard_data_to_hf(
|
| 1319 |
leaderboard_dict, monthly_metrics, wanted_open, discussion_monthly_metrics
|
|
@@ -1350,7 +1350,7 @@ def setup_scheduler():
|
|
| 1350 |
mine_all_agents,
|
| 1351 |
trigger=trigger,
|
| 1352 |
id='mine_all_agents',
|
| 1353 |
-
name='Mine GHArchive data for all agents',
|
| 1354 |
replace_existing=True
|
| 1355 |
)
|
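app.py logs "Reload frequency: Daily at 12:00 AM UTC", and this hunk registers `mine_all_agents` on a cron `trigger`. A minimal sketch of that schedule, assuming APScheduler (whose `add_job`/`trigger` API the code here matches):

```python
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger

scheduler = BackgroundScheduler()
scheduler.add_job(
    mine_all_agents,  # the mining entry point defined earlier in msr.py
    trigger=CronTrigger(hour=0, minute=0, timezone="UTC"),  # daily at 12:00 AM UTC
    id="mine_all_agents",
    name="Mine GHArchive data for all agents",
    replace_existing=True,
)
scheduler.start()
```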
| 1356 |
|
|
|
|
| 361 |
def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
| 362 |
"""
|
| 363 |
UNIFIED QUERY: Fetches ALL metadata types in ONE query per batch:
|
| 364 |
+
- IssuesEvent, IssueCommentEvent (for assistant-assigned issues AND wanted issues)
|
| 365 |
- PullRequestEvent (for wanted issue tracking)
|
| 366 |
- DiscussionEvent (for discussion tracking)
|
| 367 |
|
| 368 |
Then post-processes in Python to separate into:
|
| 369 |
+
1. Assistant-assigned issues: Issues where assistants are assigned to or commented on
|
| 370 |
+
2. Wanted issues: Long-standing issues from tracked orgs linked to merged PRs by assistants
|
| 371 |
+
3. Discussions: GitHub discussions created by assistants
|
| 372 |
|
| 373 |
This approach is more efficient than running separate queries for each category.
|
| 374 |
|
|
|
|
| 380 |
|
| 381 |
Returns:
|
| 382 |
Dictionary with four keys:
|
| 383 |
+
- 'agent_issues': {agent_id: [issue_metadata]} for assistant-assigned issues
|
| 384 |
- 'wanted_open': [open_wanted_issues] for long-standing open issues
|
| 385 |
- 'wanted_resolved': {agent_id: [resolved_wanted]} for resolved wanted issues
|
| 386 |
+
- 'agent_discussions': {agent_id: [discussion_metadata]} for assistant discussions
|
| 387 |
"""
|
| 388 |
print(f" Fetching ALL metadata (issues, PRs, discussions) with unified query...")
|
| 389 |
identifier_set = set(identifiers)
|
| 390 |
identifier_list = ', '.join([f"'{id}'" for id in identifiers])
|
| 391 |
tracked_orgs_list = ', '.join([f"'{org}'" for org in TRACKED_ORGS])
|
| 392 |
|
| 393 |
+
# Storage for assistant-assigned issues
|
| 394 |
agent_issues = defaultdict(list) # agent_id -> [issue_metadata]
|
| 395 |
agent_issue_urls = defaultdict(set) # agent_id -> set of issue URLs (for deduplication)
|
| 396 |
|
|
|
|
| 433 |
|
| 434 |
try:
|
| 435 |
# UNIFIED QUERY: Fetch ALL event types in ONE query
|
| 436 |
+
# Post-process in Python to separate into assistant-assigned issues, wanted issues, PRs, and discussions
|
| 437 |
unified_query = f"""
|
| 438 |
SELECT
|
| 439 |
type,
|
|
|
|
| 448 |
json_extract(payload, '$.issue.labels') as issue_labels,
|
| 449 |
json_extract_string(payload, '$.issue.pull_request') as is_pull_request,
|
| 450 |
json_extract_string(payload, '$.issue.state_reason') as issue_state_reason,
|
| 451 |
+
-- Actor/assignee fields for assistant assignment
|
| 452 |
json_extract_string(payload, '$.issue.user.login') as issue_creator,
|
| 453 |
json_extract_string(payload, '$.issue.assignee.login') as issue_assignee,
|
| 454 |
json_extract(payload, '$.issue.assignees') as issue_assignees,
|
|
|
|
| 481 |
WHERE
|
| 482 |
type IN ('IssuesEvent', 'IssueCommentEvent', 'PullRequestEvent', 'DiscussionEvent')
|
| 483 |
AND (
|
| 484 |
+
-- Assistant-assigned issues: assistant is creator, assignee, or commenter
|
| 485 |
(type = 'IssuesEvent' AND (
|
| 486 |
json_extract_string(payload, '$.issue.user.login') IN ({identifier_list})
|
| 487 |
OR json_extract_string(payload, '$.issue.assignee.login') IN ({identifier_list})
|
|
|
|
| 491 |
)
|
| 492 |
OR SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
|
| 493 |
))
|
| 494 |
+
-- Issue comments: assistant is commenter OR tracked org
|
| 495 |
OR (type = 'IssueCommentEvent' AND (
|
| 496 |
json_extract_string(payload, '$.comment.user.login') IN ({identifier_list})
|
| 497 |
OR SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
|
| 498 |
))
|
| 499 |
+
-- PRs: assistant is creator OR tracked org (for wanted issue tracking)
|
| 500 |
OR (type = 'PullRequestEvent' AND (
|
| 501 |
json_extract_string(payload, '$.pull_request.user.login') IN ({identifier_list})
|
| 502 |
OR SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
|
| 503 |
))
|
| 504 |
+
-- Discussions: assistant is creator AND tracked org
|
| 505 |
OR (type = 'DiscussionEvent'
|
| 506 |
AND json_extract_string(payload, '$.discussion.user.login') IN ({identifier_list})
|
| 507 |
AND SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
|
|
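For reference, GHArchive publishes one gzipped JSON-lines file per hour, and the unified query above extracts fields from the raw `repo`/`payload` JSON. A minimal standalone sketch of the same pattern (the file name, and reading `repo`/`payload` as JSON columns so `json_extract_string` applies, are assumptions):

```python
import duckdb

conn = duckdb.connect()
rows = conn.execute("""
    SELECT
        type,
        json_extract_string(repo,    '$.name')             AS repo_name,
        json_extract_string(payload, '$.issue.user.login') AS issue_creator
    FROM read_json('2025-01-01-0.json.gz',
                   columns = {'type': 'VARCHAR', 'repo': 'JSON', 'payload': 'JSON'},
                   format = 'newline_delimited')
    WHERE type = 'IssuesEvent'
""").fetchall()
```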
|
|
| 522 |
issue_events = [] # For wanted tracking
|
| 523 |
pr_events = [] # For wanted tracking
|
| 524 |
discussion_events = [] # For discussion tracking
|
| 525 |
+
agent_issue_events = [] # For assistant-assigned issues
|
| 526 |
|
| 527 |
for row in all_results:
|
| 528 |
event_type = row[0]
|
|
|
|
| 530 |
|
| 531 |
if event_type in ('IssuesEvent', 'IssueCommentEvent'):
|
| 532 |
if not is_pr: # It's an issue, not a PR
|
| 533 |
+
# Check if this is an assistant-assigned issue
|
| 534 |
issue_creator = row[11]
|
| 535 |
issue_assignee = row[12]
|
| 536 |
issue_assignees_json = row[13]
|
|
|
|
| 564 |
pass
|
| 565 |
|
| 566 |
elif event_type == 'IssueCommentEvent':
|
| 567 |
+
# Check if commenter is an assistant
|
| 568 |
if commenter in identifier_set:
|
| 569 |
agent_identifier = commenter
|
| 570 |
|
|
|
|
| 584 |
elif event_type == 'DiscussionEvent':
|
| 585 |
discussion_events.append(row)
|
| 586 |
|
| 587 |
+
# Process assistant-assigned issues
|
| 588 |
for row, agent_identifier in agent_issue_events:
|
| 589 |
# Row indices: repo_url=2, issue_url=3, issue_created_at=6, issue_closed_at=7, issue_state_reason=10
|
| 590 |
repo_url = row[2]
|
|
|
|
| 612 |
except:
|
| 613 |
continue
|
| 614 |
|
| 615 |
+
# Deduplicate: only add if we haven't seen this issue for this assistant
|
| 616 |
if full_url in agent_issue_urls[agent_identifier]:
|
| 617 |
continue
|
| 618 |
|
|
|
|
| 753 |
except:
|
| 754 |
continue
|
| 755 |
|
| 756 |
+
# Group by creator (assistant identifier)
|
| 757 |
+
# Only track discussions from our assistant identifiers
|
| 758 |
if discussion_creator not in identifier_set:
|
| 759 |
continue
|
| 760 |
|
|
|
|
| 776 |
'state_reason': discussion_state_reason
|
| 777 |
}
|
| 778 |
|
| 779 |
+
# Group by assistant
|
| 780 |
if discussion_creator not in discussions_by_agent:
|
| 781 |
discussions_by_agent[discussion_creator] = []
|
| 782 |
discussions_by_agent[discussion_creator].append(discussion_meta)
|
| 783 |
|
| 784 |
+
print(f"✓ {len(agent_issue_events)} assistant issues, {len(issue_events)} wanted issues, {len(pr_events)} PRs, {len(discussion_events)} discussions")
|
| 785 |
|
| 786 |
except Exception as e:
|
| 787 |
print(f"\n ✗ Batch {batch_num} error: {str(e)}")
|
|
|
|
| 790 |
# Move to next batch
|
| 791 |
current_date = batch_end + timedelta(days=1)
|
| 792 |
|
| 793 |
+
# Post-processing: Filter issues and assign to assistants
|
| 794 |
print(f"\n Post-processing {len(all_issues)} wanted issues...")
|
| 795 |
|
| 796 |
wanted_open = []
|
|
|
|
        if not linked_prs:
            continue

+        # Check if any linked PR was merged AND created by an assistant
        resolved_by = None
        for pr_url in linked_prs:
            merged_at = pr_merged_at.get(pr_url)
[...]
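The resolution check above credits a wanted issue to an assistant only when some linked PR was both merged and authored by a tracked identifier. A sketch under assumed data shapes (here `pr_merged_at` maps PR URL → merge timestamp and `pr_author` maps PR URL → login; both shapes are assumptions):

```python
# Sketch: decide which assistant (if any) resolved a wanted issue.
def find_resolver(linked_prs, pr_merged_at, pr_author, identifier_set):
    for pr_url in linked_prs:
        merged_at = pr_merged_at.get(pr_url)
        author = pr_author.get(pr_url)
        if merged_at and author in identifier_set:
            return author  # first merged, assistant-authored PR wins
    return None

resolver = find_resolver(
    ["https://github.com/org/repo/pull/42"],
    {"https://github.com/org/repo/pull/42": "2024-06-01T12:00:00Z"},
    {"https://github.com/org/repo/pull/42": "example-bot"},
    {"example-bot"},
)
assert resolver == "example-bot"
```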
        except:
            pass

+    print(f" ✓ Found {sum(len(issues) for issues in agent_issues.values())} assistant-assigned issues across {len(agent_issues)} assistants")
    print(f" ✓ Found {len(wanted_open)} long-standing open wanted issues")
+    print(f" ✓ Found {sum(len(issues) for issues in wanted_resolved.values())} resolved wanted issues across {len(wanted_resolved)} assistants")
+    print(f" ✓ Found {sum(len(discussions) for discussions in discussions_by_agent.values())} discussions across {len(discussions_by_agent)} assistants")

    return {
        'agent_issues': dict(agent_issues),
[...]
def load_agents_from_hf():
    """
+    Load all assistant metadata JSON files from local git repository.
    ALWAYS syncs with remote first to ensure we have the latest bot data.
    """
    # MANDATORY: Sync with remote first to get latest bot data
+    print(f" Syncing bot_metadata repository to get latest assistants...")
    sync_agents_repo()  # Will raise exception if sync fails

+    assistants = []

    # Scan local directory for JSON files
    if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
[...]
    # Walk through the directory to find all JSON files
    files_processed = 0
+    print(f" Loading assistant metadata from {AGENTS_REPO_LOCAL_PATH}...")

    for root, dirs, files in os.walk(AGENTS_REPO_LOCAL_PATH):
        # Skip .git directory
[...]
                with open(file_path, 'r', encoding='utf-8') as f:
                    agent_data = json.load(f)

+                # Only include active assistants
                if agent_data.get('status') != 'active':
                    continue
[...]
                github_identifier = filename.replace('.json', '')
                agent_data['github_identifier'] = github_identifier

+                assistants.append(agent_data)

            except Exception as e:
                print(f" ⚠ Error loading {filename}: {str(e)}")
                continue

+    print(f" ✓ Loaded {len(assistants)} active assistants (from {files_processed} total files)")
+    return assistants
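For illustration only, a toy version of this loader's filter: each JSON file in bot_metadata is keyed by GitHub identifier, and only `status == 'active'` entries survive. The flat directory layout and fields here are assumptions, not the canonical schema:

```python
# Toy version of the active-assistant filter above (assumed schema).
import json
import os

def load_active_assistants(path: str) -> list[dict]:
    assistants = []
    for filename in os.listdir(path):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(path, filename), encoding='utf-8') as f:
            data = json.load(f)
        if data.get('status') != 'active':  # skip inactive assistants
            continue
        data['github_identifier'] = filename[:-len('.json')]
        assistants.append(data)
    return assistants
```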


def calculate_issue_stats_from_metadata(metadata_list):
[...]
    }


+def calculate_monthly_metrics_by_agent(all_metadata_dict, assistants):
+    """Calculate monthly metrics for all assistants for visualization."""
+    identifier_to_name = {assistant.get('github_identifier'): assistant.get('name') for assistant in assistants if assistant.get('github_identifier')}

    if not all_metadata_dict:
+        return {'assistants': [], 'months': [], 'data': {}}

    agent_month_data = defaultdict(lambda: defaultdict(list))
[...]
    agents_list = sorted(list(agent_month_data.keys()))

    return {
+        'assistants': agents_list,
        'months': months,
        'data': result_data
    }
[...]
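The body elided between this function's header and its return presumably buckets each issue into per-assistant, per-month lists; a minimal sketch of that bucketing, assuming ISO-8601 `created_at` fields (field names are assumptions):

```python
# Sketch: bucket issue metadata into "YYYY-MM" keys per assistant,
# matching the {'assistants', 'months', 'data'} shape returned above.
from collections import defaultdict
from datetime import datetime

def bucket_by_month(all_metadata_dict):
    agent_month_data = defaultdict(lambda: defaultdict(list))
    for identifier, issues in all_metadata_dict.items():
        for issue in issues:
            created = issue.get('created_at')
            if not created:
                continue
            dt = datetime.fromisoformat(created.replace('Z', '+00:00'))
            agent_month_data[identifier][dt.strftime('%Y-%m')].append(issue)
    return agent_month_data

buckets = bucket_by_month({'example-bot': [{'created_at': '2024-05-03T10:00:00Z'}]})
assert list(buckets['example-bot'].keys()) == ['2024-05']
```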
    }


+def calculate_monthly_metrics_by_agent_discussions(all_discussions_dict, assistants):
+    """Calculate monthly metrics for discussions for all assistants for visualization."""
+    identifier_to_name = {assistant.get('github_identifier'): assistant.get('name') for assistant in assistants if assistant.get('github_identifier')}

    if not all_discussions_dict:
+        return {'assistants': [], 'months': [], 'data': {}}

    agent_month_data = defaultdict(lambda: defaultdict(list))
[...]
    agents_list = sorted(list(agent_month_data.keys()))

    return {
+        'assistants': agents_list,
        'months': months,
        'data': result_data
    }


+def construct_leaderboard_from_metadata(all_metadata_dict, assistants, wanted_resolved_dict=None, discussions_dict=None):
    """Construct leaderboard from in-memory issue metadata and discussion metadata.

    Args:
+        all_metadata_dict: Dictionary mapping assistant ID to list of issue metadata (assistant-assigned issues)
+        assistants: List of assistant metadata
+        wanted_resolved_dict: Optional dictionary mapping assistant ID to list of resolved wanted issues
+        discussions_dict: Optional dictionary mapping assistant ID to list of discussion metadata
    """
+    if not assistants:
+        print("Error: No assistants found")
        return {}

    if wanted_resolved_dict is None:
[...]
    cache_dict = {}

+    for assistant in assistants:
+        identifier = assistant.get('github_identifier')
+        agent_name = assistant.get('name', 'Unknown')

        bot_metadata = all_metadata_dict.get(identifier, [])
        stats = calculate_issue_stats_from_metadata(bot_metadata)
[...]
        cache_dict[identifier] = {
            'name': agent_name,
+            'website': assistant.get('website', 'N/A'),
            'github_identifier': identifier,
            **stats,
            'resolved_wanted_issues': resolved_wanted,
[...]
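The `**stats` merged into each cache entry come from calculate_issue_stats_from_metadata; a sketch of the kind of rate it plausibly computes, where resolved rate = completed / closed (the `state_reason == 'completed'` convention follows GitHub's API, but the exact fields in the real helper are assumptions):

```python
# Sketch of an issue-stats helper consistent with the leaderboard columns.
def issue_stats(metadata_list):
    closed = [m for m in metadata_list if m.get('closed_at')]
    resolved = [m for m in closed if m.get('state_reason') == 'completed']
    rate = round(100 * len(resolved) / len(closed), 2) if closed else 0.0
    return {
        'total_issues': len(metadata_list),
        'issue_resolved_rate': rate,
    }

assert issue_stats([{'closed_at': 'x', 'state_reason': 'completed'}]) == {
    'total_issues': 1, 'issue_resolved_rate': 100.0,
}
```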
    wanted_issues = []

    if discussion_monthly_metrics is None:
+        discussion_monthly_metrics = {'assistants': [], 'months': [], 'data': {}}

    combined_data = {
        'metadata': {
[...]
def mine_all_agents():
    """
+    Mine issue metadata for all assistants using STREAMING batch processing.
    Downloads GHArchive data, then uses BATCH-based DuckDB queries.
    """
    print(f"\n[1/4] Downloading GHArchive data...")
[...]
    if not download_all_gharchive_data():
        print("Warning: Download had errors, continuing with available data...")

+    print(f"\n[2/4] Loading assistant metadata...")

+    assistants = load_agents_from_hf()
+    if not assistants:
+        print("Error: No assistants found")
        return

+    identifiers = [assistant['github_identifier'] for assistant in assistants if assistant.get('github_identifier')]
    if not identifiers:
+        print("Error: No valid assistant identifiers found")
        return

+    print(f"\n[3/4] Mining issue metadata ({len(identifiers)} assistants, {LEADERBOARD_TIME_FRAME_DAYS} days)...")

    try:
        conn = get_duckdb_connection()
[...]
    try:
        leaderboard_dict = construct_leaderboard_from_metadata(
+            agent_issues, assistants, wanted_resolved, agent_discussions
        )
+        monthly_metrics = calculate_monthly_metrics_by_agent(agent_issues, assistants)
        discussion_monthly_metrics = calculate_monthly_metrics_by_agent_discussions(
+            agent_discussions, assistants
        )
        save_leaderboard_data_to_hf(
            leaderboard_dict, monthly_metrics, wanted_open, discussion_monthly_metrics
[...]
        mine_all_agents,
        trigger=trigger,
        id='mine_all_agents',
+        name='Mine GHArchive data for all assistants',
        replace_existing=True
    )
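The registration above is APScheduler's `add_job`; a hedged sketch of the surrounding wiring. The daily-at-midnight cron schedule and the `BackgroundScheduler` choice are assumptions here — app.py builds `trigger` elsewhere:

```python
# Sketch of the APScheduler wiring implied by the add_job call above.
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger

def mine_all_agents():
    print("mining...")  # stand-in for the real pipeline

scheduler = BackgroundScheduler()
trigger = CronTrigger(hour=0, minute=0)  # assumed schedule
scheduler.add_job(
    mine_all_agents,
    trigger=trigger,
    id='mine_all_agents',
    name='Mine GHArchive data for all assistants',
    replace_existing=True,  # re-registering on restart overwrites the old job
)
scheduler.start()
```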