File size: 7,939 Bytes
b619545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import streamlit as st
import time
import json
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from utils.ui import set_page_style, display_metric
from utils.auth import check_hf_token
from utils.training import simulate_training_progress

# Configure the browser tab (title and icon) and use the wide page layout.
# The icon literal was mojibake (UTF-8 bytes decoded with a legacy codepage);
# it is restored to the robot-face emoji.
st.set_page_config(
    page_title="Training Monitor - Gemma Fine-tuning",
    page_icon="🤖",
    layout="wide",
)

# Apply the shared styling helper imported from utils.ui.
set_page_style()

# Sidebar: branding plus Hugging Face token entry.
# Defines `hf_token` and `auth_status`, which the rest of the page reads.
with st.sidebar:
    st.title("🤖 Gemma Fine-tuning")
    st.image(
        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/gemma-banner.png",
        use_column_width=True,
    )

    # Authentication section
    st.subheader("🔑 Authentication")
    hf_token = st.text_input(
        "Hugging Face API Token",
        type="password",
        help="Enter your Hugging Face write token to enable model fine-tuning",
    )
    # Only call the validator when something was typed; an empty field is
    # treated as "not authenticated" without showing an error.
    auth_status = check_hf_token(hf_token) if hf_token else False

    if auth_status:
        st.success("Authenticated successfully!")
    elif hf_token:
        # A token was supplied but the validator rejected it.
        st.error("Invalid token. Please check and try again.")

    st.divider()
    st.caption("A simple UI for fine-tuning Gemma models")

# Main content
st.title("📊 Training Monitor")

# Gate the rest of the page on a validated token: st.stop() ends this script
# run here, so nothing below is rendered for unauthenticated visitors.
if not hf_token or not auth_status:
    st.warning("Please authenticate with your Hugging Face token in the sidebar first")
    st.stop()

# Check if training was started: a configured job is signalled by the
# "model_repo" key in session state.
if "model_repo" not in st.session_state:
    st.warning("No active training jobs found")
    st.page_link("pages/02_Model_Configuration.py", label="Go to Model Configuration", icon="⚙️")

    # For testing purposes, allow manual entry. Surrounding whitespace is
    # stripped so a blank-looking entry is not accepted as a repository name.
    manual_repo = st.text_input("Or enter model repository name manually:").strip()
    if not manual_repo:
        # Nothing to monitor yet; end this script run here.
        st.stop()

    st.session_state["model_repo"] = manual_repo
    st.session_state["model_version"] = "google/gemma-2b"  # Default

# Training information: left column shows what is being trained, right column
# holds the launch control or the current status.
st.header("Training Information")

col1, col2 = st.columns(2)

with col1:
    st.subheader("Model Repository")
    st.info(st.session_state["model_repo"])

    st.subheader("Base Model")
    st.info(st.session_state.get("model_version", "google/gemma-2b"))

with col2:
    # For demo purposes, a button starts the simulated training.
    if "training_started" not in st.session_state:
        st.subheader("Start Training")
        if st.button("Launch Training Job", type="primary"):
            st.session_state["training_started"] = True
            # Rerun immediately so the page re-renders in the "started" state.
            # st.rerun() replaces the deprecated st.experimental_rerun().
            st.rerun()
    else:
        st.subheader("Status")
        st.success("Training in Progress")

        # Simulated cancel button: it only shows a notice and does not change
        # any session state.
        if st.button("Cancel Training Job", type="secondary"):
            st.warning("This is a simulation - in a real environment, this would cancel the training job")

# If training has started, show the progress: headline metrics, metric charts,
# simulated logs, and either the "next step" link or an auto-refresh.


def _metric_line_chart(x, y, *, name, color, title, y_title):
    """Return a Plotly lines+markers figure of one metric plotted against epoch."""
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=x,
        y=y,
        mode='lines+markers',
        name=name,
        line=dict(color=color, width=3),
    ))
    fig.update_layout(
        title=title,
        xaxis_title="Epoch",
        yaxis_title=y_title,
        template="plotly_white",
        height=400,
    )
    return fig


if st.session_state.get("training_started", False):
    st.header("Training Progress")

    # Create the progress bar and the metric columns first so they keep their
    # position on the page above the charts and logs.
    progress_bar = st.progress(0)
    col1, col2, col3, col4 = st.columns(4)

    # Get training progress (simulated for demo)
    progress_data = simulate_training_progress()
    current_epoch = progress_data["current_epoch"]
    total_epochs = progress_data["total_epochs"]
    is_complete = progress_data["status"] == "completed"

    progress_bar.progress(progress_data["progress"])

    with col1:
        # Epochs are shown 1-based. Clamp to total_epochs, matching the clamp
        # the log loop below applies, so the display cannot read "N+1/N".
        shown_epoch = min(current_epoch + 1, total_epochs)
        display_metric("Epoch", f"{shown_epoch}/{total_epochs}")

    with col2:
        display_metric("Loss", f"{progress_data['loss']:.4f}")

    with col3:
        display_metric("Learning Rate", f"{progress_data['learning_rate']:.1e}")

    with col4:
        display_metric("Status", "Complete" if is_complete else "Running")

    # Training history visualization
    st.subheader("Training Metrics")

    # History lives in session state so it survives the periodic reruns.
    if "training_history" not in st.session_state:
        st.session_state.training_history = []

    # Record one sample per script run while training is running; once it is
    # completed, only record if nothing has been recorded yet.
    if not is_complete or not st.session_state.training_history:
        st.session_state.training_history.append({
            "epoch": current_epoch,
            "loss": progress_data["loss"],
            "learning_rate": progress_data["learning_rate"],
            "timestamp": time.time(),
        })

    history_df = pd.DataFrame(st.session_state.training_history)

    # A line chart needs at least two samples to be meaningful.
    if len(history_df) > 1:
        loss_tab, lr_tab = st.tabs(["Loss Curve", "Learning Rate"])

        with loss_tab:
            st.plotly_chart(
                _metric_line_chart(
                    history_df["epoch"],
                    history_df["loss"],
                    name='Training Loss',
                    color='#FF4B4B',
                    title="Training Loss Over Time",
                    y_title="Loss",
                ),
                use_container_width=True,
            )

        with lr_tab:
            st.plotly_chart(
                _metric_line_chart(
                    history_df["epoch"],
                    history_df["learning_rate"],
                    name='Learning Rate',
                    color='#0068C9',
                    title="Learning Rate Schedule",
                    y_title="Learning Rate",
                ),
                use_container_width=True,
            )
    else:
        st.info("Training metrics will appear here once training progresses")

    # Training logs (simulated)
    st.subheader("Training Logs")

    # Take a single reference time so every log line in this render is
    # computed from the same instant.
    log_time_format = '%Y-%m-%d %H:%M:%S'
    now = pd.Timestamp.now()
    now_text = now.strftime(log_time_format)

    log_lines = [
        f"[{now_text}] Initialized training job",
        f"[{now_text}] Loading model: {st.session_state.get('model_version', 'google/gemma-2b')}",
        f"[{now_text}] Preparing LoRA configuration",
    ]

    # Add epoch logs based on progress. Epochs are back-dated one minute
    # apart, with the current epoch starting "now".
    for epoch in range(min(current_epoch + 1, total_epochs)):
        started_at = now - pd.Timedelta(seconds=(current_epoch - epoch) * 60)
        log_lines.append(
            f"[{started_at.strftime(log_time_format)}] Epoch {epoch+1}/{total_epochs} started"
        )

        if epoch < current_epoch:
            # Completed epochs get a completion line half a minute after
            # their start, with a synthetic loss that decays from 2.5 by 0.5
            # per epoch and is floored at 0.5.
            finished_at = now - pd.Timedelta(seconds=(current_epoch - epoch - 0.5) * 60)
            sim_loss = max(2.5 - (epoch * 0.5), 0.5)
            log_lines.append(
                f"[{finished_at.strftime(log_time_format)}] Epoch {epoch+1} completed: loss={sim_loss:.4f}"
            )

    st.code("\n".join(log_lines))

    # Next steps (only shown when training is complete)
    if is_complete:
        st.success("Training completed successfully!")
        st.page_link("pages/04_Evaluation.py", label="Next: Evaluate Model", icon="🔍")
    else:
        # Auto-refresh for monitoring: wait two seconds, then rerun the
        # script. st.rerun() replaces the deprecated st.experimental_rerun().
        time.sleep(2)
        st.rerun()