import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os
import datetime
import time
# --- Configuration ---
MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
MAX_NEW_TOKENS = 512
CPU_THREAD_COUNT = 4  # Adjust if needed
# --- Optional: Set CPU Threads ---
# torch.set_num_threads(CPU_THREAD_COUNT)
# os.environ["OMP_NUM_THREADS"] = str(CPU_THREAD_COUNT)
# os.environ["MKL_NUM_THREADS"] = str(CPU_THREAD_COUNT)
print("--- Environment Setup ---")
print(f"PyTorch version: {torch.__version__}")
print(f"Running on device: cpu")
print(f"Torch Threads: {torch.get_num_threads()}")
# --- Model and Tokenizer Loading ---
print(f"--- Loading Model: {MODEL_ID} ---")
print("This might take a few minutes, especially on the first launch...")
model = None
tokenizer = None
load_successful = False
stop_token_ids_list = [] # Initialize stop_token_ids_list
try:
    start_load_time = time.time()
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,
        device_map="cpu",
        # force_download=True # Keep commented unless cache issues reappear
    )
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        # force_download=True # Keep commented
    )
    model.eval()
    load_time = time.time() - start_load_time
    print(f"--- Model and Tokenizer Loaded Successfully on CPU in {load_time:.2f} seconds ---")
    load_successful = True

    # --- Stop Token Configuration ---
    stop_token_strings = ["<|endofturn|>", "<|stop|>"]
    temp_stop_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]

    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in temp_stop_ids:
        temp_stop_ids.append(tokenizer.eos_token_id)
    elif tokenizer.eos_token_id is None:
        print("Warning: tokenizer.eos_token_id is None. Cannot add to stop tokens.")

    stop_token_ids_list = [tid for tid in temp_stop_ids if tid is not None]  # Assign to the global scope variable

    if not stop_token_ids_list:
        print("Warning: Could not find any stop token IDs. Using default EOS if available, otherwise generation might not stop correctly.")
        if tokenizer.eos_token_id is not None:
            stop_token_ids_list = [tokenizer.eos_token_id]
        else:
            print("Error: No stop tokens found, including default EOS. Generation may run indefinitely.")
            # Consider raising an error or setting a default if this is critical

    print(f"Using Stop Token IDs: {stop_token_ids_list}")

except Exception as e:
    print(f"!!! Error loading model: {e}")
    if 'model' in locals() and model is not None: del model
    if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
    gc.collect()
    # Raise Gradio error to display in the Space UI if loading fails
    raise gr.Error(f"Failed to load the model {MODEL_ID}. Cannot start the application. Error: {e}")

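# Optional safeguard (an assumption, not part of the original app): some tokenizers do not
# define a dedicated pad token, which makes generate() fall back to the EOS token with a
# warning. The guard below is a no-op when a pad token already exists.
if tokenizer is not None and tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
    tokenizer.pad_token = tokenizer.eos_token
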
# --- System Prompt Definition ---
def get_system_prompt():
    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
    return (
        f"- AI 언어모델의 이름은 \"CLOVA X\" 이며 네이버에서 만들었다.\n"
        # f"- 오늘은 {current_date}이다.\n"  # Uncomment if needed
        f"- 사용자의 질문에 대해 친절하고 자세하게 한국어로 답변해야 한다."
    )

# --- Warm-up Function ---
def warmup_model():
    if not load_successful or model is None or tokenizer is None:
        print("Skipping warmup: Model not loaded successfully.")
        return

    print("--- Starting Model Warm-up ---")
    try:
        start_warmup_time = time.time()
        warmup_message = "안녕하세요"
        system_prompt = get_system_prompt()
        warmup_chat = [
            {"role": "tool_list", "content": ""},
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": warmup_message}
        ]

        inputs = tokenizer.apply_chat_template(
            warmup_chat,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to("cpu")

        # Check if stop_token_ids_list is empty and handle appropriately
        gen_kwargs = {
            "max_new_tokens": 10,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": False
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Warmup Warning: No stop tokens defined for generation.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        # Optional: Decode warmup response for verification
        # response = tokenizer.decode(output_ids[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
        # print(f"Warm-up response (decoded): {response}")

        del inputs
        del output_ids
        gc.collect()

        warmup_time = time.time() - start_warmup_time
        print(f"--- Model Warm-up Completed in {warmup_time:.2f} seconds ---")
    except Exception as e:
        print(f"!!! Error during model warm-up: {e}")
    finally:
        gc.collect()

# --- Inference Function ---
def predict(message, history):
    """
    Generates a response using HyperCLOVAX.
    Assumes 'history' is in the Gradio 'messages' format: List[Dict].
    """
    if model is None or tokenizer is None:
        return "오류: 모델이 로드되지 않았습니다."

    system_prompt = get_system_prompt()

    # Start with the system prompt
    chat_history_formatted = [
        {"role": "tool_list", "content": ""},  # As required by the model card
        {"role": "system", "content": system_prompt}
    ]

    # Append history (list of {'role': 'user'/'assistant', 'content': '...'})
    if isinstance(history, list):
        for turn in history:
            # Validate turn format
            if isinstance(turn, dict) and "role" in turn and "content" in turn:
                chat_history_formatted.append(turn)
            # Handle the older tuple format if it still appears
            elif isinstance(turn, (list, tuple)) and len(turn) == 2:
                print(f"Warning: Received history item in tuple format: {turn}. Converting to messages format.")
                chat_history_formatted.append({"role": "user", "content": turn[0]})
                if turn[1]:  # Ensure the assistant message exists
                    chat_history_formatted.append({"role": "assistant", "content": turn[1]})
            else:
                print(f"Warning: Skipping unexpected history format item: {turn}")

    # Append the latest user message
    chat_history_formatted.append({"role": "user", "content": message})

    inputs = None
    output_ids = None

    try:
        inputs = tokenizer.apply_chat_template(
            chat_history_formatted,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to("cpu")
        input_length = inputs['input_ids'].shape[1]
        print(f"\nInput tokens: {input_length}")
    except Exception as e:
        print(f"!!! Error applying chat template: {e}")
        return f"오류: 입력 형식을 처리하는 중 문제가 발생했습니다. ({e})"

    try:
        print("Generating response...")
        generation_start_time = time.time()

        # Prepare generation arguments, handling an empty stop_token_ids_list
        gen_kwargs = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Generation Warning: No stop tokens defined.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        generation_time = time.time() - generation_start_time
        print(f"Generation complete in {generation_time:.2f} seconds.")
    except Exception as e:
        print(f"!!! Error during model generation: {e}")
        if inputs is not None: del inputs
        if output_ids is not None: del output_ids
        gc.collect()
        return f"오류: 응답을 생성하는 중 문제가 발생했습니다. ({e})"

    # Decode the response
    response = "오류: 응답 생성에 실패했습니다."
    if output_ids is not None:
        try:
            new_tokens = output_ids[0, input_length:]
            response = tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(f"Output tokens: {len(new_tokens)}")
            del new_tokens
        except Exception as e:
            print(f"!!! Error decoding response: {e}")
            response = "오류: 응답을 디코딩하는 중 문제가 발생했습니다."

    # Clean up memory
    if inputs is not None: del inputs
    if output_ids is not None: del output_ids
    gc.collect()
    print("Memory cleaned.")

    return response

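# A possible extension (sketch only, not part of the original Space): gr.ChatInterface also
# accepts a generator function, so responses could be streamed token by token using
# transformers' TextIteratorStreamer and a background generation thread, roughly like this:
#
#   from threading import Thread
#   from transformers import TextIteratorStreamer
#
#   def predict_stream(message, history):
#       ...build chat_history_formatted, inputs, and gen_kwargs as in predict()...
#       streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#       Thread(target=model.generate, kwargs={**inputs, **gen_kwargs, "streamer": streamer}).start()
#       partial = ""
#       for chunk in streamer:
#           partial += chunk
#           yield partial
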
# --- Gradio Interface Setup ---
print("--- Setting up Gradio Interface ---")
# No need to create a separate Chatbot component beforehand
# chatbot_component = gr.Chatbot(...) # REMOVED
examples = [
    ["네이버 클로바X는 무엇인가요?"],
    ["슈뢰딩거 방정식과 양자역학의 관계를 설명해주세요."],
    ["딥러닝 모델 학습 과정을 단계별로 알려줘."],
    ["제주도 여행 계획을 세우고 있는데, 3박 4일 추천 코스 좀 짜줄래?"],
]

# Let ChatInterface manage its own internal Chatbot component
# Remove the chatbot=... argument
demo = gr.ChatInterface(
    fn=predict,  # Link the prediction function
    # chatbot=chatbot_component, # REMOVED
    title="🇰🇷 네이버 HyperCLOVA X SEED (0.5B) 데모",
    description=(
        f"**모델:** {MODEL_ID}\n"
        f"**환경:** Hugging Face 무료 CPU (16GB RAM)\n"
        f"**주의:** CPU에서 실행되므로 응답 생성에 다소 시간이 걸릴 수 있습니다. (웜업 완료)\n"
        f"최대 생성 토큰 수는 {MAX_NEW_TOKENS}개로 제한됩니다."
    ),
    examples=examples,
    cache_examples=False,
    theme="soft",
)

# --- Application Launch ---
if __name__ == "__main__":
    if load_successful:
        warmup_model()
    else:
        print("Skipping warm-up because model loading failed.")

    print("--- Launching Gradio App ---")
    demo.queue().launch(
        # share=True  # Uncomment for a public link
        # server_name="0.0.0.0"  # Uncomment for local network access
    )
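
# Local usage note (dependency list is an assumption, not stated in the original file):
# running this script outside the Hugging Face Space needs roughly
#   pip install gradio torch transformers accelerate
# (the device_map argument in from_pretrained relies on accelerate), then:
#   python app.py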