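"""Gradio chat demo for naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B.

Loads the model on CPU, warms it up once at startup, and serves a
gr.ChatInterface that answers in Korean using a fixed system prompt.
"""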
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os
import datetime
import time

# --- Configuration ---
MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
MAX_NEW_TOKENS = 512
CPU_THREAD_COUNT = 4  # adjust if needed

# --- Optional: Set CPU Threads ---
# torch.set_num_threads(CPU_THREAD_COUNT)
# os.environ["OMP_NUM_THREADS"] = str(CPU_THREAD_COUNT)
# os.environ["MKL_NUM_THREADS"] = str(CPU_THREAD_COUNT)

| print("--- Environment Setup ---") | |
| print(f"PyTorch version: {torch.__version__}") | |
| print(f"Running on device: cpu") | |
| print(f"Torch Threads: {torch.get_num_threads()}") | |
# --- Model and Tokenizer Loading ---
print(f"--- Loading Model: {MODEL_ID} ---")
print("This might take a few minutes, especially on the first launch...")

model = None
tokenizer = None
load_successful = False
stop_token_ids_list = []  # Initialized here so it exists even if loading fails

try:
    start_load_time = time.time()
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,
        device_map="cpu",
        # force_download=True  # Keep commented unless cache issues reappear
    )
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        # force_download=True  # Keep commented
    )
    model.eval()
    load_time = time.time() - start_load_time
    print(f"--- Model and Tokenizer Loaded Successfully on CPU in {load_time:.2f} seconds ---")
    load_successful = True

    # --- Stop Token Configuration ---
    stop_token_strings = ["<|endofturn|>", "<|stop|>"]
    temp_stop_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]

    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in temp_stop_ids:
        temp_stop_ids.append(tokenizer.eos_token_id)
    elif tokenizer.eos_token_id is None:
        print("Warning: tokenizer.eos_token_id is None. Cannot add to stop tokens.")

    stop_token_ids_list = [tid for tid in temp_stop_ids if tid is not None]  # Assign to the module-scope variable

    if not stop_token_ids_list:
        print("Warning: Could not find any stop token IDs. Using default EOS if available; otherwise generation might not stop correctly.")
        if tokenizer.eos_token_id is not None:
            stop_token_ids_list = [tokenizer.eos_token_id]
        else:
            print("Error: No stop tokens found, including default EOS. Generation may run indefinitely.")
            # Consider raising an error or setting a default if this is critical

    print(f"Using Stop Token IDs: {stop_token_ids_list}")
except Exception as e:
    print(f"!!! Error loading model: {e}")
    if 'model' in locals() and model is not None: del model
    if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
    gc.collect()
    # Raise a Gradio error to display the failure in the Space UI
    raise gr.Error(f"Failed to load the model {MODEL_ID}. Cannot start the application. Error: {e}")

# --- System Prompt Definition ---
def get_system_prompt():
    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
    return (
        f"- AI 언어모델의 이름은 \"CLOVA X\" 이며 네이버에서 만들었다.\n"
        # f"- 오늘은 {current_date}이다.\n"  # Uncomment if needed
        f"- 사용자의 질문에 대해 친절하고 자세하게 한국어로 답변해야 한다."
    )

# --- Warm-up Function ---
def warmup_model():
    if not load_successful or model is None or tokenizer is None:
        print("Skipping warmup: Model not loaded successfully.")
        return

    print("--- Starting Model Warm-up ---")
    try:
        start_warmup_time = time.time()
        warmup_message = "안녕하세요"
        system_prompt = get_system_prompt()
        warmup_chat = [
            {"role": "tool_list", "content": ""},
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": warmup_message}
        ]
        inputs = tokenizer.apply_chat_template(
            warmup_chat,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to("cpu")

        # Check if stop_token_ids_list is empty and handle appropriately
        gen_kwargs = {
            "max_new_tokens": 10,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": False
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Warmup Warning: No stop tokens defined for generation.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        # Optional: Decode warmup response for verification
        # response = tokenizer.decode(output_ids[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
        # print(f"Warm-up response (decoded): {response}")

        del inputs
        del output_ids
        gc.collect()
        warmup_time = time.time() - start_warmup_time
        print(f"--- Model Warm-up Completed in {warmup_time:.2f} seconds ---")
    except Exception as e:
        print(f"!!! Error during model warm-up: {e}")
    finally:
        gc.collect()

# --- Inference Function ---
def predict(message, history):
    """
    Generates a response using HyperCLOVA X.
    Assumes 'history' is in the Gradio 'messages' format: List[Dict].
    """
    if model is None or tokenizer is None:
        return "오류: 모델이 로드되지 않았습니다."

    system_prompt = get_system_prompt()

    # Start with the system prompt
    chat_history_formatted = [
        {"role": "tool_list", "content": ""},  # As required by the model card
        {"role": "system", "content": system_prompt}
    ]

    # Append history (list of {'role': 'user'/'assistant', 'content': '...'})
    if isinstance(history, list):  # Check if history is a list
        for turn in history:
            # Validate turn format
            if isinstance(turn, dict) and "role" in turn and "content" in turn:
                chat_history_formatted.append(turn)
            # Handle the older tuple format if it appears (less likely now)
            elif isinstance(turn, (list, tuple)) and len(turn) == 2:
                print(f"Warning: Received history item in tuple format: {turn}. Converting to messages format.")
                chat_history_formatted.append({"role": "user", "content": turn[0]})
                if turn[1]:  # Ensure the assistant message exists
                    chat_history_formatted.append({"role": "assistant", "content": turn[1]})
            else:
                print(f"Warning: Skipping unexpected history format item: {turn}")

    # Append the latest user message
    chat_history_formatted.append({"role": "user", "content": message})

    inputs = None
    output_ids = None
    try:
        inputs = tokenizer.apply_chat_template(
            chat_history_formatted,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to("cpu")
        input_length = inputs['input_ids'].shape[1]
        print(f"\nInput tokens: {input_length}")
    except Exception as e:
        print(f"!!! Error applying chat template: {e}")
        return f"오류: 입력 형식을 처리하는 중 문제가 발생했습니다. ({e})"

    try:
        print("Generating response...")
        generation_start_time = time.time()

        # Prepare generation arguments, handling an empty stop_token_ids_list
        gen_kwargs = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Generation Warning: No stop tokens defined.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        generation_time = time.time() - generation_start_time
        print(f"Generation complete in {generation_time:.2f} seconds.")
    except Exception as e:
        print(f"!!! Error during model generation: {e}")
        if inputs is not None: del inputs
        if output_ids is not None: del output_ids
        gc.collect()
        return f"오류: 응답을 생성하는 중 문제가 발생했습니다. ({e})"

    # Decode the response
    response = "오류: 응답 생성에 실패했습니다."
    if output_ids is not None:
        try:
            new_tokens = output_ids[0, input_length:]
            response = tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(f"Output tokens: {len(new_tokens)}")
            del new_tokens
        except Exception as e:
            print(f"!!! Error decoding response: {e}")
            response = "오류: 응답을 디코딩하는 중 문제가 발생했습니다."

    # Clean up memory
    if inputs is not None: del inputs
    if output_ids is not None: del output_ids
    gc.collect()
    print("Memory cleaned.")

    return response

# --- Gradio Interface Setup ---
print("--- Setting up Gradio Interface ---")

# No separate gr.Chatbot component is created here;
# gr.ChatInterface manages its own internal Chatbot.
examples = [
    ["네이버 클로바X는 무엇인가요?"],
    ["슈뢰딩거 방정식과 양자역학의 관계를 설명해주세요."],
    ["딥러닝 모델 학습 과정을 단계별로 알려줘."],
    ["제주도 여행 계획을 세우고 있는데, 3박 4일 추천 코스 좀 짜줄래?"],
]

demo = gr.ChatInterface(
    fn=predict,  # Link the prediction function
    title="🇰🇷 네이버 HyperCLOVA X SEED (0.5B) 데모",
    description=(
        f"**모델:** {MODEL_ID}\n"
        f"**환경:** Hugging Face 무료 CPU (16GB RAM)\n"
        f"**주의:** CPU에서 실행되므로 응답 생성에 다소 시간이 걸릴 수 있습니다. (웜업 완료)\n"
        f"최대 생성 토큰 수는 {MAX_NEW_TOKENS}개로 제한됩니다."
    ),
    examples=examples,
    cache_examples=False,
    theme="soft",
)

# --- Application Launch ---
if __name__ == "__main__":
    if load_successful:
        warmup_model()
    else:
        print("Skipping warm-up because model loading failed.")

    print("--- Launching Gradio App ---")
    demo.queue().launch(
        # share=True  # Uncomment for a public link
        # server_name="0.0.0.0"  # Uncomment for local network access
    )
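# Note (assumption, not part of the original script): when deployed as a Space,
# the environment is expected to provide gradio, torch, transformers, and
# accelerate (the latter is required by transformers when device_map is passed
# to from_pretrained), typically via requirements.txt.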