Upload InternVL2 implementation

app_internvl2.py (+115 -37)
@@ -145,17 +145,29 @@ def load_internvl2_model():
 
     print("Loading InternVL2 model...")
     try:
+        # Force synchronous execution for everything
+        import os
+        # Set environment variables to force synchronous behavior
+        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+        # Disable asyncio in lmdeploy
+        os.environ["LMDEPLOY_DISABLE_ASYNC"] = "1"
+
         # Configure for AWQ quantized model
-        backend_config = TurbomindEngineConfig(
+        backend_config = TurbomindEngineConfig(
+            model_format='awq',
+            session_len=2048  # Explicitly set session length
+        )
 
-        # Create
+        # Create a synchronous pipeline to avoid asyncio issues
+        # Explicitly set all parameters that might default to async behavior
         internvl2_pipeline = pipeline(
             MODEL_ID,
             backend_config=backend_config,
             log_level='INFO',
             model_name_or_path=None,
             backend_name="turbomind",
-            stream=False  # Important: disable streaming
+            stream=False,  # Important: disable streaming
+            tensor_parallel=1,  # Use single GPU to avoid distributed processing
         )
 
        print("InternVL2 model loaded successfully!")
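
For context, the engine options introduced above correspond to a minimal synchronous lmdeploy call roughly like the sketch below. This is not the committed code: the model ID, image path, and prompt are placeholder assumptions (the diff only references MODEL_ID and the caller's inputs), and the extra keyword arguments passed in the commit (model_name_or_path, backend_name, stream, tensor_parallel) are omitted.

from lmdeploy import pipeline, TurbomindEngineConfig
from PIL import Image

MODEL_ID = "OpenGVLab/InternVL2-8B-AWQ"  # assumed; the diff only shows MODEL_ID

# Same engine settings as the commit: AWQ weights, fixed 2048-token session.
backend_config = TurbomindEngineConfig(model_format='awq', session_len=2048)
pipe = pipeline(MODEL_ID, backend_config=backend_config, log_level='INFO')

# lmdeploy vision-language pipelines accept a (prompt, image) tuple per query.
image = Image.open("example.jpg").convert('RGB')  # placeholder image path
response = pipe(('Describe this image in detail.', image))
print(response.text)
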
@@ -196,46 +208,112 @@ def analyze_image(image, prompt):
         else:
             # If somehow it's already a PIL Image
             image_pil = image.convert('RGB')
+
+        # We'll use a completely different approach - multiprocessing
+        # This runs the model in a separate process, avoiding any event loop conflicts
+        import multiprocessing as mp
 
-        import queue
-        result_queue = queue.Queue()
-        def run_inference_in_thread():
+        # Define a function to run in a separate process
+        def run_in_process(prompt, image_path, result_queue):
             try:
+                # Set environment variables in the subprocess
+                import os
+                os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+                os.environ["LMDEPLOY_DISABLE_ASYNC"] = "1"
+
+                # Import libraries inside the process
+                from lmdeploy import pipeline, TurbomindEngineConfig
+
+                # Save the image to a temporary file to pass between processes
+                import tempfile
+                import torch
+
+                # Check GPU in subprocess
+                print(f"Subprocess GPU available: {torch.cuda.is_available()}")
+
+                # Configure for AWQ quantized model
+                backend_config = TurbomindEngineConfig(
+                    model_format='awq',
+                    session_len=2048
+                )
+
+                # Create new pipeline in the subprocess
+                model_pipeline = pipeline(
+                    MODEL_ID,
+                    backend_config=backend_config,
+                    log_level='INFO',
+                    model_name_or_path=None,
+                    backend_name="turbomind",
+                    stream=False,
+                    tensor_parallel=1,
+                )
+
+                # Load the image in the subprocess
+                from PIL import Image
+                image = Image.open(image_path).convert('RGB')
+
+                # Run inference
+                response = model_pipeline((prompt, image))
+                result = response.text if hasattr(response, "text") else str(response)
+
+                # Put the result in the queue
+                result_queue.put(("success", result))
+
             except Exception as e:
+                import traceback
+                error_msg = f"Error in subprocess: {str(e)}\n{traceback.format_exc()}"
+                result_queue.put(("error", error_msg))
 
+        # Create a temporary file for the image
+        import tempfile
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
+            temp_path = temp_file.name
+            image_pil.save(temp_path)
 
+        try:
+            # Create a process-safe queue
+            result_queue = mp.Queue()
+
+            # Start the process
+            print("Starting model inference in a separate process")
+            process = mp.Process(
+                target=run_in_process,
+                args=(prompt, temp_path, result_queue)
+            )
+
+            # Make it a daemon so it terminates when the main process ends
+            process.daemon = True
+            process.start()
+
+            # Wait for the process to complete (with timeout)
+            process.join(timeout=180)  # 3 minute timeout
+
+            # Delete the temporary file
+            try:
+                os.unlink(temp_path)
+            except:
+                pass
+
+            if process.is_alive():
+                # Terminate the process if it's still running after timeout
+                process.terminate()
+                return "Model inference timed out after 180 seconds. The model might be too slow on this hardware."
+
+            # Get the result from the queue (non-blocking to avoid hanging)
+            if not result_queue.empty():
+                status, result = result_queue.get(block=False)
+                if status == "error":
+                    return f"Error in model inference: {result}"
+                else:
+                    elapsed_time = time.time() - start_time
+                    return result
             else:
+                return "Unknown error: Model inference process completed but did not produce a result"
+
+        except Exception as e:
+            print(f"Error in multiprocessing: {str(e)}")
+            return f"Error setting up multiprocessing: {str(e)}"
+
     except Exception as e:
         print(f"Error in image analysis: {str(e)}")
         # Try to clean up memory in case of error
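
The core pattern this hunk introduces can be exercised on its own with the standard library: run the work in a short-lived child process, pass the outcome back through a multiprocessing.Queue as a ("success"/"error", payload) tuple, and enforce a wall-clock timeout. The sketch below is a minimal stand-alone version, not the committed code; the worker is a dummy in place of the model call, while the 180-second default timeout and the non-blocking queue read mirror the commit.

import multiprocessing as mp

def _worker(payload, result_queue):
    # Stand-in for the real pipeline call that runs in the child process.
    try:
        result_queue.put(("success", f"processed: {payload}"))
    except Exception as e:
        result_queue.put(("error", str(e)))

def run_with_timeout(payload, timeout=180):
    result_queue = mp.Queue()
    process = mp.Process(target=_worker, args=(payload, result_queue))
    process.daemon = True          # child exits when the parent does
    process.start()
    process.join(timeout=timeout)  # wait, but never longer than the timeout

    if process.is_alive():         # still running after the timeout: kill it
        process.terminate()
        return ("error", f"timed out after {timeout} seconds")

    if not result_queue.empty():   # non-blocking read so the parent never hangs
        return result_queue.get(block=False)
    return ("error", "worker exited without producing a result")

if __name__ == "__main__":
    print(run_with_timeout("example prompt", timeout=5))

Checking result_queue.empty() before a non-blocking get() is what keeps the parent responsive even when the child dies before putting anything on the queue, at the cost of reporting that case as an unknown error.
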
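The image handoff in the same hunk is also worth isolating: the parent saves the already-decoded PIL image to a named temporary file, passes only the path to the child, and the child re-opens it. A minimal sketch of that round trip, with the cleanup the commit performs afterwards; the helper names and the blank test image are illustrative, not from the source.

import os
import tempfile
from PIL import Image

def save_for_subprocess(image_pil):
    # Parent side: persist the decoded image so only a path crosses the process boundary.
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
        temp_path = temp_file.name
        image_pil.save(temp_path)
    return temp_path

def load_in_subprocess(image_path):
    # Child side: re-decode the image from the shared path.
    return Image.open(image_path).convert('RGB')

path = save_for_subprocess(Image.new('RGB', (64, 64)))  # placeholder image
reloaded = load_in_subprocess(path)
os.unlink(path)  # delete the temporary file once inference is done
print(reloaded.size)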