Upload InternVL2 implementation
Browse files
- app_internvl2.py +59 -2
app_internvl2.py
CHANGED
|
@@ -197,9 +197,61 @@ def analyze_image(image, prompt):
|
|
| 197 |
if internvl2_model is not None:
|
| 198 |
try:
|
| 199 |
print("Running inference with InternVL2...")
|
|
|
|
|
|
|
|
|
|
| 200 |
response = internvl2_model((prompt, pil_image))
|
| 201 |
-
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
except Exception as e:
|
| 204 |
print(f"Error with InternVL2: {str(e)}")
|
| 205 |
# If InternVL2 fails, fall back to BLIP if available
|
|
@@ -212,6 +264,11 @@ def analyze_image(image, prompt):
|
|
| 212 |
inputs = blip_processor(pil_image, return_tensors="pt").to("cuda")
|
| 213 |
out = blip_model.generate(**inputs, max_new_tokens=100)
|
| 214 |
result = blip_processor.decode(out[0], skip_special_tokens=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
return f"[BLIP] {result} (Note: Custom prompts not supported with BLIP fallback model)"
|
| 216 |
except Exception as e:
|
| 217 |
print(f"Error with BLIP: {str(e)}")
|
|
|
|
| 197 |
if internvl2_model is not None:
|
| 198 |
try:
|
| 199 |
print("Running inference with InternVL2...")
|
| 200 |
+
print(f"Using prompt: '{prompt}'")
|
| 201 |
+
|
| 202 |
+
# Run the model and capture the raw response
|
| 203 |
response = internvl2_model((prompt, pil_image))
|
| 204 |
+
|
| 205 |
+
# Print debug info about the response
|
| 206 |
+
print(f"Response type: {type(response)}")
|
| 207 |
+
print(f"Response attributes: {dir(response) if hasattr(response, '__dir__') else 'No dir available'}")
|
| 208 |
+
|
| 209 |
+
# Try different ways to extract the text
|
| 210 |
+
if hasattr(response, "text"):
|
| 211 |
+
result = response.text
|
| 212 |
+
print(f"Found 'text' attribute: '{result}'")
|
| 213 |
+
elif hasattr(response, "response"):
|
| 214 |
+
result = response.response
|
| 215 |
+
print(f"Found 'response' attribute: '{result}'")
|
| 216 |
+
elif hasattr(response, "generated_text"):
|
| 217 |
+
result = response.generated_text
|
| 218 |
+
print(f"Found 'generated_text' attribute: '{result}'")
|
| 219 |
+
else:
|
| 220 |
+
# If no attribute worked, convert the whole response to string
|
| 221 |
+
result = str(response)
|
| 222 |
+
print(f"Using string conversion: '{result}'")
|
| 223 |
+
|
| 224 |
+
# Check if we got an empty result
|
| 225 |
+
if not result or result.strip() == "":
|
| 226 |
+
print("WARNING: Received empty response from InternVL2")
|
| 227 |
+
# Try an alternative prompt to see if that works
|
| 228 |
+
print("Trying alternative prompt...")
|
| 229 |
+
alt_prompt = "This is an image. Describe what you see in detail."
|
| 230 |
+
response2 = internvl2_model((alt_prompt, pil_image))
|
| 231 |
+
|
| 232 |
+
if hasattr(response2, "text"):
|
| 233 |
+
result = response2.text
|
| 234 |
+
elif hasattr(response2, "response"):
|
| 235 |
+
result = response2.response
|
| 236 |
+
elif hasattr(response2, "generated_text"):
|
| 237 |
+
result = response2.generated_text
|
| 238 |
+
else:
|
| 239 |
+
result = str(response2)
|
| 240 |
+
|
| 241 |
+
if not result or result.strip() == "":
|
| 242 |
+
print("Alternative prompt also gave empty result")
|
| 243 |
+
# Fall through to BLIP fallback
|
| 244 |
+
raise ValueError("Empty response from InternVL2")
|
| 245 |
+
else:
|
| 246 |
+
print(f"Alternative prompt worked: '{result}'")
|
| 247 |
+
|
| 248 |
+
# If we got a valid result, return it
|
| 249 |
+
if result and result.strip() != "":
|
| 250 |
+
return f"[InternVL2] {result}"
|
| 251 |
+
else:
|
| 252 |
+
# Try BLIP instead
|
| 253 |
+
raise ValueError("Empty response from InternVL2")
|
| 254 |
+
|
| 255 |
except Exception as e:
|
| 256 |
print(f"Error with InternVL2: {str(e)}")
|
| 257 |
# If InternVL2 fails, fall back to BLIP if available
|
|
|
|
| 264 |
inputs = blip_processor(pil_image, return_tensors="pt").to("cuda")
|
| 265 |
out = blip_model.generate(**inputs, max_new_tokens=100)
|
| 266 |
result = blip_processor.decode(out[0], skip_special_tokens=True)
|
| 267 |
+
|
| 268 |
+
# Check if BLIP result is empty
|
| 269 |
+
if not result or result.strip() == "":
|
| 270 |
+
return "BLIP model returned an empty response. The model may be having issues processing this image."
|
| 271 |
+
|
| 272 |
return f"[BLIP] {result} (Note: Custom prompts not supported with BLIP fallback model)"
|
| 273 |
except Exception as e:
|
| 274 |
print(f"Error with BLIP: {str(e)}")
|