Tas01 committed (verified)
Commit 2b56642 · Parent(s): 4577c44

Update app.py

Files changed (1):
  1. app.py +230 -64
app.py CHANGED
@@ -6,30 +6,32 @@ import torch
 from transformers import pipeline
 import requests
 from io import BytesIO
+import os
+from ultralytics import YOLO
 
 class ImageStoryteller:
     def __init__(self):
-        print("Initializing Image Storyteller...")
+        print("Initializing Image Storyteller with YOLOv8...")
 
-        # Load YOLO model for object detection
+        # Load YOLOv8 model for object detection
         try:
-            self.yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
-            print("YOLOv5 model loaded successfully!")
+            self.yolo_model = YOLO('yolov8n.pt')  # Using nano version for faster inference
+            print("YOLOv8 model loaded successfully!")
         except Exception as e:
-            print(f"YOLO loading failed: {e}")
+            print(f"YOLOv8 loading failed: {e}")
             self.yolo_model = None
 
         # Initialize text generation pipelines
         try:
-            # For narrative generation
+            # For narrative generation - using a smaller model for Hugging Face Spaces
            self.story_pipeline = pipeline(
                 "text-generation",
-                model="microsoft/DialoGPT-medium",
-                torch_dtype=torch.float16
+                model="distilgpt2",  # Lighter model for Spaces
+                torch_dtype=torch.float32
             )
             print("Story pipeline initialized!")
-        except:
-            print("Using fallback story generation")
+        except Exception as e:
+            print(f"Story pipeline failed: {e}")
             self.story_pipeline = None
 
         # Common objects for fallback detection
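Note: the switch from torch.hub YOLOv5 to the ultralytics package means the Space's dependencies must now include ultralytics (and OpenCV for the drawing code added further down). A minimal smoke test for the new loader, assuming the package is installed; the yolov8n.pt weights are downloaded automatically on first use:

    # Minimal sketch: verify the YOLOv8 loader used in __init__ above
    from ultralytics import YOLO

    model = YOLO('yolov8n.pt')   # nano checkpoint, auto-downloaded on first use
    print(model.names[0])        # 'person' -- the COCO class map used by detect_objects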
@@ -49,35 +51,150 @@ class ImageStoryteller:
     ]
 
     def detect_objects(self, image):
-        """Detect objects in the image using YOLO or fallback method"""
+        """Detect objects in the image using YOLOv8"""
         if self.yolo_model is not None:
             try:
-                # Convert PIL to numpy for YOLO
+                # Convert PIL to numpy for YOLOv8
                 img_np = np.array(image)
 
-                # Run YOLO detection
+                # Run YOLOv8 detection
                 results = self.yolo_model(img_np)
-                detections = results.pandas().xyxy[0]
 
-                # Extract detected objects with confidence > 0.5
                 objects = []
-                for _, detection in detections.iterrows():
-                    if detection['confidence'] > 0.5:
-                        objects.append({
-                            'name': detection['name'],
-                            'confidence': detection['confidence'],
-                            'bbox': [detection['xmin'], detection['ymin'],
-                                     detection['xmax'], detection['ymax']]
-                        })
+                for result in results:
+                    boxes = result.boxes
+                    if boxes is not None:
+                        for box in boxes:
+                            confidence = box.conf.item()
+                            if confidence > 0.5:  # Confidence threshold
+                                class_id = int(box.cls.item())
+                                class_name = self.yolo_model.names[class_id]
+                                bbox = box.xyxy[0].tolist()
+
+                                objects.append({
+                                    'name': class_name,
+                                    'confidence': confidence,
+                                    'bbox': bbox
+                                })
 
                 return objects
 
             except Exception as e:
-                print(f"YOLO detection failed: {e}")
+                print(f"YOLOv8 detection failed: {e}")
 
         # Fallback: Simple color-based object detection
         return self.fallback_object_detection(image)
 
+    def draw_detections(self, image, objects):
+        """Draw bounding boxes and labels on the image"""
+        img_np = np.array(image)
+        img_with_boxes = img_np.copy()
+
+        # Colors for different object types
+        colors = {
+            'person': (0, 255, 0),    # Green
+            'vehicle': (255, 0, 0),   # Blue (cars, bikes, etc.)
+            'animal': (0, 165, 255),  # Orange
+            'default': (255, 255, 0)  # Yellow
+        }
+
+        for obj in objects:
+            bbox = obj['bbox']
+            name = obj['name']
+            confidence = obj['confidence']
+
+            # Determine color based on object type
+            if 'person' in name:
+                color = colors['person']
+            elif any(vehicle in name for vehicle in ['car', 'bicycle', 'motorcycle', 'bus', 'truck']):
+                color = colors['vehicle']
+            elif any(animal in name for animal in ['bird', 'cat', 'dog', 'horse', 'sheep', 'cow']):
+                color = colors['animal']
+            else:
+                color = colors['default']
+
+            # Convert coordinates to integers
+            x1, y1, x2, y2 = map(int, bbox)
+
+            # Draw bounding box
+            cv2.rectangle(img_with_boxes, (x1, y1), (x2, y2), color, 2)
+
+            # Draw label background
+            label = f"{name} {confidence:.2f}"
+            label_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)[0]
+            cv2.rectangle(img_with_boxes, (x1, y1 - label_size[1] - 10),
+                          (x1 + label_size[0], y1), color, -1)
+
+            # Draw label text
+            cv2.putText(img_with_boxes, label, (x1, y1 - 5),
+                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
+
+        return Image.fromarray(img_with_boxes)
+
+    def detect_facial_expressions(self, image, objects):
+        """Simple facial expression detection based on face analysis"""
+        img_np = np.array(image)
+        expressions = []
+
+        # Look for person objects
+        person_objects = [obj for obj in objects if obj['name'] == 'person']
+
+        if not person_objects:
+            return expressions
+
+        # Simple expression detection based on face position and context
+        for person in person_objects:
+            bbox = person['bbox']
+            x1, y1, x2, y2 = map(int, bbox)
+
+            # Extract face region (approximate)
+            face_height = y2 - y1
+            face_region = img_np[y1:y2, x1:x2]
+
+            if face_region.size == 0:
+                continue
+
+            # Simple expression estimation based on face position and context
+            expression = self.estimate_expression(face_region, bbox, img_np.shape)
+            expressions.append({
+                'person_bbox': bbox,
+                'expression': expression,
+                'confidence': 0.6  # Placeholder confidence
+            })
+
+        return expressions
+
+    def estimate_expression(self, face_region, bbox, image_shape):
+        """Estimate facial expression based on simple heuristics"""
+        try:
+            # Convert to grayscale for analysis
+            if len(face_region.shape) == 3:
+                gray_face = cv2.cvtColor(face_region, cv2.COLOR_RGB2GRAY)
+            else:
+                gray_face = face_region
+
+            # Simple brightness and contrast analysis
+            brightness = np.mean(gray_face)
+            contrast = np.std(gray_face)
+
+            # Face position in image
+            x1, y1, x2, y2 = bbox
+            img_height, img_width = image_shape[:2]
+            face_center_y = (y1 + y2) / 2
+
+            # Simple expression rules
+            if brightness > 150 and contrast < 50:
+                return "neutral"
+            elif face_center_y < img_height * 0.3:  # Face in upper part
+                return "surprised"
+            elif contrast > 70:
+                return "expressive"
+            else:
+                return "calm"
+
+        except:
+            return "neutral"
+
     def fallback_object_detection(self, image):
         """Simple fallback object detection based on color and composition"""
         img_np = np.array(image)
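Note: the new detection loop relies on the Ultralytics Results API (result.boxes exposing .conf, .cls and .xyxy tensors). A self-contained sketch of the same parsing, assuming a PIL image loaded from a hypothetical photo.jpg:

    # Sketch of the Results parsing used in detect_objects above
    import numpy as np
    from PIL import Image
    from ultralytics import YOLO

    model = YOLO('yolov8n.pt')
    img = Image.open('photo.jpg')  # hypothetical input file
    for result in model(np.array(img)):
        if result.boxes is None:
            continue
        for box in result.boxes:
            name = model.names[int(box.cls.item())]
            print(name, round(box.conf.item(), 2), box.xyxy[0].tolist())

Two review observations on the added methods: draw_detections calls cv2 and Image.fromarray, but this diff adds no import cv2 or from PIL import Image, so unless those already sit in the unshown lines 1-5 of app.py the method will raise NameError. Also, since img_np comes from a PIL image the array is RGB, so the color comments follow OpenCV's BGR convention but (255, 0, 0) will actually render red here, not blue.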
@@ -111,8 +228,8 @@ class ImageStoryteller:
 
         return objects
 
-    def generate_narrative(self, objects, image_size):
-        """Generate a narrative story based on detected objects"""
+    def generate_narrative(self, objects, expressions, image_size):
+        """Generate a narrative story based on detected objects and expressions"""
         if not objects:
             return "In this serene scene, the world holds its breath in quiet contemplation. " \
                    "Though specific elements remain mysterious, the composition speaks of " \
@@ -122,36 +239,51 @@ class ImageStoryteller:
         object_names = [obj['name'] for obj in objects]
         unique_objects = list(set(object_names))
 
+        # Include expressions in narrative
+        expression_text = ""
+        if expressions:
+            exp_descriptions = []
+            for exp in expressions:
+                exp_descriptions.append(f"{exp['expression']} expression")
+            expression_text = f" with {', '.join(exp_descriptions)}"
+
         # Create prompt for story generation
-        prompt = f"In an image containing {', '.join(unique_objects)}, "
+        prompt = f"In an image containing {', '.join(unique_objects)}{expression_text}, "
 
         if self.story_pipeline is not None:
             try:
                 story = self.story_pipeline(
                     prompt + "tell a beautiful narrative story about this scene:",
-                    max_length=150,
+                    max_length=200,
                     num_return_sequences=1,
                     temperature=0.8,
-                    do_sample=True
+                    do_sample=True,
+                    pad_token_id=50256
                 )[0]['generated_text']
                 return story
-            except:
-                pass
+            except Exception as e:
+                print(f"Story generation failed: {e}")
 
         # Fallback narrative generation
-        return self.fallback_narrative(unique_objects, image_size)
+        return self.fallback_narrative(unique_objects, expressions, image_size)
 
-    def fallback_narrative(self, objects, image_size):
+    def fallback_narrative(self, objects, expressions, image_size):
         """Fallback method for generating narratives"""
         width, height = image_size
 
+        # Include expression information
+        expression_context = ""
+        if expressions:
+            main_expression = expressions[0]['expression']
+            expression_context = f" with a {main_expression} demeanor"
+
         if 'person' in objects:
             if 'nature' in objects or 'sky' in objects:
-                return f"In this {width}x{height} frame, a solitary figure stands amidst nature's embrace. " \
+                return f"In this {width}x{height} frame, a solitary figure stands amidst nature's embrace{expression_context}. " \
                        "The person seems lost in thought, surrounded by the gentle whispers of the environment. " \
                        "Each element in the scene tells a story of connection between humanity and the natural world."
             else:
-                return f"Within this {width}x{height} composition, a human presence captures our attention. " \
+                return f"Within this {width}x{height} composition, a human presence captures our attention{expression_context}. " \
                        "Their story unfolds silently, inviting us to imagine their journey, their dreams, " \
                        "and the moments that led them to this precise point in time."
@@ -182,7 +314,8 @@ class ImageStoryteller:
                 max_length=200,
                 num_return_sequences=1,
                 temperature=0.9,
-                do_sample=True
+                do_sample=True,
+                pad_token_id=50256
             )[0]['generated_text']
 
             # Extract just the poetic lines
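Note: the pad_token_id=50256 added to both generation calls is GPT-2's eos_token_id (distilgpt2 shares GPT-2's vocabulary), which silences the pipeline's padding warning. A hypothetical refactor that avoids the magic number, written as it would appear inside the generation method:

    # Equivalent but self-describing (hypothetical refactor)
    story = self.story_pipeline(
        prompt + "tell a beautiful narrative story about this scene:",
        max_length=200,
        num_return_sequences=1,
        temperature=0.8,
        do_sample=True,
        pad_token_id=self.story_pipeline.tokenizer.eos_token_id,
    )[0]['generated_text']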
@@ -190,8 +323,8 @@ class ImageStoryteller:
            poetic_lines = [line for line in lines if line.strip() and len(line.strip()) > 10]
            if len(poetic_lines) >= 4:
                return '\n'.join(poetic_lines[:6])
-        except:
-            pass
+        except Exception as e:
+            print(f"Poetry generation failed: {e}")
 
         # Fallback poetry generation
         return self.fallback_poetry(narrative)
@@ -224,41 +357,70 @@ class ImageStoryteller:
             # Detect objects
             objects = self.detect_objects(image)
 
+            # Detect facial expressions if people are present
+            expressions = []
+            if any(obj['name'] == 'person' for obj in objects):
+                expressions = self.detect_facial_expressions(image, objects)
+
             # Generate narrative
-            narrative = self.generate_narrative(objects, image.size)
+            narrative = self.generate_narrative(objects, expressions, image.size)
 
             # Generate poetry
             poetry = self.generate_poetry(narrative)
 
-            return narrative, poetry
+            # Create detection visualization
+            detection_image = self.draw_detections(image, objects)
+
+            return narrative, poetry, detection_image
 
         except Exception as e:
             error_msg = f"An error occurred while processing the image: {str(e)}"
-            return error_msg, "Unable to generate poetry due to processing error."
+            return error_msg, "Unable to generate poetry due to processing error.", image
 
 # Initialize the storyteller
 storyteller = ImageStoryteller()
 
+# Check for local example images
+example_images = []
+for i in range(1, 7):
+    filename = f"example_{i:02d}.jpg"
+    if os.path.exists(filename):
+        example_images.append([filename])
+        print(f"Found example image: {filename}")
+
+if not example_images:
+    print("No local example images found, using placeholder")
+    # Create a placeholder if no local images
+    example_images = [[np.ones((300, 300, 3), dtype=np.uint8) * 100]]
+
 # Create Gradio interface
-with gr.Blocks(title="AI Image Storyteller", theme="soft") as demo:
-    gr.Markdown("# 📖 AI Image Storyteller")
-    gr.Markdown("**Upload any image and watch AI transform it into beautiful stories and poetry!**")
+with gr.Blocks(title="AI Image Storyteller Pro", theme="soft") as demo:
+    gr.Markdown("# 📖 AI Image Storyteller Pro")
+    gr.Markdown("**Upload any image and watch AI detect objects, analyze scenes, and create beautiful stories!**")
 
     with gr.Row():
         with gr.Column():
             input_image = gr.Image(
                 type="pil",
                 label="🖼️ Upload Your Image",
-                height=400
+                height=300
             )
-            process_btn = gr.Button("✨ Create Story & Poetry", variant="primary", size="lg")
+            process_btn = gr.Button("✨ Analyze Image & Create Story", variant="primary", size="lg")
 
+        with gr.Column():
+            detection_output = gr.Image(
+                label="🔍 Object Detection",
+                height=300,
+                show_download_button=True
+            )
+
+    with gr.Row():
         with gr.Column():
             with gr.Tab("📖 Narrative Story"):
                 narrative_output = gr.Textbox(
                     label="Image Narrative",
-                    lines=6,
-                    max_lines=10,
+                    lines=5,
+                    max_lines=8,
                     placeholder="Your image's story will appear here...",
                     show_copy_button=True
                 )
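Note: process_image now returns three values, so every caller needs three outputs; both the button wiring and gr.Examples are updated accordingly below. One caveat: the placeholder example is a raw numpy array while the real entries are file paths, and gr.Examples handles image examples most predictably as paths. A hypothetical safer placeholder:

    # Hypothetical alternative: persist the gray placeholder so gr.Examples
    # receives a file path, matching the example_XX.jpg entries
    from PIL import Image
    import numpy as np

    Image.fromarray(np.full((300, 300, 3), 100, dtype=np.uint8)).save("example_placeholder.jpg")
    example_images = [["example_placeholder.jpg"]]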
@@ -266,24 +428,20 @@ with gr.Blocks(title="AI Image Storyteller", theme="soft") as demo:
             with gr.Tab("🎭 Poetic Verses"):
                 poetry_output = gr.Textbox(
                     label="6-Line Poetry",
-                    lines=7,
-                    max_lines=8,
+                    lines=6,
+                    max_lines=7,
                     placeholder="Poetic interpretation will appear here...",
                     show_copy_button=True
                 )
 
-    # Examples section
+    # Examples section with local images
     gr.Markdown("### 🎯 Try These Examples")
     gr.Examples(
-        examples=[
-            ["https://images.unsplash.com/photo-1506905925346-21bda4d32df4?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1000&q=80"],
-            ["https://images.unsplash.com/photo-1518837695005-2083093ee35b?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1000&q=80"],
-            ["https://images.unsplash.com/photo-1469474968028-56623f02e42e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1000&q=80"]
-        ],
+        examples=example_images,
         inputs=input_image,
-        outputs=[narrative_output, poetry_output],
+        outputs=[narrative_output, poetry_output, detection_output],
         fn=storyteller.process_image,
-        cache_examples=False
+        cache_examples=True
    )
 
    # How it works section
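Note: flipping cache_examples to True makes Gradio call storyteller.process_image on every example at startup to precompute the three outputs, so Space boot time grows with the example count (and the placeholder above must survive a full pipeline run). If the fixed example_01..example_06 naming ever changes, a glob-based discovery would be a drop-in replacement (hypothetical):

    # Hypothetical drop-in for the range-based discovery above
    import glob
    example_images = [[path] for path in sorted(glob.glob("example_*.jpg"))]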
@@ -291,16 +449,24 @@ with gr.Blocks(title="AI Image Storyteller", theme="soft") as demo:
         gr.Markdown("""
         **The Magic Behind the Stories:**
 
-        1. **Object Detection**: YOLOv5 AI model identifies objects in your image
-        2. **Scene Analysis**: The system analyzes the composition and relationships between objects
-        3. **Narrative Generation**: AI creates a compelling story based on the detected elements
-        4. **Poetry Creation**: Transformers model converts the narrative into beautiful 6-line verses
+        1. **Object Detection**: YOLOv8 AI model identifies objects in your image with bounding boxes
+        2. **Facial Analysis**: Simple expression detection for human faces
+        3. **Scene Analysis**: The system analyzes the composition and relationships between objects
+        4. **Narrative Generation**: AI creates a compelling story based on the detected elements
+        5. **Poetry Creation**: Transformers model converts the narrative into beautiful 6-line verses
+
+        **Features:**
+        - Real-time object detection with YOLOv8
+        - Visual bounding box display
+        - Facial expression estimation
+        - Context-aware storytelling
+        - Beautiful poetic interpretations
 
         **Perfect for:**
         - Personal photos
         - Landscape images
         - Urban scenes
-        - Abstract compositions
+        - Group photos
         - Travel memories
         """)
 
@@ -308,7 +474,7 @@ with gr.Blocks(title="AI Image Storyteller", theme="soft") as demo:
    process_btn.click(
        fn=storyteller.process_image,
        inputs=input_image,
-        outputs=[narrative_output, poetry_output]
+        outputs=[narrative_output, poetry_output, detection_output]
    )
 
    # Launch the application
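Note: the commit touches only app.py; requirements.txt is not shown. Based on the imports now in use (gradio, torch, transformers, requests, numpy, PIL, ultralytics, plus cv2 in draw_detections), a plausible dependency set for the Space would be (assumed, unpinned):

    gradio
    torch
    transformers
    ultralytics
    opencv-python-headless
    numpy
    Pillow
    requests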
 