Tas01 committed (verified)
Commit 2b56642 · Parent(s): 4577c44

Update app.py

Files changed (1):
  1. app.py +230 -64
app.py CHANGED
@@ -6,30 +6,32 @@ import torch
 from transformers import pipeline
 import requests
 from io import BytesIO
+import os
+from ultralytics import YOLO
 
 class ImageStoryteller:
     def __init__(self):
-        print("Initializing Image Storyteller...")
+        print("Initializing Image Storyteller with YOLOv8...")
 
-        # Load YOLO model for object detection
+        # Load YOLOv8 model for object detection
         try:
-            self.yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
-            print("YOLOv5 model loaded successfully!")
+            self.yolo_model = YOLO('yolov8n.pt')  # Using nano version for faster inference
+            print("YOLOv8 model loaded successfully!")
         except Exception as e:
-            print(f"YOLO loading failed: {e}")
+            print(f"YOLOv8 loading failed: {e}")
             self.yolo_model = None
 
         # Initialize text generation pipelines
         try:
-            # For narrative generation
+            # For narrative generation - using a smaller model for Hugging Face Spaces
            self.story_pipeline = pipeline(
                 "text-generation",
-                model="microsoft/DialoGPT-medium",
-                torch_dtype=torch.float16
+                model="distilgpt2",  # Lighter model for Spaces
+                torch_dtype=torch.float32
             )
             print("Story pipeline initialized!")
-        except:
-            print("Using fallback story generation")
+        except Exception as e:
+            print(f"Story pipeline failed: {e}")
             self.story_pipeline = None
 
         # Common objects for fallback detection
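Note: the switch from torch.hub YOLOv5 to the ultralytics package means the Space's dependencies must now include ultralytics (and OpenCV for the drawing code added further down). A minimal smoke test for the new loader, assuming the package is installed; the yolov8n.pt weights are downloaded automatically on first use:

    # Minimal sketch: verify the YOLOv8 loader used in __init__ above
    from ultralytics import YOLO

    model = YOLO('yolov8n.pt')   # nano checkpoint, auto-downloaded on first use
    print(model.names[0])        # 'person' -- the COCO class map used by detect_objects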
@@ -49,35 +51,150 @@ class ImageStoryteller:
     ]
 
     def detect_objects(self, image):
-        """Detect objects in the image using YOLO or fallback method"""
+        """Detect objects in the image using YOLOv8"""
         if self.yolo_model is not None:
             try:
-                # Convert PIL to numpy for YOLO
+                # Convert PIL to numpy for YOLOv8
                 img_np = np.array(image)
 
-                # Run YOLO detection
+                # Run YOLOv8 detection
                 results = self.yolo_model(img_np)
-                detections = results.pandas().xyxy[0]
 
-                # Extract detected objects with confidence > 0.5
                 objects = []
-                for _, detection in detections.iterrows():
-                    if detection['confidence'] > 0.5:
-                        objects.append({
-                            'name': detection['name'],
-                            'confidence': detection['confidence'],
-                            'bbox': [detection['xmin'], detection['ymin'],
-                                     detection['xmax'], detection['ymax']]
-                        })
+                for result in results:
+                    boxes = result.boxes
+                    if boxes is not None:
+                        for box in boxes:
+                            confidence = box.conf.item()
+                            if confidence > 0.5:  # Confidence threshold
+                                class_id = int(box.cls.item())
+                                class_name = self.yolo_model.names[class_id]
+                                bbox = box.xyxy[0].tolist()
+
+                                objects.append({
+                                    'name': class_name,
+                                    'confidence': confidence,
+                                    'bbox': bbox
+                                })
 
                 return objects
 
             except Exception as e:
-                print(f"YOLO detection failed: {e}")
+                print(f"YOLOv8 detection failed: {e}")
 
         # Fallback: Simple color-based object detection
         return self.fallback_object_detection(image)
 
+    def draw_detections(self, image, objects):
+        """Draw bounding boxes and labels on the image"""
+        img_np = np.array(image)
+        img_with_boxes = img_np.copy()
+
+        # Colors for different object types
+        colors = {
+            'person': (0, 255, 0),    # Green
+            'vehicle': (255, 0, 0),   # Blue (cars, bikes, etc.)
+            'animal': (0, 165, 255),  # Orange
+            'default': (255, 255, 0)  # Yellow
+        }
+
+        for obj in objects:
+            bbox = obj['bbox']
+            name = obj['name']
+            confidence = obj['confidence']
+
+            # Determine color based on object type
+            if 'person' in name:
+                color = colors['person']
+            elif any(vehicle in name for vehicle in ['car', 'bicycle', 'motorcycle', 'bus', 'truck']):
+                color = colors['vehicle']
+            elif any(animal in name for animal in ['bird', 'cat', 'dog', 'horse', 'sheep', 'cow']):
+                color = colors['animal']
+            else:
+                color = colors['default']
+
+            # Convert coordinates to integers
+            x1, y1, x2, y2 = map(int, bbox)
+
+            # Draw bounding box
+            cv2.rectangle(img_with_boxes, (x1, y1), (x2, y2), color, 2)
+
+            # Draw label background
+            label = f"{name} {confidence:.2f}"
+            label_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)[0]
+            cv2.rectangle(img_with_boxes, (x1, y1 - label_size[1] - 10),
+                          (x1 + label_size[0], y1), color, -1)
+
+            # Draw label text
+            cv2.putText(img_with_boxes, label, (x1, y1 - 5),
+                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
+
+        return Image.fromarray(img_with_boxes)
+
+    def detect_facial_expressions(self, image, objects):
+        """Simple facial expression detection based on face analysis"""
+        img_np = np.array(image)
+        expressions = []
+
+        # Look for person objects
+        person_objects = [obj for obj in objects if obj['name'] == 'person']
+
+        if not person_objects:
+            return expressions
+
+        # Simple expression detection based on face position and context
+        for person in person_objects:
+            bbox = person['bbox']
+            x1, y1, x2, y2 = map(int, bbox)
+
+            # Extract face region (approximate)
+            face_height = y2 - y1
+            face_region = img_np[y1:y2, x1:x2]
+
+            if face_region.size == 0:
+                continue
+
+            # Simple expression estimation based on face position and context
+            expression = self.estimate_expression(face_region, bbox, img_np.shape)
+            expressions.append({
+                'person_bbox': bbox,
+                'expression': expression,
+                'confidence': 0.6  # Placeholder confidence
+            })
+
+        return expressions
+
+    def estimate_expression(self, face_region, bbox, image_shape):
+        """Estimate facial expression based on simple heuristics"""
+        try:
+            # Convert to grayscale for analysis
+            if len(face_region.shape) == 3:
+                gray_face = cv2.cvtColor(face_region, cv2.COLOR_RGB2GRAY)
+            else:
+                gray_face = face_region
+
+            # Simple brightness and contrast analysis
+            brightness = np.mean(gray_face)
+            contrast = np.std(gray_face)
+
+            # Face position in image
+            x1, y1, x2, y2 = bbox
+            img_height, img_width = image_shape[:2]
+            face_center_y = (y1 + y2) / 2
+
+            # Simple expression rules
+            if brightness > 150 and contrast < 50:
+                return "neutral"
+            elif face_center_y < img_height * 0.3:  # Face in upper part
+                return "surprised"
+            elif contrast > 70:
+                return "expressive"
+            else:
+                return "calm"
+
+        except:
+            return "neutral"
+
     def fallback_object_detection(self, image):
         """Simple fallback object detection based on color and composition"""
         img_np = np.array(image)
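Note: the new detection loop relies on the Ultralytics Results API (result.boxes exposing .conf, .cls and .xyxy tensors). A self-contained sketch of the same parsing, assuming a PIL image loaded from a hypothetical photo.jpg:

    # Sketch of the Results parsing used in detect_objects above
    import numpy as np
    from PIL import Image
    from ultralytics import YOLO

    model = YOLO('yolov8n.pt')
    img = Image.open('photo.jpg')  # hypothetical input file
    for result in model(np.array(img)):
        if result.boxes is None:
            continue
        for box in result.boxes:
            name = model.names[int(box.cls.item())]
            print(name, round(box.conf.item(), 2), box.xyxy[0].tolist())

Two review observations on the added methods: draw_detections calls cv2 and Image.fromarray, but this diff adds no import cv2 or from PIL import Image, so unless those already sit in the unshown lines 1-5 of app.py the method will raise NameError. Also, since img_np comes from a PIL image the array is RGB, so the color comments follow OpenCV's BGR convention but (255, 0, 0) will actually render red here, not blue.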
@@ -111,8 +228,8 @@ class ImageStoryteller:
 
         return objects
 
-    def generate_narrative(self, objects, image_size):
-        """Generate a narrative story based on detected objects"""
+    def generate_narrative(self, objects, expressions, image_size):
+        """Generate a narrative story based on detected objects and expressions"""
         if not objects:
             return "In this serene scene, the world holds its breath in quiet contemplation. " \
                    "Though specific elements remain mysterious, the composition speaks of " \
@@ -122,36 +239,51 @@ class ImageStoryteller:
         object_names = [obj['name'] for obj in objects]
         unique_objects = list(set(object_names))
 
+        # Include expressions in narrative
+        expression_text = ""
+        if expressions:
+            exp_descriptions = []
+            for exp in expressions:
+                exp_descriptions.append(f"{exp['expression']} expression")
+            expression_text = f" with {', '.join(exp_descriptions)}"
+
         # Create prompt for story generation
-        prompt = f"In an image containing {', '.join(unique_objects)}, "
+        prompt = f"In an image containing {', '.join(unique_objects)}{expression_text}, "
 
         if self.story_pipeline is not None:
             try:
                 story = self.story_pipeline(
                     prompt + "tell a beautiful narrative story about this scene:",
-                    max_length=150,
+                    max_length=200,
                     num_return_sequences=1,
                     temperature=0.8,
-                    do_sample=True
+                    do_sample=True,
+                    pad_token_id=50256
                 )[0]['generated_text']
                 return story
-            except:
-                pass
+            except Exception as e:
+                print(f"Story generation failed: {e}")
 
         # Fallback narrative generation
-        return self.fallback_narrative(unique_objects, image_size)
+        return self.fallback_narrative(unique_objects, expressions, image_size)
 
-    def fallback_narrative(self, objects, image_size):
+    def fallback_narrative(self, objects, expressions, image_size):
         """Fallback method for generating narratives"""
         width, height = image_size
 
+        # Include expression information
+        expression_context = ""
+        if expressions:
+            main_expression = expressions[0]['expression']
+            expression_context = f" with a {main_expression} demeanor"
+
         if 'person' in objects:
             if 'nature' in objects or 'sky' in objects:
-                return f"In this {width}x{height} frame, a solitary figure stands amidst nature's embrace. " \
+                return f"In this {width}x{height} frame, a solitary figure stands amidst nature's embrace{expression_context}. " \
                        "The person seems lost in thought, surrounded by the gentle whispers of the environment. " \
                        "Each element in the scene tells a story of connection between humanity and the natural world."
             else:
-                return f"Within this {width}x{height} composition, a human presence captures our attention. " \
+                return f"Within this {width}x{height} composition, a human presence captures our attention{expression_context}. " \
                        "Their story unfolds silently, inviting us to imagine their journey, their dreams, " \
                        "and the moments that led them to this precise point in time."
@@ -182,7 +314,8 @@ class ImageStoryteller:
                 max_length=200,
                 num_return_sequences=1,
                 temperature=0.9,
-                do_sample=True
+                do_sample=True,
+                pad_token_id=50256
             )[0]['generated_text']
 
             # Extract just the poetic lines
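Note: the pad_token_id=50256 added to both generation calls is GPT-2's eos_token_id (distilgpt2 shares GPT-2's vocabulary), which silences the pipeline's padding warning. A hypothetical refactor that avoids the magic number, written as it would appear inside the generation method:

    # Equivalent but self-describing (hypothetical refactor)
    story = self.story_pipeline(
        prompt + "tell a beautiful narrative story about this scene:",
        max_length=200,
        num_return_sequences=1,
        temperature=0.8,
        do_sample=True,
        pad_token_id=self.story_pipeline.tokenizer.eos_token_id,
    )[0]['generated_text']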
@@ -190,8 +323,8 @@ class ImageStoryteller:
            poetic_lines = [line for line in lines if line.strip() and len(line.strip()) > 10]
            if len(poetic_lines) >= 4:
                return '\n'.join(poetic_lines[:6])
-        except:
-            pass
+        except Exception as e:
+            print(f"Poetry generation failed: {e}")
 
         # Fallback poetry generation
         return self.fallback_poetry(narrative)
@@ -224,41 +357,70 @@ class ImageStoryteller:
             # Detect objects
             objects = self.detect_objects(image)
 
+            # Detect facial expressions if people are present
+            expressions = []
+            if any(obj['name'] == 'person' for obj in objects):
+                expressions = self.detect_facial_expressions(image, objects)
+
             # Generate narrative
-            narrative = self.generate_narrative(objects, image.size)
+            narrative = self.generate_narrative(objects, expressions, image.size)
 
             # Generate poetry
             poetry = self.generate_poetry(narrative)
 
-            return narrative, poetry
+            # Create detection visualization
+            detection_image = self.draw_detections(image, objects)
+
+            return narrative, poetry, detection_image
 
         except Exception as e:
             error_msg = f"An error occurred while processing the image: {str(e)}"
-            return error_msg, "Unable to generate poetry due to processing error."
+            return error_msg, "Unable to generate poetry due to processing error.", image
 
 # Initialize the storyteller
 storyteller = ImageStoryteller()
 
+# Check for local example images
+example_images = []
+for i in range(1, 7):
+    filename = f"example_{i:02d}.jpg"
+    if os.path.exists(filename):
+        example_images.append([filename])
+        print(f"Found example image: {filename}")
+
+if not example_images:
+    print("No local example images found, using placeholder")
+    # Create a placeholder if no local images
+    example_images = [[np.ones((300, 300, 3), dtype=np.uint8) * 100]]
+
 # Create Gradio interface
-with gr.Blocks(title="AI Image Storyteller", theme="soft") as demo:
-    gr.Markdown("# 📖 AI Image Storyteller")
-    gr.Markdown("**Upload any image and watch AI transform it into beautiful stories and poetry!**")
+with gr.Blocks(title="AI Image Storyteller Pro", theme="soft") as demo:
+    gr.Markdown("# 📖 AI Image Storyteller Pro")
+    gr.Markdown("**Upload any image and watch AI detect objects, analyze scenes, and create beautiful stories!**")
 
     with gr.Row():
         with gr.Column():
             input_image = gr.Image(
                 type="pil",
                 label="🖼️ Upload Your Image",
-                height=400
+                height=300
             )
-            process_btn = gr.Button("✨ Create Story & Poetry", variant="primary", size="lg")
+            process_btn = gr.Button("✨ Analyze Image & Create Story", variant="primary", size="lg")
 
+        with gr.Column():
+            detection_output = gr.Image(
+                label="🔍 Object Detection",
+                height=300,
+                show_download_button=True
+            )
+
+    with gr.Row():
         with gr.Column():
             with gr.Tab("📖 Narrative Story"):
                 narrative_output = gr.Textbox(
                     label="Image Narrative",
-                    lines=6,
-                    max_lines=10,
+                    lines=5,
+                    max_lines=8,
                     placeholder="Your image's story will appear here...",
                     show_copy_button=True
                 )
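Note: process_image now returns three values, so every caller needs three outputs; both the button wiring and gr.Examples are updated accordingly below. One caveat: the placeholder example is a raw numpy array while the real entries are file paths, and gr.Examples handles image examples most predictably as paths. A hypothetical safer placeholder:

    # Hypothetical alternative: persist the gray placeholder so gr.Examples
    # receives a file path, matching the example_XX.jpg entries
    from PIL import Image
    import numpy as np

    Image.fromarray(np.full((300, 300, 3), 100, dtype=np.uint8)).save("example_placeholder.jpg")
    example_images = [["example_placeholder.jpg"]]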
@@ -266,24 +428,20 @@ with gr.Blocks(title="AI Image Storyteller", theme="soft") as demo:
             with gr.Tab("🎭 Poetic Verses"):
                 poetry_output = gr.Textbox(
                     label="6-Line Poetry",
-                    lines=7,
-                    max_lines=8,
+                    lines=6,
+                    max_lines=7,
                     placeholder="Poetic interpretation will appear here...",
                     show_copy_button=True
                 )
 
-    # Examples section
+    # Examples section with local images
     gr.Markdown("### 🎯 Try These Examples")
     gr.Examples(
-        examples=[
-            ["https://images.unsplash.com/photo-1506905925346-21bda4d32df4?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1000&q=80"],
-            ["https://images.unsplash.com/photo-1518837695005-2083093ee35b?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1000&q=80"],
-            ["https://images.unsplash.com/photo-1469474968028-56623f02e42e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1000&q=80"]
-        ],
+        examples=example_images,
         inputs=input_image,
-        outputs=[narrative_output, poetry_output],
+        outputs=[narrative_output, poetry_output, detection_output],
         fn=storyteller.process_image,
-        cache_examples=False
+        cache_examples=True
    )
 
    # How it works section
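Note: flipping cache_examples to True makes Gradio call storyteller.process_image on every example at startup to precompute the three outputs, so Space boot time grows with the example count (and the placeholder above must survive a full pipeline run). If the fixed example_01..example_06 naming ever changes, a glob-based discovery would be a drop-in replacement (hypothetical):

    # Hypothetical drop-in for the range-based discovery above
    import glob
    example_images = [[path] for path in sorted(glob.glob("example_*.jpg"))]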
@@ -291,16 +449,24 @@ with gr.Blocks(title="AI Image Storyteller", theme="soft") as demo:
         gr.Markdown("""
         **The Magic Behind the Stories:**
 
-        1. **Object Detection**: YOLOv5 AI model identifies objects in your image
-        2. **Scene Analysis**: The system analyzes the composition and relationships between objects
-        3. **Narrative Generation**: AI creates a compelling story based on the detected elements
-        4. **Poetry Creation**: Transformers model converts the narrative into beautiful 6-line verses
+        1. **Object Detection**: YOLOv8 AI model identifies objects in your image with bounding boxes
+        2. **Facial Analysis**: Simple expression detection for human faces
+        3. **Scene Analysis**: The system analyzes the composition and relationships between objects
+        4. **Narrative Generation**: AI creates a compelling story based on the detected elements
+        5. **Poetry Creation**: Transformers model converts the narrative into beautiful 6-line verses
+
+        **Features:**
+        - Real-time object detection with YOLOv8
+        - Visual bounding box display
+        - Facial expression estimation
+        - Context-aware storytelling
+        - Beautiful poetic interpretations
 
         **Perfect for:**
         - Personal photos
         - Landscape images
         - Urban scenes
-        - Abstract compositions
+        - Group photos
         - Travel memories
         """)
 
@@ -308,7 +474,7 @@ with gr.Blocks(title="AI Image Storyteller", theme="soft") as demo:
    process_btn.click(
        fn=storyteller.process_image,
        inputs=input_image,
-        outputs=[narrative_output, poetry_output]
+        outputs=[narrative_output, poetry_output, detection_output]
    )
 
    # Launch the application
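Note: the commit touches only app.py; requirements.txt is not shown. Based on the imports now in use (gradio, torch, transformers, requests, numpy, PIL, ultralytics, plus cv2 in draw_detections), a plausible dependency set for the Space would be (assumed, unpinned):

    gradio
    torch
    transformers
    ultralytics
    opencv-python-headless
    numpy
    Pillow
    requests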
 