# import gradio as gr # import cv2 # import numpy as np # from PIL import Image # import torch # from transformers import CLIPProcessor, CLIPModel # import rembg # from io import BytesIO # import os # import torch # from transformers import CLIPModel, CLIPProcessor, AutoTokenizer, AutoModelForCausalLM # from huggingface_hub import login # from PIL import Image # import gradio as gr # class ImageStoryteller: # # def __init__(self, llm_model_id="microsoft/phi-2"): microsoft/phi-3-mini-4k-instruct # # def __init__(self, llm_model_id="Qwen/Qwen1.5-1.8B-Chat"): # def __init__(self, llm_model_id="Qwen/Qwen2.5-3B-Instruct"): # print("Initializing Image Storyteller with CLIP-ViT and LLM...") # # Load CLIP model for image understanding # try: # # self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") # # self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") # # For the LAION large model (ViT-H/14) # self.clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K") # self.clip_processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K") # print("CLIP-ViT model loaded successfully!") # except Exception as e: # print(f"CLIP loading failed: {e}") # self.clip_model = None # self.clip_processor = None # # Load LLM for story generation # try: # # Choose your LLM (phi-2 doesn't require login) # self.llm_model_id = llm_model_id # self.tokenizer = AutoTokenizer.from_pretrained(llm_model_id) # self.llm_model = AutoModelForCausalLM.from_pretrained( # llm_model_id, # torch_dtype=torch.float16, # device_map="auto", # trust_remote_code=True) # # trust_remote_code=True if "phi" in llm_model_id else False # # To this (for Qwen and other models): # # trust_remote_code=True if any(keyword in llm_model_id.lower() for keyword in ["phi", "qwen", "yi", "deepseek"]) else False # print(f"LLM model {llm_model_id} loaded successfully!") # except Exception as e: # print(f"LLM loading failed: {e}") # self.llm_model = None # self.tokenizer = None # # Common objects for scene understanding # # self.common_objects = [ # # 'person', 'people', 'human', 'man', 'woman', 'child', 'baby', # # 'dog', 'cat', 'animal', 'bird', 'horse', 'cow', 'sheep', # # 'car', 'vehicle', 'bus', 'truck', 'bicycle', 'motorcycle', # # 'building', 'house', 'skyscraper', 'architecture', # # 'tree', 'forest', 'nature', 'mountain', 'sky', 'clouds', # # 'water', 'ocean', 'river', 'lake', 'beach', # # 'food', 'fruit', 'vegetable', 'meal', # # 'indoor', 'outdoor', 'urban', 'rural' # # ] # self.common_objects = [ # # PEOPLE & GROUPS (50+) # 'person', 'people', 'human', 'man', 'woman', 'child', 'baby', 'toddler', 'teenager', # 'elderly', 'senior', 'adult', 'boy', 'girl', 'infant', 'family', 'crowd', 'audience', # 'couple', 'friends', 'group', 'team', 'crowd', 'spectator', 'performer', 'dancer', # 'singer', 'musician', 'artist', 'painter', 'photographer', 'worker', 'employee', # 'chef', 'cook', 'waiter', 'doctor', 'nurse', 'teacher', 'student', 'police', # 'firefighter', 'soldier', 'pilot', 'driver', 'athlete', 'player', 'fan', 'spectator', # # ANIMALS & WILDLIFE (100+) # 'dog', 'puppy', 'cat', 'kitten', 'pet', 'animal', 'bird', 'sparrow', 'eagle', 'hawk', # 'owl', 'pigeon', 'seagull', 'duck', 'goose', 'swan', 'parrot', 'crow', 'raven', # 'horse', 'pony', 'foal', 'cow', 'bull', 'calf', 'sheep', 'lamb', 'goat', 'pig', # 'hog', 'donkey', 'mule', 'deer', 'stag', 'doe', 'fawn', 'moose', 'elk', 'bear', # 'wolf', 'fox', 'coyote', 'lion', 'tiger', 'leopard', 'cheetah', 'jaguar', 'elephant', # 'giraffe', 'zebra', 'rhino', 'hippo', 'gorilla', 'chimpanzee', 'monkey', 'kangaroo', # 'koala', 'panda', 'penguin', 'seal', 'walrus', 'whale', 'dolphin', 'shark', 'fish', # 'salmon', 'trout', 'tuna', 'goldfish', 'butterfly', 'moth', 'bee', 'wasp', 'ant', # 'spider', 'scorpion', 'snake', 'lizard', 'turtle', 'tortoise', 'frog', 'toad', # 'crocodile', 'alligator', 'dinosaur', 'dragon', 'unicorn', 'pegasus', # # VEHICLES & TRANSPORT (50+) # 'car', 'automobile', 'vehicle', 'sedan', 'coupe', 'SUV', 'truck', 'pickup', 'van', # 'minivan', 'bus', 'coach', 'motorcycle', 'bike', 'bicycle', 'scooter', 'moped', # 'train', 'locomotive', 'railway', 'airplane', 'aircraft', 'jet', 'helicopter', # 'drone', 'boat', 'ship', 'yacht', 'sailboat', 'canoe', 'kayak', 'submarine', # 'rocket', 'spaceship', 'UFO', 'tractor', 'excavator', 'bulldozer', 'crane', # 'ambulance', 'firetruck', 'police car', 'taxi', 'limousine', 'race car', # # BUILDINGS & STRUCTURES (60+) # 'building', 'house', 'home', 'cottage', 'mansion', 'apartment', 'condo', 'skyscraper', # 'tower', 'architecture', 'office', 'factory', 'warehouse', 'barn', 'shed', 'garage', # 'castle', 'palace', 'fort', 'temple', 'church', 'cathedral', 'mosque', 'synagogue', # 'school', 'university', 'hospital', 'clinic', 'mall', 'store', 'shop', 'market', # 'restaurant', 'cafe', 'hotel', 'motel', 'inn', 'museum', 'library', 'theater', # 'cinema', 'stadium', 'arena', 'bridge', 'tunnel', 'monument', 'statue', 'fountain', # 'lighthouse', 'windmill', 'silo', 'chimney', 'antenna', 'satellite dish', # # NATURE & LANDSCAPES (80+) # 'tree', 'forest', 'woods', 'jungle', 'nature', 'mountain', 'hill', 'valley', 'cliff', # 'volcano', 'cave', 'sky', 'cloud', 'sun', 'moon', 'star', 'planet', 'galaxy', # 'water', 'ocean', 'sea', 'river', 'stream', 'creek', 'lake', 'pond', 'waterfall', # 'beach', 'shore', 'coast', 'island', 'peninsula', 'desert', 'dune', 'oasis', # 'field', 'meadow', 'prairie', 'grassland', 'swamp', 'marsh', 'wetland', 'glacier', # 'iceberg', 'snow', 'rain', 'storm', 'lightning', 'thunder', 'rainbow', 'aurora', # 'flower', 'rose', 'tulip', 'daisy', 'sunflower', 'lily', 'tree', 'oak', 'pine', # 'palm', 'maple', 'birch', 'willow', 'bamboo', 'cactus', 'mushroom', 'fern', # 'rock', 'stone', 'boulder', 'pebble', 'sand', 'dirt', 'soil', 'mud', # # FOOD & DRINKS (70+) # 'food', 'fruit', 'apple', 'banana', 'orange', 'grape', 'strawberry', 'blueberry', # 'watermelon', 'melon', 'pineapple', 'mango', 'peach', 'pear', 'cherry', 'lemon', # 'vegetable', 'carrot', 'potato', 'tomato', 'onion', 'garlic', 'lettuce', 'cabbage', # 'broccoli', 'cauliflower', 'corn', 'pea', 'bean', 'cucumber', 'pepper', 'spinach', # 'meal', 'bread', 'sandwich', 'burger', 'pizza', 'pasta', 'rice', 'noodles', # 'soup', 'salad', 'steak', 'chicken', 'fish', 'seafood', 'egg', 'cheese', 'milk', # 'butter', 'yogurt', 'ice cream', 'cake', 'cookie', 'chocolate', 'candy', 'dessert', # 'coffee', 'tea', 'juice', 'wine', 'beer', 'cocktail', 'water', 'soda', # # HOUSEHOLD ITEMS (60+) # 'chair', 'table', 'desk', 'bed', 'sofa', 'couch', 'furniture', 'lamp', 'light', # 'clock', 'watch', 'phone', 'smartphone', 'computer', 'laptop', 'tablet', 'TV', # 'television', 'screen', 'monitor', 'camera', 'book', 'newspaper', 'magazine', # 'pen', 'pencil', 'paper', 'notebook', 'keyboard', 'mouse', 'remote', 'charger', # 'window', 'door', 'key', 'lock', 'mirror', 'painting', 'picture', 'photo', # 'vase', 'pot', 'pan', 'plate', 'bowl', 'cup', 'glass', 'bottle', 'knife', # 'fork', 'spoon', 'refrigerator', 'oven', 'microwave', 'sink', 'toilet', 'shower', # # CLOTHING & ACCESSORIES (40+) # 'clothing', 'shirt', 't-shirt', 'pants', 'jeans', 'shorts', 'dress', 'skirt', # 'jacket', 'coat', 'sweater', 'hoodie', 'hat', 'cap', 'helmet', 'glasses', # 'sunglasses', 'shoes', 'sneakers', 'boots', 'sandals', 'socks', 'gloves', # 'scarf', 'tie', 'belt', 'bag', 'backpack', 'purse', 'wallet', 'jewelry', # 'necklace', 'bracelet', 'ring', 'earrings', 'watch', 'umbrella', 'cane', # # SPORTS & RECREATION (40+) # 'sports', 'ball', 'football', 'soccer', 'basketball', 'baseball', 'tennis', # 'golf', 'hockey', 'cricket', 'rugby', 'volleyball', 'badminton', 'swimming', # 'running', 'jogging', 'cycling', 'skiing', 'snowboarding', 'skating', # 'surfing', 'diving', 'fishing', 'hiking', 'camping', 'picnic', 'barbecue', # 'game', 'toy', 'doll', 'teddy bear', 'puzzle', 'chess', 'cards', 'dice', # 'instrument', 'guitar', 'piano', 'violin', 'drums', 'trumpet', 'flute', # # TECHNOLOGY & ELECTRONICS (30+) # 'technology', 'robot', 'android', 'cyborg', 'drone', 'satellite', 'microscope', # 'telescope', 'radar', 'antenna', 'engine', 'motor', 'battery', 'wire', 'cable', # 'circuit', 'chip', 'processor', 'sensor', 'lens', 'printer', 'scanner', # 'speaker', 'headphones', 'microphone', 'router', 'server', 'database', # # ABSTRACT & CONCEPTUAL (40+) # 'love', 'heart', 'emotion', 'thought', 'idea', 'concept', 'dream', 'fantasy', # 'magic', 'illusion', 'shadow', 'reflection', 'silhouette', 'pattern', 'texture', # 'color', 'rainbow', 'gradient', 'glow', 'sparkle', 'fire', 'flame', 'smoke', # 'steam', 'fog', 'mist', 'bubble', 'balloon', 'kite', 'flag', 'banner', 'sign', # 'symbol', 'logo', 'icon', 'emoji', 'avatar', 'character', 'hero', 'villain' # ] # # Total count: ~500+ objects across 10 categories # # Scene categories for classification # # self.scene_categories = [ # # "portrait", "landscape", "cityscape", "indoor scene", "outdoor scene", # # "nature", "urban", "beach", "mountain", "forest", "street", # # "party", "celebration", "sports", "action", "still life", # # "abstract", "art", "architecture", "wildlife", "pet" # # ] # self.scene_categories = [ # # PORTRAIT & PEOPLE SCENES (50+) # "portrait", "selfie", "headshot", "profile", "close-up", "full body", "group photo", # "family portrait", "couple", "friends", "team", "crowd", "audience", "classroom", # "office workers", "meeting", "conference", "presentation", "interview", "lecture", # "performance", "concert", "theater", "stage", "dance", "singing", "acting", # "yoga", "meditation", "exercise", "workout", "gym", "sports team", "marching band", # "choir", "orchestra", "parade", "protest", "rally", "ceremony", "graduation", # "wedding", "engagement", "birthday party", "anniversary", "reunion", "festival", # "costume party", "masquerade", "halloween", "cosplay", # # LANDSCAPE & NATURE SCENES (80+) # "landscape", "nature", "outdoor scene", "mountain", "valley", "hill", "cliff", # "canyon", "gorge", "volcano", "cave", "forest", "woods", "jungle", "rainforest", # "desert", "dunes", "oasis", "beach", "shore", "coastline", "cliffside", "seaside", # "island", "tropical", "coral reef", "underwater", "river", "stream", "creek", # "waterfall", "rapids", "lake", "pond", "marsh", "swamp", "wetland", "glacier", # "iceberg", "snowfield", "tundra", "meadow", "field", "pasture", "prairie", # "grassland", "savannah", "farmland", "orchard", "vineyard", "garden", "botanical", # "park", "national park", "wilderness", "camping", "hiking", "trail", "path", # "sunrise", "sunset", "golden hour", "blue hour", "night sky", "stars", "milky way", # "aurora", "northern lights", "storm", "lightning", "rain", "fog", "mist", "dew", # "frost", "ice", "snow", "blizzard", "spring", "summer", "autumn", "winter", # # URBAN & ARCHITECTURE SCENES (70+) # "cityscape", "urban", "skyline", "downtown", "metropolis", "town", "village", # "street", "alley", "road", "highway", "freeway", "bridge", "overpass", "tunnel", # "plaza", "square", "marketplace", "shopping district", "financial district", # "industrial", "factory", "warehouse", "dock", "harbor", "port", "airport", # "train station", "subway", "bus station", "architecture", "building", "skyscraper", # "tower", "modern", "contemporary", "historic", "ancient", "medieval", "gothic", # "renaissance", "baroque", "neoclassical", "art deco", "brutalist", "futuristic", # "castle", "palace", "fort", "temple", "church", "cathedral", "mosque", "synagogue", # "monument", "statue", "fountain", "lighthouse", "windmill", "silo", "stadium", # "arena", "amphitheater", "coliseum", "ruins", "archaeological site", # # INDOOR & INTERIOR SCENES (60+) # "indoor scene", "interior", "room", "living room", "bedroom", "kitchen", "bathroom", # "dining room", "study", "office", "home office", "classroom", "lecture hall", # "library", "museum", "gallery", "exhibition", "theater", "cinema", "concert hall", # "restaurant", "cafe", "bar", "pub", "nightclub", "hotel", "lobby", "hallway", # "staircase", "elevator", "basement", "attic", "garage", "workshop", "studio", # "photo studio", "recording studio", "art studio", "dance studio", "gym", "spa", # "salon", "hospital", "clinic", "laboratory", "server room", "control room", # "cockpit", "cabin", "train car", "airplane interior", "submarine", "space station", # # BEACH & WATER SCENES (40+) # "beach", "seaside", "coast", "shoreline", "pier", "boardwalk", "wharf", "dock", # "harbor", "marina", "boat", "yacht", "sailboat", "speedboat", "kayak", "canoe", # "surfing", "windsurfing", "kitesurfing", "paddleboarding", "swimming", "diving", # "snorkeling", "fishing", "beach volleyball", "sandcastle", "seashells", "tide pools", # "cliff diving", "water sports", "river rafting", "whitewater", "lake house", # "pool", "swimming pool", "hot springs", "geyser", "water park", "aquarium", # # SPORTS & ACTION SCENES (50+) # "sports", "action", "athletics", "game", "match", "competition", "tournament", # "soccer", "football", "basketball", "baseball", "tennis", "golf", "hockey", # "cricket", "rugby", "volleyball", "badminton", "table tennis", "boxing", "MMA", # "wrestling", "martial arts", "gymnastics", "diving", "swimming", "running", # "marathon", "sprinting", "hurdles", "jumping", "pole vault", "shot put", # "cycling", "mountain biking", "bmx", "skateboarding", "rollerblading", "skating", # "ice skating", "skiing", "snowboarding", "surfing", "climbing", "rock climbing", # "bouldering", "parkour", "extreme sports", "adventure", "expedition", # # VEHICLE & TRANSPORT SCENES (40+) # "traffic", "road trip", "highway", "freeway", "race", "racing", "formula 1", # "nascar", "rally", "drift", "car show", "parking lot", "garage", "showroom", # "train", "railway", "subway", "metro", "tram", "bus", "airplane", "airport", # "takeoff", "landing", "helicopter", "drone", "boat", "ship", "cruise", "ferry", # "submarine", "spacecraft", "rocket launch", "space station", "flying car", # "futuristic transport", "vintage car", "classic car", "motorcycle", "bicycle", # # WILDLIFE & ANIMAL SCENES (50+) # "wildlife", "animal", "safari", "zoo", "aquarium", "pet", "domestic", "farm", # "birdwatching", "migration", "hunting", "fishing", "bird nest", "den", "lair", # "underwater", "coral reef", "kelp forest", "deep sea", "riverbank", "lake shore", # "forest floor", "tree canopy", "cave dwelling", "desert life", "arctic", # "antarctic", "jungle", "rainforest", "savannah", "grassland", "wetland", # "bird in flight", "animal running", "animal swimming", "animal hunting", # "animal resting", "animal playing", "animal family", "endangered species", # "butterfly garden", "beehive", "ant colony", "spider web", "reptile", "amphibian", # "insect", "marine life", "whale watching", "dolphin", # # FOOD & DINING SCENES (40+) # "food", "meal", "dining", "restaurant", "cafe", "bakery", "kitchen", "cooking", # "baking", "grilling", "bbq", "picnic", "buffet", "banquet", "feast", "party food", # "street food", "market", "farmers market", "grocery", "supermarket", "harvest", # "vineyard", "winery", "brewery", "coffee shop", "tea ceremony", "sushi bar", # "pizza making", "pasta making", "chocolate factory", "ice cream parlor", # "breakfast", "brunch", "lunch", "dinner", "dessert", "snack", "cocktail", # "wine tasting", "food photography", "still life food", # # ART & CULTURAL SCENES (50+) # "art", "painting", "drawing", "sculpture", "pottery", "weaving", "knitting", # "photography", "film set", "recording", "dance", "ballet", "contemporary dance", # "theater", "play", "opera", "musical", "concert", "orchestra", "band", "choir", # "museum", "gallery", "exhibition", "installation", "street art", "graffiti", # "mural", "body art", "tattoo", "fashion", "runway", "photo shoot", "film noir", # "cinematic", "vintage", "retro", "steampunk", "cyberpunk", "fantasy", "sci-fi", # "historical reenactment", "renaissance fair", "cultural festival", "religious", # "ceremony", "ritual", "meditation", "yoga", "spiritual", # # ABSTRACT & CONCEPTUAL SCENES (40+) # "abstract", "conceptual", "surreal", "dream", "fantasy", "imagination", "illusion", # "pattern", "texture", "color", "gradient", "light", "shadow", "reflection", # "silhouette", "minimalist", "geometric", "symmetry", "asymmetry", "chaos", # "order", "motion", "blur", "long exposure", "time lapse", "macro", "microscopic", # "aerial", "bird's eye view", "worm's eye view", "perspective", "vanishing point", # "double exposure", "composite", "collage", "digital art", "generative art", # "fractal", "mandala", "kaleidoscope", "prism", # # WEATHER & ATMOSPHERIC SCENES (30+) # "sunny", "cloudy", "overcast", "rainy", "stormy", "thunderstorm", "lightning", # "tornado", "hurricane", "snowy", "blizzard", "foggy", "misty", "hazy", "smoky", # "dusty", "windy", "calm", "clear", "sunrise", "sunset", "twilight", "dusk", # "dawn", "night", "moonlit", "starry", "aurora", "rainbow", "halo", # # SEASONAL & HOLIDAY SCENES (30+) # "spring", "summer", "autumn", "fall", "winter", "christmas", "new year", # "valentine's day", "easter", "halloween", "thanksgiving", "diwali", "hannukah", # "ramadan", "eid", "chinese new year", "cherry blossom", "fall foliage", # "winter wonderland", "summer vacation", "beach holiday", "ski holiday", # "festival of lights", "fireworks", "carnival", "mardi gras", "oktoberfest", # "day of the dead", "la tomatina", "running of the bulls", # # TECHNOLOGY & FUTURISTIC SCENES (30+) # "futuristic", "sci-fi", "cyberpunk", "steampunk", "space", "galaxy", "planet", # "space station", "alien", "robot", "android", "cyborg", "AI", "virtual reality", # "augmented reality", "hologram", "neon", "LED", "circuit board", "data center", # "laboratory", "experiment", "invention", "innovation", "smart home", "smart city", # "autonomous vehicle", "drone swarm", "3D printing", "nanotechnology", # # HISTORICAL & PERIOD SCENES (30+) # "historical", "ancient", "medieval", "renaissance", "victorian", "edwardian", # "art deco", "retro", "vintage", "old west", "pirate", "samurai", "feudal", # "prehistoric", "dinosaur", "ice age", "stone age", "bronze age", "iron age", # "roman", "greek", "egyptian", "mayan", "aztec", "inca", "mongol", "vikings", # "crusades", "world war", "colonial", "industrial revolution" # ] # # Total: 500+ scene categories across 15 major themes # def analyze_image_with_clip(self, image): # """Analyze image using CLIP to understand content and scene""" # if self.clip_model is None or self.clip_processor is None: # return self.fallback_image_analysis(image) # try: # # Convert PIL to RGB # image_rgb = image.convert('RGB') # # Analyze objects in the image # object_inputs = self.clip_processor( # text=self.common_objects, # images=image_rgb, # return_tensors="pt", # padding=True # ) # with torch.no_grad(): # object_outputs = self.clip_model(**object_inputs) # object_logits = object_outputs.logits_per_image # object_probs = object_logits.softmax(dim=1) # # Get top objects # top_object_indices = torch.topk(object_probs, 5, dim=1).indices[0] # detected_objects = [] # for idx in top_object_indices: # obj_name = self.common_objects[idx] # confidence = object_probs[0][idx].item() # if confidence > 0.1: # Confidence threshold # detected_objects.append({ # 'name': obj_name, # 'confidence': confidence # }) # # Analyze scene type # scene_inputs = self.clip_processor( # text=self.scene_categories, # images=image_rgb, # return_tensors="pt", # padding=True # ) # with torch.no_grad(): # scene_outputs = self.clip_model(**scene_inputs) # scene_logits = scene_outputs.logits_per_image # scene_probs = scene_logits.softmax(dim=1) # top_scene_indices = torch.topk(scene_probs, 3, dim=1).indices[0] # scene_types = [] # for idx in top_scene_indices: # scene_name = self.scene_categories[idx] # confidence = scene_probs[0][idx].item() # scene_types.append({ # 'type': scene_name, # 'confidence': confidence # }) # return { # 'objects': detected_objects, # 'scenes': scene_types, # 'success': True # } # except Exception as e: # print(f"CLIP analysis failed: {e}") # return self.fallback_image_analysis(image) # def fallback_image_analysis(self, image): # """Fallback analysis when CLIP fails""" # return { # 'objects': [{'name': 'scene', 'confidence': 1.0}], # 'scenes': [{'type': 'general image', 'confidence': 1.0}], # 'success': False # } # def generate_story(self, analysis_result, creativity_level=0.7): # """Generate a story with caption based on detected objects and scene using Qwen""" # if self.llm_model is None: # return "Story generation model not available." # try: # # Extract detected objects and scene # objects = [obj['name'] for obj in analysis_result['objects']] # scenes = [scene['type'] for scene in analysis_result['scenes']] # # Create a prompt for the LLM # objects_str = ", ".join(objects) # scene_str = scenes[0] if scenes else "general scene" # # Convert creativity_level to float if it's a tuple # if isinstance(creativity_level, (tuple, list)): # creativity_level = float(creativity_level[0]) # # SIMPLIFIED PROMPT - No numbered lists or complex formatting # if creativity_level > 0.8: # prompt = f"Write a catchy 5-7 word YouTube-style caption, then a creative 3-4 paragraph story about {objects_str} in a {scene_str}." # elif creativity_level > 0.5: # prompt = f"Create a short caption and a 2-3 paragraph story about {objects_str} in a {scene_str}." # else: # prompt = f"Write a caption and a 1-2 paragraph description of {objects_str} in a {scene_str}." # # QWEN FORMATTING # if "qwen" in self.llm_model_id.lower(): # formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" # # elif "phi" in self.llm_model_id: # # formatted_prompt = f"Instruct: {prompt}\nOutput:" # # elif "gemma" in self.llm_model_id: # # formatted_prompt = f"user\n{prompt}\nmodel\n" # else: # formatted_prompt = f"User: {prompt}\nAssistant:" # # Tokenize and generate # inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.llm_model.device) # with torch.no_grad(): # outputs = self.llm_model.generate( # **inputs, # max_new_tokens=300, # temperature=creativity_level, # do_sample=True, # top_p=0.9, # repetition_penalty=1.1, # eos_token_id=self.tokenizer.eos_token_id, # pad_token_id=self.tokenizer.eos_token_id, # no_repeat_ngram_size=3 # ) # # Decode and clean up # raw_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True) # # Extract only the assistant's response # if "assistant" in raw_output.lower(): # parts = raw_output.lower().split("assistant") # if len(parts) > 1: # story = parts[-1].strip() # else: # story = raw_output # elif "Assistant:" in raw_output: # parts = raw_output.split("Assistant:") # story = parts[-1].strip() if len(parts) > 1 else raw_output # else: # story = raw_output # # Clean Qwen tokens if present # qwen_tokens = ["<|im_start|>", "<|im_end|>", "<|endoftext|>"] # for token in qwen_tokens: # story = story.replace(token, "").strip() # # Clean any remaining prompt text # story = story.replace(prompt, "").strip() # # Extract or create caption from the story # sentences = story.split('. ') # if sentences: # # Take first sentence as caption # caption = sentences[0].strip() # if not caption.endswith('.'): # caption += '.' # # Rest of the story # if len(sentences) > 1: # story_text = '. '.join(sentences[1:]) # else: # story_text = story.replace(caption, "").strip() # # Format with caption at top and separator # formatted_output = f"{caption}\n{'─' * 40}\n{story_text}" # else: # formatted_output = story # # Clean up any extra whitespace # formatted_output = '\n'.join([line.strip() for line in formatted_output.split('\n') if line.strip()]) # return formatted_output # except Exception as e: # print(f"Story generation failed: {e}") # objects_str = ", ".join(objects) if 'objects' in locals() else "unknown" # scene_str = scenes[0] if 'scenes' in locals() and scenes else "unknown scene" # return f"Caption: Analysis of {objects_str}\n{'─' * 40}\nFailed to generate story. Detected: {objects_str} in {scene_str}." # def process_image_and_generate_story(self, image, creativity_level=0.7): # """Complete pipeline: analyze image and generate story""" # print("Analyzing image...") # analysis = self.analyze_image_with_clip(image) # print("Generating story...") # story = self.generate_story(analysis, creativity_level) # # Return both analysis and story # detected_objects = [obj['name'] for obj in analysis['objects']] # scene_type = analysis['scenes'][0]['type'] if analysis['scenes'] else "unknown" # return story, detected_objects, scene_type # def create_story_overlay(self, image, story): # """Create formatted text with caption and story for textbox display""" # # Generate caption (first sentence of the story) # caption = "" # sentences = story.split('. ') # if sentences: # caption = sentences[0].strip() # if not caption.endswith('.'): # caption += '.' # # Format the text with caption separated from story # # Using a separator line of dashes # separator = "─" * 40 # # Format the complete text for the textbox # formatted_text = f"{caption}\n{separator}\n{story}" # return formatted_text # def remove_background(self, image): # """Remove background using rembg""" # try: # # Convert PIL image to bytes # img_byte_arr = BytesIO() # image.save(img_byte_arr, format='PNG') # img_byte_arr = img_byte_arr.getvalue() # # Remove background # output = rembg.remove(img_byte_arr) # # Convert back to PIL Image # result_image = Image.open(BytesIO(output)) # return result_image # except Exception as e: # print(f"Background removal failed: {e}") # return image # def remove_foreground(self, image): # """Remove foreground and keep only background using inpainting""" # try: # # First remove background to get foreground mask # img_byte_arr = BytesIO() # image.save(img_byte_arr, format='PNG') # img_byte_arr = img_byte_arr.getvalue() # # Remove background to get alpha channel # output = rembg.remove(img_byte_arr) # foreground_image = Image.open(BytesIO(output)) # # Convert to numpy arrays # original_np = np.array(image.convert('RGB')) # foreground_np = np.array(foreground_image.convert('RGBA')) # # Create mask where foreground exists (alpha > 0) # mask = foreground_np[:, :, 3] > 0 # # Create background-only image by filling foreground areas # background_np = original_np.copy() # # Simple inpainting: fill foreground areas with average background color # # Calculate average background color from areas without foreground # bg_pixels = original_np[~mask] # if len(bg_pixels) > 0: # avg_color = np.mean(bg_pixels, axis=0) # background_np[mask] = avg_color.astype(np.uint8) # return Image.fromarray(background_np) # except Exception as e: # print(f"Foreground removal failed: {e}") # return image # def process_image(self, image): # """Main processing function""" # try: # # Analyze image with CLIP-ViT # analysis_result = self.analyze_image_with_clip(image) # # Generate story # story = self.generate_story(analysis_result, creativity_level=0.7) # # # Create analysis overlay # # analysis_image = self.create_analysis_overlay(image, analysis_result) # # Create story overlay # story_image = self.create_story_overlay(image, story) # return story_image # except Exception as e: # error_msg = f"An error occurred: {str(e)}" # print(error_msg) # # Return original images on error # return image, image # # Initialize the storyteller # storyteller = ImageStoryteller() # # Get example images from local directory # def get_example_images(): # """Get example images from local directory""" # example_images = [] # for i in range(1, 21): # img_path = f"obj_{i:02d}.jpg" # if os.path.exists(img_path): # # Load and resize the image for the gallery # img = Image.open(img_path) # # Resize to smaller size for gallery display # img.thumbnail((150, 150)) # example_images.append(img) # else: # print(f"Warning: {img_path} not found") # # Create a simple placeholder image # placeholder = Image.new('RGB', (150, 150), color=(73, 109, 137)) # example_images.append(placeholder) # return example_images # def load_selected_example(evt: gr.SelectData): # """Load the full-size version of the selected example image""" # if evt.index < 20: # We have 8 example images # img_path = f"obj_{evt.index+1:02d}.jpg" # if os.path.exists(img_path): # return Image.open(img_path) # return None # # Create Gradio interface # with gr.Blocks(title="Who says AI isn’t creative? Watch it turn a single image into a beautifully written story", theme=gr.themes.Soft()) as demo: # gr.Markdown("# Image Story Teller") # gr.Markdown("**Upload an image to analyse content and generate stories**") # # Load example images # example_images_list = get_example_images() # custom_css = """ # # """ # javascript = """ # # """ # with gr.Row(): # with gr.Column(): # input_image = gr.Image( # type="pil", # label="πŸ–ΌοΈ Upload Your Image", # height=400 # ) # # Buttons row # with gr.Row(): # process_btn = gr.Button("✨ Generate Story", variant="primary", size="lg") # clear_btn = gr.Button("πŸ—‘οΈ Clear Image", variant="secondary", size="lg") # # Example images section # gr.Markdown("### πŸ“Έ Example Images (Click to load)") # # Display example images in a gallery with custom CSS to remove frames # example_gallery = gr.Gallery( # value=example_images_list, # label="", # columns=4, # rows=2, # height="auto", # object_fit="contain", # show_label=False, # container=True, # preview=False, # allow_preview=False, # elem_id="example-gallery" # ) # with gr.Column(): # story_output = gr.Textbox( # label="πŸ” Story", # lines=10, # max_lines=20, # interactive=False, # autoscroll=False # ) # # with gr.Row(): # # with gr.Column(): # # story_output = gr.Image( # # label="πŸ“– Story", # # height=400, # # show_download_button=True # # ) # # Background removal section # with gr.Row(): # with gr.Column(): # bg_remove_btn = gr.Button("🎯 Remove Background", variant="secondary", size="lg") # background_output = gr.Image( # label="Background Removed", # height=400, # ) # with gr.Column(): # fg_remove_btn = gr.Button("🎯 Remove Foreground", variant="secondary", size="lg") # foreground_output = gr.Image( # label="Foreground Removed", # height=400, # ) # def clear_all(): # """Clear all images and outputs""" # return None, None, None, None, None # # Set up the processing # process_btn.click( # fn=storyteller.process_image, # inputs=input_image, # outputs=[story_output] # ) # # Clear button functionality # clear_btn.click( # fn=clear_all, # inputs=[], # outputs=[input_image, story_output, background_output, foreground_output] # ) # # Example gallery selection - load full-size image when clicked # example_gallery.select( # fn=load_selected_example, # inputs=[], # outputs=input_image # ) # # Background removal # bg_remove_btn.click( # fn=storyteller.remove_background, # inputs=input_image, # outputs=background_output # ) # # Foreground removal # fg_remove_btn.click( # fn=storyteller.remove_foreground, # inputs=input_image, # outputs=foreground_output # ) # # Launch the application # if __name__ == "__main__": # demo.launch( # server_name="0.0.0.0", # server_port=7860, # share=False # ) import gradio as gr import cv2 import numpy as np from PIL import Image import torch from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModelForCausalLM import rembg from io import BytesIO import os import warnings # Suppress warnings warnings.filterwarnings("ignore") class ImageStoryteller: def __init__(self, llm_model_id="Qwen/Qwen2.5-3B-Instruct"): print("Initializing Image Storyteller with CLIP-ViT and LLM...") # Load CLIP model for image understanding try: self.clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K") self.clip_processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K") print("CLIP-ViT model loaded successfully!") except Exception as e: print(f"CLIP loading failed: {e}") self.clip_model = None self.clip_processor = None # Load LLM for story generation try: self.llm_model_id = llm_model_id self.tokenizer = AutoTokenizer.from_pretrained(llm_model_id) self.llm_model = AutoModelForCausalLM.from_pretrained( llm_model_id, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, device_map="auto" if torch.cuda.is_available() else None, trust_remote_code=True ) print(f"LLM model {llm_model_id} loaded successfully!") except Exception as e: print(f"LLM loading failed: {e}") self.llm_model = None self.tokenizer = None # Common objects and scenes (truncated for brevity - keep your full lists) self.common_objects = [ # People & Faces (25 items) 'person', 'people', 'human', 'man', 'woman', 'child', 'baby', 'face', 'head', 'hand', 'foot', 'body', 'crowd', 'group', 'family', 'couple', 'friends', 'audience', 'team', 'worker', 'athlete', 'dancer', 'singer', 'artist', 'doctor', # Animals (25 items) 'dog', 'cat', 'bird', 'horse', 'cow', 'sheep', 'pig', 'elephant', 'lion', 'tiger', 'bear', 'wolf', 'fox', 'deer', 'rabbit', 'squirrel', 'butterfly', 'fish', 'shark', 'whale', 'dolphin', 'turtle', 'snake', 'spider', 'insect', # Vehicles (20 items) 'car', 'truck', 'bus', 'motorcycle', 'bicycle', 'airplane', 'helicopter', 'train', 'boat', 'ship', 'sailboat', 'submarine', 'rocket', 'tractor', 'ambulance', 'fire truck', 'police car', 'taxi', 'racing car', 'bike', # Buildings & Structures (20 items) 'building', 'house', 'skyscraper', 'tower', 'castle', 'bridge', 'monument', 'statue', 'fountain', 'church', 'temple', 'mosque', 'school', 'hospital', 'hotel', 'restaurant', 'store', 'mall', 'factory', 'lighthouse', # Nature & Outdoor (30 items) 'tree', 'forest', 'flower', 'plant', 'grass', 'mountain', 'hill', 'valley', 'cliff', 'cave', 'water', 'ocean', 'sea', 'river', 'lake', 'waterfall', 'beach', 'sand', 'rock', 'stone', 'sky', 'cloud', 'sun', 'moon', 'star', 'rain', 'snow', 'ice', 'fire', 'smoke', # Food & Drinks (20 items) 'food', 'fruit', 'vegetable', 'bread', 'pizza', 'cake', 'dessert', 'ice cream', 'chocolate', 'coffee', 'tea', 'wine', 'beer', 'water', 'meal', 'breakfast', 'lunch', 'dinner', 'restaurant', 'kitchen', # Furniture & Household (20 items) 'chair', 'table', 'bed', 'sofa', 'couch', 'desk', 'lamp', 'clock', 'mirror', 'window', 'door', 'stairs', 'shelf', 'cabinet', 'refrigerator', 'oven', 'sink', 'toilet', 'shower', 'bathtub', # Electronics & Items (20 items) 'computer', 'laptop', 'phone', 'television', 'camera', 'book', 'newspaper', 'pen', 'paper', 'keyboard', 'mouse', 'headphones', 'speaker', 'microphone', 'watch', 'glasses', 'sunglasses', 'umbrella', 'bag', 'backpack', # Clothing (20 items) 'clothing', 'shirt', 'pants', 'dress', 'skirt', 'jacket', 'coat', 'shoes', 'boots', 'sneakers', 'hat', 'cap', 'helmet', 'gloves', 'scarf', 'tie', 'belt', 'jewelry', 'necklace', 'ring' ] self.scene_categories = [ # 50 Most Common Scene Types "portrait", "landscape", "cityscape", "indoor", "outdoor", "nature", "urban", "beach", "mountain", "forest", "street", "road", "park", "garden", "field", "room", "kitchen", "bedroom", "living room", "office", "restaurant", "cafe", "store", "mall", "school", "sports", "game", "concert", "party", "wedding", "food", "meal", "cooking", "drinking", "eating", "animal", "pet", "wildlife", "zoo", "farm", "vehicle", "traffic", "transportation", "travel", "journey", "art", "painting", "drawing", "photography", "design" ] def analyze_image_with_clip(self, image): """Analyze image using CLIP to understand content and scene""" if self.clip_model is None or self.clip_processor is None: return self.fallback_image_analysis(image) try: # Convert PIL to RGB image_rgb = image.convert('RGB') # Analyze objects in the image (process in smaller batches) batch_size = 50 all_object_probs = [] for i in range(0, len(self.common_objects), batch_size): batch_objects = self.common_objects[i:i + batch_size] object_inputs = self.clip_processor( text=batch_objects, images=image_rgb, return_tensors="pt", padding=True ) with torch.no_grad(): object_outputs = self.clip_model(**object_inputs) object_logits = object_outputs.logits_per_image object_probs = object_logits.softmax(dim=1) all_object_probs.append(object_probs) # Combine results if len(all_object_probs) > 1: # Stack and normalize combined_probs = torch.cat(all_object_probs, dim=1) combined_probs = combined_probs / combined_probs.sum(dim=1, keepdim=True) else: combined_probs = all_object_probs[0] # Get top objects top_k = min(5, len(self.common_objects)) top_object_indices = torch.topk(combined_probs, top_k, dim=1).indices[0] detected_objects = [] for idx in top_object_indices: obj_name = self.common_objects[idx] confidence = combined_probs[0][idx].item() if confidence > 0.1: detected_objects.append({ 'name': obj_name, 'confidence': confidence }) # Analyze scene type (similar batching) batch_size = 30 all_scene_probs = [] for i in range(0, len(self.scene_categories), batch_size): batch_scenes = self.scene_categories[i:i + batch_size] scene_inputs = self.clip_processor( text=batch_scenes, images=image_rgb, return_tensors="pt", padding=True ) with torch.no_grad(): scene_outputs = self.clip_model(**scene_inputs) scene_logits = scene_outputs.logits_per_image scene_probs = scene_logits.softmax(dim=1) all_scene_probs.append(scene_probs) # Combine scene results if len(all_scene_probs) > 1: combined_scene_probs = torch.cat(all_scene_probs, dim=1) combined_scene_probs = combined_scene_probs / combined_scene_probs.sum(dim=1, keepdim=True) else: combined_scene_probs = all_scene_probs[0] top_scene_indices = torch.topk(combined_scene_probs, 3, dim=1).indices[0] scene_types = [] for idx in top_scene_indices: scene_name = self.scene_categories[idx] confidence = combined_scene_probs[0][idx].item() scene_types.append({ 'type': scene_name, 'confidence': confidence }) return { 'objects': detected_objects, 'scenes': scene_types, 'success': True } except Exception as e: print(f"CLIP analysis failed: {e}") return self.fallback_image_analysis(image) def fallback_image_analysis(self, image): """Fallback analysis when CLIP fails""" return { 'objects': [{'name': 'scene', 'confidence': 1.0}], 'scenes': [{'type': 'general image', 'confidence': 1.0}], 'success': False } def generate_story(self, analysis_result, creativity_level=0.7): """Generate a story with caption based on detected objects and scene""" if self.llm_model is None: return "Story generation model not available." try: # Extract detected objects and scene objects = [obj['name'] for obj in analysis_result['objects']] scenes = [scene['type'] for scene in analysis_result['scenes']] # Create a prompt for the LLM objects_str = ", ".join(objects[:5]) # Use top 5 objects scene_str = scenes[0] if scenes else "general scene" # Convert creativity_level to float if needed if isinstance(creativity_level, (tuple, list)): creativity_level = float(creativity_level[0]) # Simple prompt prompt = f"Write a creative story about {objects_str} in a {scene_str}. First give a short caption, then a story." # Format for Qwen if "qwen" in self.llm_model_id.lower(): messages = [ {"role": "user", "content": prompt} ] formatted_prompt = self.tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) else: formatted_prompt = f"User: {prompt}\nAssistant:" # Tokenize and generate inputs = self.tokenizer(formatted_prompt, return_tensors="pt") if torch.cuda.is_available(): inputs = {k: v.to(self.llm_model.device) for k, v in inputs.items()} with torch.no_grad(): outputs = self.llm_model.generate( **inputs, max_new_tokens=300, temperature=creativity_level, do_sample=True, top_p=0.9, repetition_penalty=1.1, pad_token_id=self.tokenizer.eos_token_id, ) # Decode output story = self.tokenizer.decode(outputs[0], skip_special_tokens=True) # Extract only the assistant's response if "assistant" in story.lower(): parts = story.lower().split("assistant") story = parts[-1].strip() elif "Assistant:" in story: parts = story.split("Assistant:") story = parts[-1].strip() if len(parts) > 1 else story # Clean up story = story.replace(prompt, "").strip() # Format with separator lines = story.split('\n') if len(lines) > 1: formatted = f"{lines[0]}\n{'─' * 40}\n" + '\n'.join(lines[1:]) else: formatted = story return formatted except Exception as e: print(f"Story generation failed: {e}") return f"Error generating story: {str(e)}" def process_image_and_generate_story(self, image, creativity_level=0.7): """Complete pipeline: analyze image and generate story""" if image is None: return "Please upload an image first.", [], "No image" print("Analyzing image...") analysis = self.analyze_image_with_clip(image) print("Generating story...") story = self.generate_story(analysis, creativity_level) # Return analysis details detected_objects = [obj['name'] for obj in analysis['objects']] scene_type = analysis['scenes'][0]['type'] if analysis['scenes'] else "unknown" return story, detected_objects, scene_type def remove_background(self, image): """Remove background using rembg""" if image is None: return None try: img_byte_arr = BytesIO() image.save(img_byte_arr, format='PNG') img_byte_arr = img_byte_arr.getvalue() output = rembg.remove(img_byte_arr) result_image = Image.open(BytesIO(output)) return result_image except Exception as e: print(f"Background removal failed: {e}") return image def remove_foreground(self, image): """Remove foreground and keep only background""" if image is None: return None try: # Remove background first img_byte_arr = BytesIO() image.save(img_byte_arr, format='PNG') img_byte_arr = img_byte_arr.getvalue() output = rembg.remove(img_byte_arr) foreground_image = Image.open(BytesIO(output)) # Convert to numpy arrays original_np = np.array(image.convert('RGB')) foreground_np = np.array(foreground_image.convert('RGBA')) # Create mask mask = foreground_np[:, :, 3] > 0 # Create background-only image background_np = original_np.copy() # Fill foreground areas with average background color bg_pixels = original_np[~mask] if len(bg_pixels) > 0: avg_color = np.mean(bg_pixels, axis=0) background_np[mask] = avg_color.astype(np.uint8) return Image.fromarray(background_np) except Exception as e: print(f"Foreground removal failed: {e}") return image # Initialize the storyteller storyteller = ImageStoryteller() def get_example_images(): """Get example images from local directory""" example_images = [] for i in range(1, 21): img_path = f"obj_{i:02d}.jpg" if os.path.exists(img_path): try: img = Image.open(img_path) img.thumbnail((150, 150)) example_images.append((img, f"Example {i}")) except: placeholder = Image.new('RGB', (150, 150), color=(73, 109, 137)) example_images.append((placeholder, f"Placeholder {i}")) return example_images def load_selected_example(evt: gr.SelectData): """Load the full-size version of the selected example image""" if evt.index < 20: img_path = f"obj_{evt.index+1:02d}.jpg" if os.path.exists(img_path): return Image.open(img_path) return None def clear_all(): """Clear all inputs and outputs""" return None, "", "", "", None, None # Create Gradio interface with gr.Blocks( title="Image Story Teller - Turn images into stories", theme=gr.themes.Soft(), css=""" .gradio-container { max-width: 1400px !important; margin: auto !important; } .gallery .thumb { border: none !important; box-shadow: none !important; } .gallery .thumb.selected { border: 2px solid #4CAF50 !important; } """ ) as demo: gr.Markdown("# 🎨 Image Story Teller") gr.Markdown("Upload an image to analyze content and generate creative stories") # Get example images example_images_list = get_example_images() with gr.Row(): with gr.Column(scale=1): input_image = gr.Image( type="pil", label="πŸ“€ Upload Your Image", height=400, interactive=True ) with gr.Row(): process_btn = gr.Button( "✨ Generate Story", variant="primary", size="lg" ) clear_btn = gr.Button( "πŸ—‘οΈ Clear All", variant="secondary", size="lg" ) # Creativity slider creativity_slider = gr.Slider( minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Creativity Level", info="Higher = more creative, Lower = more factual" ) gr.Markdown("### πŸ“Έ Example Images") example_gallery = gr.Gallery( value=[img for img, _ in example_images_list], label="Click an image to load it", columns=4, rows=2, height="auto", object_fit="contain", show_label=True, allow_preview=False, preview=False ) with gr.Column(scale=1): story_output = gr.Textbox( label="πŸ“– Generated Story", lines=15, max_lines=20, interactive=False, # show_copy_button=True ) with gr.Accordion("πŸ“Š Analysis Details", open=False): objects_output = gr.Textbox( label="Detected Objects", interactive=False, lines=3 ) scene_output = gr.Textbox( label="Scene Type", interactive=False, lines=2 ) with gr.Row(): with gr.Column(): bg_remove_btn = gr.Button( "🎯 Remove Background", variant="secondary", size="lg" ) background_output = gr.Image( label="Background Removed", height=300, interactive=False ) with gr.Column(): fg_remove_btn = gr.Button( "🎯 Remove Foreground", variant="secondary", size="lg" ) foreground_output = gr.Image( label="Foreground Removed", height=300, interactive=False ) # Event handlers process_btn.click( fn=lambda img, creativity: storyteller.process_image_and_generate_story(img, creativity), inputs=[input_image, creativity_slider], outputs=[story_output, objects_output, scene_output] ) clear_btn.click( fn=clear_all, inputs=[], outputs=[input_image, story_output, objects_output, scene_output, background_output, foreground_output] # ALL outputs here ) example_gallery.select( fn=load_selected_example, inputs=[], outputs=input_image ) bg_remove_btn.click( fn=storyteller.remove_background, inputs=input_image, outputs=background_output ) fg_remove_btn.click( fn=storyteller.remove_foreground, inputs=input_image, outputs=foreground_output ) # Launch the application if __name__ == "__main__": demo.launch( server_name="0.0.0.0", server_port=7860, share=False, show_error=True, favicon_path=None, inbrowser=True )