Update app.py
app.py CHANGED
@@ -273,43 +273,22 @@ class ImageStoryteller:
             scenes = [scene['type'] for scene in analysis_result['scenes']]
 
             # Create a prompt for the LLM
-            objects_str = ", ".join(objects)
+            objects_str = ", ".join(objects)
             scene_str = scenes[0] if scenes else "general scene"
 
-            #
+            # Convert creativity_level to float if it's a tuple
             if isinstance(creativity_level, (tuple, list)):
                 creativity_level = float(creativity_level[0])
 
-            #
+            # SIMPLIFIED PROMPT - No numbered lists or complex formatting
            if creativity_level > 0.8:
-                prompt = f"
-
-            1. First, write a catchy 5-7 word YouTube-style caption (engaging, attention-grabbing)
-            2. Then, write a creative and imaginative short story (3-4 paragraphs)
-
-            Format exactly like this:
-            CAPTION: [your catchy caption here]
-            STORY: [your creative story here]"""
+                prompt = f"Write a catchy 5-7 word YouTube-style caption, then a creative 3-4 paragraph story about {objects_str} in a {scene_str}."
             elif creativity_level > 0.5:
-                prompt = f"
-
-            1. Create a short, interesting caption (5-7 words)
-            2. Write a 2-3 paragraph story about what's happening in this scene
-
-            Format:
-            CAPTION: [your caption here]
-            STORY: [your story here]"""
+                prompt = f"Create a short caption and a 2-3 paragraph story about {objects_str} in a {scene_str}."
             else:
-                prompt = f"
-
-            1. Give a simple, descriptive caption
-            2. Write a 1-2 paragraph description
-
-            Format:
-            CAPTION: [caption here]
-            STORY: [description here]"""
+                prompt = f"Write a caption and a 1-2 paragraph description of {objects_str} in a {scene_str}."
 
-            # QWEN
+            # QWEN FORMATTING
             if "qwen" in self.llm_model_id.lower():
                 formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
             elif "phi" in self.llm_model_id:
@@ -317,103 +296,77 @@ class ImageStoryteller:
             elif "gemma" in self.llm_model_id:
                 formatted_prompt = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
             else:
-                formatted_prompt = f"{prompt}\
+                formatted_prompt = f"User: {prompt}\nAssistant:"
 
             # Tokenize and generate
             inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.llm_model.device)
 
             with torch.no_grad():
-                    )
-                else:
-                    outputs = self.llm_model.generate(
-                        **inputs,
-                        max_new_tokens=300,
-                        temperature=creativity_level,
-                        do_sample=True,
-                        top_p=0.9,
-                        repetition_penalty=1.1,
-                        pad_token_id=self.tokenizer.eos_token_id
-                    )
+                outputs = self.llm_model.generate(
+                    **inputs,
+                    max_new_tokens=300,
+                    temperature=creativity_level,
+                    do_sample=True,
+                    top_p=0.9,
+                    repetition_penalty=1.1,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    no_repeat_ngram_size=3
+                )
 
             # Decode and clean up
+            raw_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-            #
-            if "
-            for 
-            if len(
-                    temperature=0.8,
-                    do_sample=True
-                )
-                story = self.tokenizer.decode(simple_outputs[0], skip_special_tokens=True)
-                story = story.replace(fallback_prompt, "").strip()
-            # Add separator
-            sentences = story.split('. ')
-            if sentences:
-                caption = sentences[0].strip()
-                if not caption.endswith('.'):
-                    caption += '.'
-                rest_of_story = '. '.join(sentences[1:]) if len(sentences) > 1 else story
-                story = f"{caption}\n{'─' * 40}\n{rest_of_story}"
-
-            return story
+            # Extract only the assistant's response
+            if "assistant" in raw_output.lower():
+                parts = raw_output.lower().split("assistant")
+                if len(parts) > 1:
+                    story = parts[-1].strip()
+                else:
+                    story = raw_output
+            elif "Assistant:" in raw_output:
+                parts = raw_output.split("Assistant:")
+                story = parts[-1].strip() if len(parts) > 1 else raw_output
+            else:
+                story = raw_output
+
+            # Clean Qwen tokens if present
+            qwen_tokens = ["<|im_start|>", "<|im_end|>", "<|endoftext|>"]
+            for token in qwen_tokens:
+                story = story.replace(token, "").strip()
+
+            # Clean any remaining prompt text
+            story = story.replace(prompt, "").strip()
+
+            # Extract or create caption from the story
+            sentences = story.split('. ')
+            if sentences:
+                # Take first sentence as caption
+                caption = sentences[0].strip()
+                if not caption.endswith('.'):
+                    caption += '.'
+
+                # Rest of the story
+                if len(sentences) > 1:
+                    story_text = '. '.join(sentences[1:])
+                else:
+                    story_text = story.replace(caption, "").strip()
+
+                # Format with caption at top and separator
+                formatted_output = f"{caption}\n{'─' * 40}\n{story_text}"
+            else:
+                formatted_output = story
+
+            # Clean up any extra whitespace
+            formatted_output = '\n'.join([line.strip() for line in formatted_output.split('\n') if line.strip()])
+
+            return formatted_output
 
         except Exception as e:
             print(f"Story generation failed: {e}")
             objects_str = ", ".join(objects) if 'objects' in locals() else "unknown"
             scene_str = scenes[0] if 'scenes' in locals() and scenes else "unknown scene"
-            return f"
+            return f"Caption: Analysis of {objects_str}\n{'─' * 40}\nFailed to generate story. Detected: {objects_str} in {scene_str}."
 
     def process_image_and_generate_story(self, image, creativity_level=0.7):
         """Complete pipeline: analyze image and generate story"""
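For reference, the new prompt construction and sampling settings can be exercised outside the Space. The sketch below is a minimal stand-in rather than the app's pipeline: the model id, detected objects, and scene name are placeholder assumptions, and it hard-codes the Qwen chat format that app.py selects when "qwen" appears in llm_model_id.

# Minimal standalone sketch (assumptions: Qwen/Qwen2.5-0.5B-Instruct as a stand-in
# checkpoint, hard-coded example detections). Mirrors the commit's prompts and
# generate() settings, not the Space's ImageStoryteller wiring.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"   # placeholder; the Space uses its own llm_model_id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

objects_str = "a dog, a frisbee"          # placeholder for detected objects
scene_str = "park"                        # placeholder for the detected scene type
creativity_level = 0.9

# Same thresholds and simplified prompts as the updated app.py
if creativity_level > 0.8:
    prompt = f"Write a catchy 5-7 word YouTube-style caption, then a creative 3-4 paragraph story about {objects_str} in a {scene_str}."
elif creativity_level > 0.5:
    prompt = f"Create a short caption and a 2-3 paragraph story about {objects_str} in a {scene_str}."
else:
    prompt = f"Write a caption and a 1-2 paragraph description of {objects_str} in a {scene_str}."

# Qwen chat formatting, as in the diff
formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        temperature=creativity_level,
        do_sample=True,
        top_p=0.9,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The newly added eos_token_id and no_repeat_ngram_size arguments pin the stop token explicitly and block repeated 3-grams, which helps keep the single sampling path usable across the supported models.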
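The caption/story formatting added at the end of the method is plain string handling and can be checked in isolation. A small sketch follows; the function name and sample text are illustrative only, not part of app.py.

def split_caption_and_story(story: str) -> str:
    """Mirror of the commit's post-processing: the first sentence becomes the
    caption, the rest becomes the story, separated by a horizontal rule."""
    sentences = story.split('. ')
    caption = sentences[0].strip()
    if not caption.endswith('.'):
        caption += '.'

    if len(sentences) > 1:
        story_text = '. '.join(sentences[1:])
    else:
        story_text = story.replace(caption, "").strip()

    formatted = f"{caption}\n{'─' * 40}\n{story_text}"
    # Strip blank lines and stray indentation, as app.py does at the end
    return '\n'.join(line.strip() for line in formatted.split('\n') if line.strip())


print(split_caption_and_story(
    "A golden afternoon in the park. The dog chased the frisbee across the grass. "
    "Nobody wanted the game to end."
))

Deriving the caption from the generated text itself is consistent with the commit dropping the old CAPTION:/STORY: output format from the prompts. One caveat in the committed version: when the decoded output contains the word "assistant", the extraction step splits raw_output.lower(), so the text that reaches this formatting stage is lowercased.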