Spaces:
Runtime error
Runtime error
| # -*- coding: utf-8 -*- | |
| """RecipeGenius - AI Recipe Recommendation System for Hugging Face Spaces""" | |
| import pandas as pd | |
| import numpy as np | |
| from datasets import load_dataset | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM | |
| import gradio as gr | |
| import pickle | |
| import os | |
| from huggingface_hub import login | |
# Startup banner: summarises what the app does and which dataset it uses.
print(
    "π³ RecipeGenius - AI Recipe Recommendation System",
    "=" * 70,
    "Modality: Text",
    "Use Case: Recipe recommendation based on ingredients",
    "System Goal: Input ingredients β 3 similar recipes + 1 AI-generated recipe",
    "Dataset: Hieu-Pham/kaggle_food_recipes",
    sep="\n",
)

# ===================================================================
# 1. INTRODUCTION & DATASET
# ===================================================================
print("\n" + "=" * 50, "1. INTRODUCTION & DATASET", "=" * 50, sep="\n")
def load_recipe_dataset(dataset_name="Hieu-Pham/kaggle_food_recipes", split="train[:2000]"):
    """
    Load a recipe dataset from Hugging Face and prepare it for embedding.

    The dataset is expected to provide ``Ingredients`` and ``Instructions``
    columns (and optionally ``Title``). Rows missing either required column
    value are dropped and the index is reset so that labels match positions.

    Args:
        dataset_name: Hugging Face dataset identifier passed to ``load_dataset``.
        split: Split expression passed to ``load_dataset``.

    Returns:
        A pandas DataFrame with the original columns plus ``ingredients_text``,
        ``directions_text``, ``combined_text`` and ``title``.

    Raises:
        KeyError: If the ``Ingredients`` or ``Instructions`` column is missing.
    """
    print("π₯ Loading recipe dataset from Hugging Face...")
    dataset = load_dataset(dataset_name, split=split)
    df = pd.DataFrame(dataset)

    print(f"Dataset columns: {df.columns.tolist()}")
    print(f"Dataset shape: {df.shape}")

    required = ['Ingredients', 'Instructions']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Dataset '{dataset_name}' is missing required columns: {missing}")

    # Positional lookups (iloc) are used downstream, so keep labels == positions.
    df = df.dropna(subset=required).reset_index(drop=True)

    # Ingredients may arrive as a stringified Python list; strip the list
    # markup in one pass. Missing values were already removed by dropna above.
    list_markup = str.maketrans('', '', "[]'")
    df['ingredients_text'] = df['Ingredients'].map(lambda value: str(value).translate(list_markup))
    df['directions_text'] = df['Instructions'].map(str)

    # Combined text is kept for callers that want to embed ingredients + steps.
    df['combined_text'] = df['ingredients_text'] + " " + df['directions_text']

    # Title is only used for display.
    if 'Title' in df.columns:
        df['title'] = df['Title'].fillna('Untitled Recipe')
    else:
        df['title'] = 'Recipe ' + df.index.astype(str)

    print("β Dataset loaded successfully!")
    print("π Dataset info:")
    print(f" - Source: Hugging Face '{dataset_name}' dataset")
    print(f" - Size: {len(df)} recipes")
    print(" - Key features: Title, Ingredients, Instructions, Image_Name, Cleaned_Ingredients")
    print(" - Why it fits: Perfect for ingredient-based recipe recommendation")

    if df.empty:
        # Nothing to preview; returning the empty frame lets the caller decide.
        print("No recipes left after removing rows with missing ingredients/instructions.")
        return df

    print("\nπ Sample recipe:")
    sample_recipe = df.iloc[0]
    print(f" Title: {sample_recipe.get('title', 'N/A')}")
    print(f" Ingredients: {sample_recipe['ingredients_text'][:100]}...")
    print(f" Instructions: {sample_recipe['directions_text'][:100]}...")
    return df
# Build the module-level recipe table that the sections below read.
recipe_df = load_recipe_dataset()

# ===================================================================
# 2. EMBEDDING & RECOMMENDATION ENGINE
# ===================================================================
print("\n" + "=" * 50, "2. EMBEDDING & RECOMMENDATION ENGINE", "=" * 50, sep="\n")
class RecipeEmbeddingEngine:
    """
    Recipe embedding and recommendation engine with multiple model comparison.

    Typical call order: ``initialize_embedding_models`` ->
    ``compute_embeddings`` -> ``evaluate_models`` ->
    ``build_recommendation_engine`` -> ``find_similar_recipes``.

    The DataFrame passed in must provide ``ingredients_text`` and
    ``directions_text`` columns; ``title`` is used when present.
    """

    # Characters left behind when a Python list of ingredients is stringified.
    _LIST_MARKUP = str.maketrans('', '', "[]'")

    def __init__(self, recipe_df):
        self.recipe_df = recipe_df
        self.models = {}            # model name -> model object
        self.embeddings = {}        # model name -> 2-D array, one row per recipe
        self.best_model = None      # name chosen by evaluate_models()
        self.best_embeddings = None  # set by build_recommendation_engine()

    @classmethod
    def _clean_text(cls, text):
        """Return *text* as a string without list brackets/quotes, stripped."""
        return str(text).translate(cls._LIST_MARKUP).strip()

    @staticmethod
    def _truncate(text, limit):
        """Return *text* cut to *limit* characters, with '...' if it was cut."""
        return text[:limit] + '...' if len(text) > limit else text

    def initialize_embedding_models(self):
        """
        Initialize the 3 embedding models that are compared against each other.
        """
        print("π€ Initializing embedding models...")
        # Model 1: Sentence-BERT (general purpose)
        self.models['sbert_mini'] = SentenceTransformer('all-MiniLM-L6-v2')
        # Model 2: Multi-QA sentence transformer
        self.models['sbert_multi'] = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
        # Model 3: TF-IDF baseline; bigrams capture two-word ingredient names
        self.models['tfidf'] = TfidfVectorizer(
            max_features=2000,
            stop_words='english',
            ngram_range=(1, 2),
            lowercase=True,
        )
        print("β Models initialized successfully!")

    def compute_embeddings(self):
        """
        Compute embeddings of every recipe's ingredient text with each model.

        Results are stored in ``self.embeddings`` keyed by model name.
        """
        print("π Computing embeddings...")
        cleaned_texts = [self._clean_text(text) for text in self.recipe_df['ingredients_text']]

        for model_name, model in self.models.items():
            print(f" Computing embeddings with {model_name}...")
            if model_name == 'tfidf':
                # The vectorizer is fitted here and reused for queries later.
                embeddings = model.fit_transform(cleaned_texts).toarray()
            else:
                embeddings = model.encode(cleaned_texts, show_progress_bar=True)
            self.embeddings[model_name] = embeddings
            print(f" β {model_name}: Shape {embeddings.shape}")
        print("β Embeddings computed for all models!")

    def evaluate_models(self, sample_size=50, random_state=None):
        """
        Score each model by the mean similarity of sampled recipes to their
        nearest neighbours, and remember the highest-scoring model.

        Args:
            sample_size: Number of recipes to sample as queries (capped at the
                dataset size).
            random_state: Optional seed for reproducible sampling. When None,
                NumPy's global random state is used.

        Returns:
            dict mapping model name to its average top-neighbour similarity.

        Raises:
            RuntimeError: If ``compute_embeddings`` has not been run.
        """
        if not self.embeddings:
            raise RuntimeError("No embeddings available; call compute_embeddings() first.")

        print("π Evaluating embedding models...")
        top_n = 10
        n_recipes = len(self.recipe_df)
        sample_size = max(0, min(sample_size, n_recipes))
        # A recipe cannot be its own neighbour, so at most n_recipes - 1 exist.
        neighbours = min(top_n, n_recipes - 1)

        if sample_size > 0:
            rng = np.random if random_state is None else np.random.RandomState(random_state)
            sample_indices = rng.choice(n_recipes, sample_size, replace=False)
        else:
            sample_indices = np.array([], dtype=int)

        results = {}
        for model_name, all_embeddings in self.embeddings.items():
            print(f" Evaluating {model_name}...")
            if sample_size == 0 or neighbours <= 0:
                # Too few recipes to compare; keep a neutral score.
                results[model_name] = 0.0
            else:
                sims = cosine_similarity(all_embeddings[sample_indices], all_embeddings)
                # Mask each query's similarity to itself so it is never selected.
                sims[np.arange(sample_size), sample_indices] = -np.inf
                top = np.partition(sims, -neighbours, axis=1)[:, -neighbours:]
                results[model_name] = float(top.mean(axis=1).mean())
            print(f" {model_name}: Average top-{top_n} similarity = {results[model_name]:.4f}")

        # Highest average neighbour similarity wins.
        self.best_model = max(results, key=results.get)
        print(f"π Best model selected: {self.best_model}")
        print("π Model performance ranking:")
        ranking = sorted(results.items(), key=lambda item: item[1], reverse=True)
        for i, (model, score) in enumerate(ranking, 1):
            print(f" {i}. {model}: {score:.4f}")
        return results

    def build_recommendation_engine(self):
        """
        Select the embeddings of the best model for use in recommendations.

        Raises:
            RuntimeError: If no best model has been selected or its embeddings
                are missing.
        """
        print("π§ Building recommendation engine...")
        if self.best_model is None or self.best_model not in self.embeddings:
            raise RuntimeError(
                "No evaluated model with embeddings; call compute_embeddings() "
                "and evaluate_models() first."
            )
        self.best_embeddings = self.embeddings[self.best_model]
        print(f"β Recommendation engine ready with {self.best_model}!")
        print(f" Embedding dimensions: {self.best_embeddings.shape}")

    def find_similar_recipes(self, ingredients_input, top_k=3, similarity_threshold=0.7):
        """
        Find the top-k recipes most similar to the given ingredients.

        Args:
            ingredients_input: Free-text ingredients (list markup is stripped).
            top_k: Maximum number of recipes to return; values <= 0 yield [].
            similarity_threshold: Accepted for interface compatibility but not
                applied to the results.
                NOTE(review): decide whether results should be filtered by it.

        Returns:
            List of dicts with ``title``, ``ingredients``, ``directions`` and
            ``similarity`` (Python float), most similar first.

        Raises:
            RuntimeError: If ``build_recommendation_engine`` has not been run.
        """
        # Guard first: a slice of [-0:] would otherwise select every recipe.
        if top_k <= 0:
            return []
        if self.best_embeddings is None:
            raise RuntimeError("Engine not built; call build_recommendation_engine() first.")
        if len(self.best_embeddings) == 0:
            return []

        clean_input = self._clean_text(ingredients_input)
        model = self.models[self.best_model]
        if self.best_model == 'tfidf':
            input_embedding = model.transform([clean_input]).toarray()
        else:
            input_embedding = np.asarray(model.encode([clean_input])).reshape(1, -1)

        similarities = cosine_similarity(input_embedding, self.best_embeddings)[0]
        top_k = min(top_k, len(similarities))
        # The query is new text, so no recipe needs to be excluded.
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        similar_recipes = []
        for idx in top_indices:
            recipe_data = self.recipe_df.iloc[idx]
            similar_recipes.append({
                'title': recipe_data.get('title', f'Recipe {idx}'),
                'ingredients': self._truncate(recipe_data['ingredients_text'], 300),
                'directions': self._truncate(recipe_data['directions_text'], 400),
                'similarity': float(similarities[idx]),  # plain float for display/serialisation
            })
        return similar_recipes
# Build the module-level engine. The calls must run in this order: models are
# created, recipes are embedded, the models are compared, and the winning
# model's embeddings are selected for recommendations.
print("π Starting embedding engine setup...")
embedding_engine = RecipeEmbeddingEngine(recipe_df)
embedding_engine.initialize_embedding_models()
embedding_engine.compute_embeddings()
model_results = embedding_engine.evaluate_models()
embedding_engine.build_recommendation_engine()
print("β Embedding engine setup complete!")

# ===================================================================
# 3. SYNTHETIC GENERATION
# ===================================================================
print("\n" + "=" * 50, "3. SYNTHETIC GENERATION", "=" * 50, sep="\n")
class RecipeGenerator:
    """
    Recipe generator built on a Hugging Face text-generation pipeline.

    ``initialize_generator`` must be called before real text can be generated;
    until then ``generate_recipe`` returns a generic fallback recipe.
    """

    # Characters left behind when a Python list of ingredients is stringified.
    _LIST_MARKUP = str.maketrans('', '', "[]'")
    # Budget for generated tokens only, so a long prompt cannot use it up.
    _MAX_NEW_TOKENS = 256

    def __init__(self):
        self.generator = None
        self.embedding_engine = None

    @staticmethod
    def _fallback_recipe(clean_ingredients):
        """Return the generic recipe used when generation is unavailable or fails."""
        return {
            'title': 'Simple AI Recipe',
            'ingredients': clean_ingredients,
            'directions': f"Create a delicious dish using {clean_ingredients}. Mix ingredients well, cook to preference, and enjoy!",
            'similarity': 1.0
        }

    def initialize_generator(self):
        """
        Initialize the text generation pipeline (GPT-2).
        """
        print("π€ Initializing recipe generation model...")
        model_name = "gpt2"
        self.generator = pipeline(
            "text-generation",
            model=model_name,
            tokenizer=model_name,
            max_new_tokens=self._MAX_NEW_TOKENS,
            temperature=0.8,  # Higher temperature for more creativity
            do_sample=True,
            pad_token_id=50256,
            eos_token_id=50256
        )
        print("β Generation model initialized!")

    def generate_recipe(self, ingredients_input):
        """
        Generate a new recipe based on input ingredients.

        Args:
            ingredients_input: Free-text ingredients (list markup is stripped).

        Returns:
            Dict with ``title``, ``ingredients``, ``directions`` and
            ``similarity`` (always 1.0). A generic fallback recipe is returned
            when the generator is not initialized, fails, or produces no text.
        """
        print("π― Generating new recipe...")
        clean_ingredients = str(ingredients_input).translate(self._LIST_MARKUP).strip()

        if self.generator is None:
            print("Recipe generator is not initialized; using fallback recipe.")
            return self._fallback_recipe(clean_ingredients)

        prompt = (
            f"Write a detailed recipe in English that uses ALL of the following ingredients: {clean_ingredients}.\n"
            f"You MUST include every ingredient listed exactly.\n"
            f"Title:\nIngredients:\nInstructions:\n"
            f"Please make the recipe clear and easy to follow."
        )
        try:
            generated_texts = self.generator(
                prompt,
                max_new_tokens=self._MAX_NEW_TOKENS,
                num_return_sequences=3,
                temperature=0.8,
                do_sample=True,
                pad_token_id=50256
            )
            # The pipeline may echo the prompt; keep only the continuation.
            candidates = [gen['generated_text'].replace(prompt, "").strip() for gen in generated_texts]

            # Prefer the longest candidate that stays under 600 characters.
            best_generation = ""
            for text in candidates:
                if len(best_generation) < len(text) < 600:
                    best_generation = text
            if not best_generation and candidates:
                best_generation = candidates[0]
            if not best_generation:
                print("Generator returned no text; using fallback recipe.")
                return self._fallback_recipe(clean_ingredients)

            # First line is treated as the title, the remainder as instructions.
            title, separator, remainder = best_generation.partition('\n')
            instructions = remainder if separator else best_generation
            if title.lower().startswith("title:"):
                title = title[len("title:"):].strip()
            if len(instructions) > 400:
                instructions = instructions[:400] + "..."
            return {
                'title': title[:100] if title else 'Creative AI Recipe',
                'ingredients': clean_ingredients,
                'directions': instructions,
                'similarity': 1.0
            }
        except Exception as e:
            # Best-effort by design: never fail the request, but report the cause.
            print(f"Recipe generation failed ({type(e).__name__}: {e}); using fallback recipe.")
            return self._fallback_recipe(clean_ingredients)

    def generate_synthetic_dataset(self, num_examples=10, source_df=None):
        """
        Generate synthetic recipes from ingredients of randomly sampled recipes.

        Args:
            num_examples: Number of recipes to generate (capped at the number
                of rows available).
            source_df: DataFrame with an ``ingredients_text`` column to sample
                from. Defaults to the module-level ``recipe_df``.

        Returns:
            List of recipe dicts as produced by ``generate_recipe``.
        """
        if source_df is None:
            source_df = recipe_df
        print(f"π Generating {num_examples} synthetic recipes...")
        sample_count = max(0, min(num_examples, len(source_df)))
        sample_recipes = source_df.sample(sample_count)

        synthetic_recipes = []
        for i, (_, recipe) in enumerate(sample_recipes.iterrows(), 1):
            # Real ingredients, limited in length, with newly generated instructions.
            ingredients = recipe['ingredients_text'][:100]
            try:
                print(f" Generating recipe {i}/{sample_count} with: {ingredients[:50]}...")
                synthetic_recipes.append(self.generate_recipe(ingredients))
            except Exception as e:
                print(f" Error generating recipe {i}: {e}")
                synthetic_recipes.append({
                    'title': f'Recipe Variation {i}',
                    'ingredients': ingredients,
                    'directions': 'A creative variation of this classic recipe...',
                    'similarity': 1.0
                })
        print(f"β Generated {len(synthetic_recipes)} synthetic recipes!")

        if synthetic_recipes:
            sample = synthetic_recipes[0]
            print("\nπ½ Sample generated recipe:")
            print(f" Title: {sample['title']}")
            print(f" Ingredients: {sample['ingredients'][:80]}...")
            print(f" Instructions: {sample['directions'][:100]}...")
        return synthetic_recipes

    def test_generation_quality(self, test_ingredients_list):
        """
        Print generated titles/instructions for up to three ingredient sets.
        """
        print("π§ͺ Testing generation quality...")
        for i, ingredients in enumerate(test_ingredients_list[:3], 1):
            print(f"\n--- Test {i}: {ingredients} ---")
            recipe = self.generate_recipe(ingredients)
            print(f"Generated Title: {recipe['title']}")
            print(f"Generated Instructions: {recipe['directions'][:150]}...")
# Build the module-level generator used by the Gradio callback.
print("π Starting recipe generator setup...")
recipe_generator = RecipeGenerator()
recipe_generator.initialize_generator()

# Smoke-test generation with a few hand-picked ingredient sets.
test_ingredients = [
    "chicken, garlic, tomatoes, basil",
    "pasta, cheese, mushrooms",
    "salmon, lemon, herbs",
]
recipe_generator.test_generation_quality(test_ingredients)

# A small batch of synthetic examples (kept small for speed).
synthetic_examples = recipe_generator.generate_synthetic_dataset(5)
print("β Recipe generator setup complete!")

# ===================================================================
# 4. USER INTERFACE β HUGGING FACE SPACE
# ===================================================================
print("\n" + "=" * 50, "4. USER INTERFACE β HUGGING FACE SPACE", "=" * 50, sep="\n")
def recipe_recommendation_interface(ingredients_input):
    """
    Gradio callback: turn an ingredient string into a Markdown report.

    The report lists up to three similar recipes from the module-level
    ``embedding_engine`` followed by one recipe from the module-level
    ``recipe_generator``. Inputs shorter than three characters get a warning
    message instead; any exception is converted into an error message.
    """

    def _clip(text, limit):
        # Cut *text* to *limit* characters and mark the cut with an ellipsis.
        return text if len(text) <= limit else text[:limit] + "..."

    try:
        print(f"π Processing request: {ingredients_input}")

        # Reject empty or near-empty input before doing any model work.
        if not ingredients_input or len(ingredients_input.strip()) < 3:
            return "β οΈ Please enter at least 3 characters for ingredients (e.g., 'chicken, garlic, tomato')"

        matches = embedding_engine.find_similar_recipes(ingredients_input, top_k=3)
        ai_recipe = recipe_generator.generate_recipe(ingredients_input)

        # Collect the Markdown fragments and join them once at the end.
        parts = [
            "# π³ RecipeGenius Results\n\n",
            f"**Your ingredients:** {ingredients_input}\n\n",
            "## π Similar Recipes from Database:\n\n",
        ]

        if matches:
            for rank, match in enumerate(matches, 1):
                heading = match['title'].replace('Title:', '').strip()
                if not heading or heading.lower() in ('untitled', 'recipe'):
                    heading = f"Recipe #{rank}"

                shown_ingredients = _clip(
                    match['ingredients'].replace('[', '').replace(']', '').replace("'", ''),
                    200,
                )
                shown_directions = _clip(match['directions'], 300)

                parts.extend([
                    f"### {rank}. {heading}\n",
                    f"**π₯ Ingredients:** {shown_ingredients}\n\n",
                    f"**π¨βπ³ Instructions:** {shown_directions}\n\n",
                    f"**π Similarity Score:** {match['similarity']:.3f}\n\n",
                    "---\n\n",
                ])
        else:
            parts.append("β οΈ No similar recipes found. Try different ingredients!\n\n")

        # AI-generated recipe section.
        ai_heading = ai_recipe['title'].replace('Title:', '').strip() or "Creative AI Recipe"
        parts.extend([
            "## π¨ AI-Generated Recipe:\n\n",
            f"### π€ {ai_heading}\n\n",
            f"**π₯ Using your ingredients:** {ai_recipe['ingredients']}\n\n",
            f"**π¨βπ³ AI Instructions:** {ai_recipe['directions']}\n\n",
            # Footer
            "---\n\n",
            "*π‘ Tip: Try different ingredient combinations for more variety!*\n\n",
            f"*π¬ Powered by {embedding_engine.best_model} embeddings and GPT-2 generation*",
        ])

        report = "".join(parts)
        print("β Request processed successfully")
        return report
    except Exception as e:
        failure = "".join([
            f"β **Error occurred:** {str(e)}\n\n",
            "**Please try:**\n",
            "- Using common ingredients (chicken, pasta, tomato, etc.)\n",
            "- Checking your spelling\n",
            "- Using simpler ingredient names\n\n",
            "*If the problem persists, try refreshing the page.*",
        ])
        print(f"β Error in interface: {e}")
        return failure
# Example inputs offered to the user as one-click examples in the interface.
example_ingredients = [
    "chicken, garlic, tomatoes, basil, olive oil",
    "pasta, cheese, mushrooms, cream, herbs",
    "salmon, lemon, dill, potatoes, butter",
    "eggs, bacon, cheese, spinach, onion",
    "beef, soy sauce, ginger, vegetables, rice",
    "chocolate, vanilla, flour, eggs, sugar",
    "avocado, lime, cilantro, onion, tomato",
]


def create_gradio_interface():
    """
    Build the Gradio interface for the recipe recommender.

    The interface is only constructed here; the caller is responsible for
    calling ``launch()`` on the returned object.

    Returns:
        A ``gr.Interface`` wired to ``recipe_recommendation_interface``.
    """
    # Local import: dedent lets the Markdown/CSS be indented with the code
    # without the indentation leaking into the rendered text (indented
    # Markdown would otherwise be treated as a code block).
    from textwrap import dedent

    print("π Creating enhanced Gradio interface...")

    # Custom CSS for button and container styling.
    css = dedent("""
        .gradio-container {
        max-width: 1000px !important;
        }
        .gr-button {
        background: linear-gradient(45deg, #ff6b6b, #ee5a24) !important;
        border: none !important;
        color: white !important;
        }
        .gr-button:hover {
        background: linear-gradient(45deg, #ee5a24, #ff6b6b) !important;
        }
        """)

    # Blank lines separate paragraphs; without one, a '---' directly under a
    # text line is parsed as a heading underline instead of a rule.
    description = dedent("""
        **Transform your ingredients into amazing recipes!** π―

        Enter the ingredients you have at home and get:

        β’ **3 similar recipes** from our database of real recipes
        β’ **1 AI-generated recipe** created just for your ingredients

        **How it works:** Advanced AI embeddings analyze ingredient combinations and GPT-2 generates creative new recipes.

        ---
        """)

    # The technical details describe what this file actually does: at most
    # 2,000 rows are loaded and the stock "gpt2" checkpoint is used as-is.
    article = dedent("""
        ---

        ### π¬ **Technical Details:**

        - **Dataset**: Kaggle Food Recipes (up to 2,000 recipes)
        - **Embeddings**: Multi-model comparison (SBERT, Multi-QA, TF-IDF)
        - **Generation**: Pretrained GPT-2 (not fine-tuned) prompted for recipe creation
        - **Similarity**: Cosine similarity matching

        ### π **Tips for better results:**

        - Use common ingredient names (e.g., "tomato" instead of "heirloom tomato")
        - Include 3-7 ingredients for optimal results
        - Try different combinations for variety

        **Made with β€οΈ for food lovers and AI enthusiasts!**
        """)

    interface = gr.Interface(
        fn=recipe_recommendation_interface,
        inputs=gr.Textbox(
            label="π₯ Enter your ingredients (comma-separated)",
            placeholder="e.g., chicken, garlic, tomatoes, basil, olive oil",
            lines=3,
            info="List the ingredients you have available. Be specific for better results!"
        ),
        outputs=gr.Markdown(
            label="π½οΈ Recipe Recommendations",
            elem_classes=["recipe-output"]
        ),
        title="π³ RecipeGenius - AI Recipe Recommendation System",
        description=description,
        examples=example_ingredients,
        theme=gr.themes.Soft(
            primary_hue="orange",
            secondary_hue="amber",
        ),
        css=css,
        # NOTE(review): newer Gradio releases appear to rename this argument
        # to `flagging_mode` - confirm against the Gradio version pinned for
        # this Space before changing it.
        allow_flagging="never",
        article=article,
    )
    return interface
# Script entry point: print a short summary of the prepared system, then
# build and launch the Gradio interface.
if __name__ == "__main__":
    print(
        "\nπ RecipeGenius is ready!",
        "π System Summary:",
        f" - Dataset size: {len(recipe_df)} recipes",
        f" - Best embedding model: {embedding_engine.best_model}",
        f" - Generated synthetic examples: {len(synthetic_examples)}",
        " - Deployment files created: β ",
        "\nπ Starting Gradio interface...",
        sep="\n",
    )
    interface = create_gradio_interface()
    interface.launch()