# RecipeGenius22 / app.py
# omer43's picture
# Create app.py
# ec8b69f verified
# -*- coding: utf-8 -*-
"""RecipeGenius - AI Recipe Recommendation System for Hugging Face Spaces"""
import pandas as pd
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import gradio as gr
import pickle
import os
from huggingface_hub import login
# Startup banner: describes the app once, at import time.
print("🍳 RecipeGenius - AI Recipe Recommendation System")
print("=" * 70)
print("Modality: Text")
print("Use Case: Recipe recommendation based on ingredients")
print("System Goal: Input ingredients → 3 similar recipes + 1 AI-generated recipe")
print("Dataset: Hieu-Pham/kaggle_food_recipes")

# ===================================================================
# 1. INTRODUCTION & DATASET
# ===================================================================
print("\n" + "=" * 50)
print("1. INTRODUCTION & DATASET")
print("=" * 50)
def load_recipe_dataset(max_recipes=2000):
    """Load the recipe dataset from Hugging Face and derive the text columns.

    Requests the first ``max_recipes`` rows of the ``train`` split of
    ``Hieu-Pham/kaggle_food_recipes``, drops rows with a missing
    ``Ingredients`` or ``Instructions`` value, and adds these columns:

    * ``ingredients_text`` - ``Ingredients`` as a string with the list
      markup characters ``[``, ``]`` and ``'`` removed.
    * ``directions_text`` - ``Instructions`` as a string.
    * ``combined_text`` - ingredients and directions joined by a space.
    * ``title`` - ``Title`` with missing values replaced by
      ``'Untitled Recipe'`` (or ``'Recipe <index>'`` if there is no
      ``Title`` column).

    Args:
        max_recipes: Number of leading rows of the train split to request.
            Defaults to 2000.

    Returns:
        A pandas DataFrame with the original columns plus the derived ones.
    """
    print("🔥 Loading recipe dataset from Hugging Face...")

    dataset = load_dataset(
        "Hieu-Pham/kaggle_food_recipes", split=f"train[:{max_recipes}]"
    )
    df = pd.DataFrame(dataset)

    # Show what was actually loaded before any column is relied upon.
    print(f"Dataset columns: {df.columns.tolist()}")
    print(f"Dataset shape: {df.shape}")

    # Reset the index so row labels match row positions after rows are dropped.
    df = df.dropna(subset=['Ingredients', 'Instructions']).reset_index(drop=True)

    # Nulls are already gone, so the values only need stringifying.
    list_markup = str.maketrans('', '', "[]'")
    df['ingredients_text'] = df['Ingredients'].map(
        lambda raw: str(raw).translate(list_markup)
    )
    df['directions_text'] = df['Instructions'].map(str)

    # Combined text for embedding.
    df['combined_text'] = df['ingredients_text'] + " " + df['directions_text']

    # Title for display purposes.
    if 'Title' in df.columns:
        df['title'] = df['Title'].fillna('Untitled Recipe')
    else:
        df['title'] = 'Recipe ' + df.index.astype(str)

    print("✅ Dataset loaded successfully!")
    print("📊 Dataset info:")
    print(" - Source: Hugging Face 'Hieu-Pham/kaggle_food_recipes' dataset")
    print(f" - Size: {len(df)} recipes")
    print(" - Key features: Title, Ingredients, Instructions, Image_Name, Cleaned_Ingredients")
    print(" - Why it fits: Perfect for ingredient-based recipe recommendation")

    # Only show a sample when at least one row survived the cleaning step.
    if len(df) > 0:
        print("\n📋 Sample recipe:")
        sample_recipe = df.iloc[0]
        print(f" Title: {sample_recipe.get('title', 'N/A')}")
        print(f" Ingredients: {sample_recipe['ingredients_text'][:100]}...")
        print(f" Instructions: {sample_recipe['directions_text'][:100]}...")

    return df
# Materialise the recipe table once; the sections below read this frame.
recipe_df = load_recipe_dataset()

# ===================================================================
# 2. EMBEDDING & RECOMMENDATION ENGINE
# ===================================================================
print(f"\n{'=' * 50}")
print("2. EMBEDDING & RECOMMENDATION ENGINE")
print(f"{'=' * 50}")
class RecipeEmbeddingEngine:
    """Embed recipe ingredient text with several models and serve similarity search.

    The module-level setup calls the methods in this order:
    ``initialize_embedding_models`` -> ``compute_embeddings`` ->
    ``evaluate_models`` -> ``build_recommendation_engine``; afterwards
    ``find_similar_recipes`` can be called per request.
    """

    # Deletes "[", "]" and "'" - the markup left by stringified lists
    # such as "['flour', 'sugar']".
    _LIST_MARKUP = str.maketrans('', '', "[]'")

    def __init__(self, recipe_df):
        """Store the recipe frame and create empty model/embedding registries.

        Args:
            recipe_df: DataFrame providing at least the ``ingredients_text``
                and ``directions_text`` columns read by the other methods.
        """
        self.recipe_df = recipe_df
        self.models = {}           # model name -> model object
        self.embeddings = {}       # model name -> 2-D embedding array
        self.best_model = None     # set by evaluate_models()
        self.best_embeddings = None  # set by build_recommendation_engine()

    @classmethod
    def _clean_text(cls, text):
        """Return ``text`` as a string without list markup or outer whitespace."""
        return str(text).translate(cls._LIST_MARKUP).strip()

    @staticmethod
    def _truncate(text, limit):
        """Return ``text`` cut to ``limit`` characters plus '...' when it is longer."""
        return text[:limit] + '...' if len(text) > limit else text

    def initialize_embedding_models(self):
        """Create the three candidate models and register them in ``self.models``."""
        print("🤖 Initializing embedding models...")
        # Model 1: Sentence-BERT (general purpose).
        self.models['sbert_mini'] = SentenceTransformer('all-MiniLM-L6-v2')
        # Model 2: Multi-QA model.
        self.models['sbert_multi'] = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
        # Model 3: TF-IDF baseline; bigrams so two-word ingredients can match.
        self.models['tfidf'] = TfidfVectorizer(
            max_features=2000,
            stop_words='english',
            ngram_range=(1, 2),
            lowercase=True
        )
        print("✅ Models initialized successfully!")

    def compute_embeddings(self):
        """Embed every recipe's ingredient text with each registered model.

        Fills ``self.embeddings[model_name]`` with one row per recipe. The
        TF-IDF vectorizer is fitted here, so it can later ``transform``
        query text.
        """
        print("📊 Computing embeddings...")
        cleaned_texts = [
            self._clean_text(text) for text in self.recipe_df['ingredients_text']
        ]
        for model_name, model in self.models.items():
            print(f" Computing embeddings with {model_name}...")
            if model_name == 'tfidf':
                embeddings = model.fit_transform(cleaned_texts).toarray()
            else:
                embeddings = model.encode(cleaned_texts, show_progress_bar=True)
            self.embeddings[model_name] = embeddings
            print(f" ✓ {model_name}: Shape {embeddings.shape}")
        print("✅ Embeddings computed for all models!")

    def evaluate_models(self, sample_size=50, top_n=10, random_state=None):
        """Score each model by mean nearest-neighbour similarity and pick the best.

        For a random sample of recipes, the cosine similarities to the
        ``top_n`` closest other recipes are averaged; the model with the
        highest overall average becomes ``self.best_model``.

        Args:
            sample_size: Number of recipes to sample (capped at the dataset size).
            top_n: Number of nearest neighbours to average per sampled recipe
                (capped at the number of other recipes).
            random_state: Optional seed for a reproducible sample. ``None``
                uses NumPy's global random state, as before.

        Returns:
            Dict mapping model name to its average top-neighbour similarity.

        Raises:
            ValueError: If the dataset has fewer than two recipes.
        """
        print("📈 Evaluating embedding models...")
        n_recipes = len(self.recipe_df)
        if n_recipes < 2:
            raise ValueError("evaluate_models needs at least two recipes to compare")

        sample_size = min(sample_size, n_recipes)
        # Clamp so np.partition never gets a kth outside the neighbour array.
        neighbours = max(1, min(top_n, n_recipes - 1))
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        sample_indices = rng.choice(n_recipes, sample_size, replace=False)

        results = {}
        for model_name in self.models:
            print(f" Evaluating {model_name}...")
            all_embeddings = self.embeddings[model_name]
            similarities = []
            for idx in sample_indices:
                query_embedding = all_embeddings[idx:idx + 1]
                sim_scores = cosine_similarity(query_embedding, all_embeddings)[0]
                # Drop the recipe's similarity to itself before ranking.
                sim_scores_no_self = np.delete(sim_scores, idx)
                top_scores = np.partition(sim_scores_no_self, -neighbours)[-neighbours:]
                similarities.append(np.mean(top_scores))
            results[model_name] = np.mean(similarities)
            print(f" {model_name}: Average top-{neighbours} similarity = {results[model_name]:.4f}")

        # Original heuristic: highest average similarity is taken as better clustering.
        # NOTE(review): raw cosine scores from different embedding spaces may not
        # be directly comparable - confirm this picks the intended model.
        self.best_model = max(results, key=results.get)
        print(f"🏆 Best model selected: {self.best_model}")
        print("📝 Model performance ranking:")
        ranking = sorted(results.items(), key=lambda item: item[1], reverse=True)
        for position, (model, score) in enumerate(ranking, 1):
            print(f" {position}. {model}: {score:.4f}")
        return results

    def build_recommendation_engine(self):
        """Cache the embeddings of ``self.best_model`` for query-time search."""
        print("🔧 Building recommendation engine...")
        self.best_embeddings = self.embeddings[self.best_model]
        print(f"✅ Recommendation engine ready with {self.best_model}!")
        print(f" Embedding dimensions: {self.best_embeddings.shape}")

    def find_similar_recipes(self, ingredients_input, top_k=3, similarity_threshold=0.7):
        """Return the ``top_k`` recipes most similar to the given ingredients.

        Args:
            ingredients_input: Free-text ingredient list.
            top_k: Maximum number of recipes to return. Values <= 0 return
                an empty list.
            similarity_threshold: Accepted for backward compatibility; it is
                not currently applied to the results.

        Returns:
            List of dicts with keys ``title``, ``ingredients`` (at most 300
            characters plus '...'), ``directions`` (at most 400 characters
            plus '...') and ``similarity`` (Python float), best match first.
        """
        # Guard first: a slice of [-0:] would otherwise select every recipe.
        if top_k <= 0:
            return []

        clean_input = self._clean_text(ingredients_input)
        model = self.models[self.best_model]
        if self.best_model == 'tfidf':
            input_embedding = model.transform([clean_input]).toarray()
        else:
            input_embedding = model.encode([clean_input])
            input_embedding = input_embedding.reshape(1, -1)

        similarities = cosine_similarity(input_embedding, self.best_embeddings)[0]
        # Highest similarity first; nothing is excluded since the query is new text.
        top_indices = np.argsort(similarities)[::-1][:top_k]

        similar_recipes = []
        for idx in top_indices:
            recipe_data = self.recipe_df.iloc[idx]
            similar_recipes.append({
                'title': recipe_data.get('title', f'Recipe {idx}'),
                'ingredients': self._truncate(recipe_data['ingredients_text'], 300),
                'directions': self._truncate(recipe_data['directions_text'], 400),
                'similarity': float(similarities[idx]),  # plain float, not a NumPy scalar
            })
        return similar_recipes
# Build the embedding engine once at import time; request handling reuses it.
print("🚀 Starting embedding engine setup...")
embedding_engine = RecipeEmbeddingEngine(recipe_df)
embedding_engine.initialize_embedding_models()
embedding_engine.compute_embeddings()
model_results = embedding_engine.evaluate_models()
embedding_engine.build_recommendation_engine()
print("✅ Embedding engine setup complete!")

# ===================================================================
# 3. SYNTHETIC GENERATION
# ===================================================================
print("\n" + "=" * 50)
print("3. SYNTHETIC GENERATION")
print("=" * 50)
class RecipeGenerator:
    """Generate recipe text from an ingredient list with a GPT-2 pipeline.

    ``initialize_generator`` must be called before generation; until then
    ``generate_recipe`` returns its built-in fallback recipe.
    """

    MODEL_NAME = "gpt2"
    # Budget for the generated continuation only. The previous
    # ``max_length=300`` also counted prompt tokens (see the transformers
    # generation docs), so a long ingredient list could leave no room.
    MAX_NEW_TOKENS = 250
    EOS_TOKEN_ID = 50256

    # Deletes "[", "]" and "'" - markup left by stringified lists.
    _LIST_MARKUP = str.maketrans('', '', "[]'")

    def __init__(self):
        """Create an uninitialised generator (no model is loaded here)."""
        self.generator = None
        self.embedding_engine = None  # never assigned or read elsewhere in this class

    @classmethod
    def _clean_text(cls, text):
        """Return ``text`` as a string without list markup or outer whitespace."""
        return str(text).translate(cls._LIST_MARKUP).strip()

    def initialize_generator(self):
        """Load the text-generation pipeline into ``self.generator``."""
        print("🤖 Initializing recipe generation model...")
        self.generator = pipeline(
            "text-generation",
            model=self.MODEL_NAME,
            tokenizer=self.MODEL_NAME,
            max_new_tokens=self.MAX_NEW_TOKENS,
            temperature=0.8,  # higher temperature for more creativity
            do_sample=True,
            pad_token_id=self.EOS_TOKEN_ID,
            eos_token_id=self.EOS_TOKEN_ID
        )
        print("✅ Generation model initialized!")

    def generate_recipe(self, ingredients_input):
        """Generate one recipe for the given ingredients.

        Three candidates are sampled. The longest one shorter than 600
        characters is kept, or the first candidate if none qualifies. Its
        first line becomes the title and the remainder the directions.

        Args:
            ingredients_input: Free-text (or stringified list) of ingredients.

        Returns:
            Dict with keys ``title`` (at most 100 characters), ``ingredients``
            (the cleaned input), ``directions`` (at most 400 characters plus
            '...') and ``similarity`` (always 1.0). If generation fails for
            any reason, a fixed fallback recipe with the same keys is returned.
        """
        print("🎯 Generating new recipe...")
        clean_ingredients = self._clean_text(ingredients_input)

        prompt = (
            f"Write a detailed recipe in English that uses ALL of the following ingredients: {clean_ingredients}.\n"
            f"You MUST include every ingredient listed exactly.\n"
            f"Title:\nIngredients:\nInstructions:\n"
            f"Please make the recipe clear and easy to follow."
        )

        try:
            generated_texts = self.generator(
                prompt,
                max_new_tokens=self.MAX_NEW_TOKENS,
                num_return_sequences=3,
                temperature=0.8,
                do_sample=True,
                pad_token_id=self.EOS_TOKEN_ID
            )

            # The pipeline output repeats the prompt; keep only the continuation.
            candidates = [
                gen['generated_text'].replace(prompt, "").strip()
                for gen in generated_texts
            ]
            best_generation = max(
                (text for text in candidates if len(text) < 600),
                key=len,
                default="",
            )
            if not best_generation:
                best_generation = candidates[0]

            lines = best_generation.split('\n')
            title = lines[0]
            instructions = '\n'.join(lines[1:]) if len(lines) > 1 else best_generation

            if title.lower().startswith("title:"):
                title = title[len("title:"):].strip()
            if len(instructions) > 400:
                instructions = instructions[:400] + "..."

            return {
                'title': title[:100] if title else 'Creative AI Recipe',
                'ingredients': clean_ingredients,
                'directions': instructions,
                'similarity': 1.0
            }
        except Exception as e:
            # Deliberate best-effort: callers always get a recipe, but the
            # failure is logged rather than silently discarded.
            print(f"⚠️ Recipe generation failed, using fallback: {e}")
            return {
                'title': 'Simple AI Recipe',
                'ingredients': clean_ingredients,
                'directions': f"Create a delicious dish using {clean_ingredients}. Mix ingredients well, cook to preference, and enjoy!",
                'similarity': 1.0
            }

    def generate_synthetic_dataset(self, num_examples=10, source_df=None):
        """Generate recipes seeded with real ingredient lists.

        Args:
            num_examples: Number of recipes to generate (capped at the number
                of rows available).
            source_df: DataFrame with an ``ingredients_text`` column to sample
                from. Defaults to the module-level ``recipe_df``.

        Returns:
            List of recipe dicts in the format of ``generate_recipe``.
        """
        print(f"📄 Generating {num_examples} synthetic recipes...")
        if source_df is None:
            source_df = recipe_df

        sample_recipes = source_df.sample(min(num_examples, len(source_df)))
        total = len(sample_recipes)

        synthetic_recipes = []
        for number, (_, recipe) in enumerate(sample_recipes.iterrows(), 1):
            try:
                # Real ingredients, newly generated instructions.
                ingredients = recipe['ingredients_text'][:100]  # limit length
                print(f" Generating recipe {number}/{total} with: {ingredients[:50]}...")
                synthetic_recipes.append(self.generate_recipe(ingredients))
            except Exception as e:
                print(f" Error generating recipe {number}: {e}")
                synthetic_recipes.append({
                    'title': f'Recipe Variation {number}',
                    'ingredients': recipe['ingredients_text'][:100],
                    'directions': 'A creative variation of this classic recipe...',
                    'similarity': 1.0
                })

        print(f"✅ Generated {len(synthetic_recipes)} synthetic recipes!")

        if synthetic_recipes:
            sample = synthetic_recipes[0]
            print("\n🍽 Sample generated recipe:")
            print(f" Title: {sample['title']}")
            print(f" Ingredients: {sample['ingredients'][:80]}...")
            print(f" Instructions: {sample['directions'][:100]}...")

        return synthetic_recipes

    def test_generation_quality(self, test_ingredients_list):
        """Print generated titles/instructions for the first three ingredient lists.

        Args:
            test_ingredients_list: Sequence of ingredient strings; only the
                first three entries are used.
        """
        print("🧪 Testing generation quality...")
        for i, ingredients in enumerate(test_ingredients_list[:3], 1):
            print(f"\n--- Test {i}: {ingredients} ---")
            recipe = self.generate_recipe(ingredients)
            print(f"Generated Title: {recipe['title']}")
            print(f"Generated Instructions: {recipe['directions'][:150]}...")
# Build the generator once at import time; request handling reuses it.
print("🚀 Starting recipe generator setup...")
recipe_generator = RecipeGenerator()
recipe_generator.initialize_generator()

# Smoke-test generation with a few fixed ingredient lists.
test_ingredients = [
    "chicken, garlic, tomatoes, basil",
    "pasta, cheese, mushrooms",
    "salmon, lemon, herbs"
]
recipe_generator.test_generation_quality(test_ingredients)

# Generate synthetic examples for testing
synthetic_examples = recipe_generator.generate_synthetic_dataset(5)  # Reduced for speed
print("✅ Recipe generator setup complete!")

# ===================================================================
# 4. USER INTERFACE – HUGGING FACE SPACE
# ===================================================================
print("\n" + "=" * 50)
print("4. USER INTERFACE – HUGGING FACE SPACE")
print("=" * 50)
def recipe_recommendation_interface(ingredients_input):
    """Gradio handler: turn an ingredient string into a Markdown report.

    The report lists up to three similar recipes from ``embedding_engine``
    followed by one recipe from ``recipe_generator``.

    Args:
        ingredients_input: Text entered by the user.

    Returns:
        A Markdown string: the report, a warning when fewer than three
        non-blank characters were entered, or an error message if anything
        raised while handling the request. This function never raises.
    """
    # Titles treated as "no real title"; includes the 'Untitled Recipe'
    # fallback that load_recipe_dataset assigns to missing titles.
    placeholder_titles = ('untitled', 'recipe', 'untitled recipe')
    list_markup = str.maketrans('', '', "[]'")

    try:
        print(f"🔍 Processing request: {ingredients_input}")

        if not ingredients_input or len(ingredients_input.strip()) < 3:
            return "⚠️ Please enter at least 3 characters for ingredients (e.g., 'chicken, garlic, tomato')"

        similar_recipes = embedding_engine.find_similar_recipes(ingredients_input, top_k=3)
        generated_recipe = recipe_generator.generate_recipe(ingredients_input)

        # Collect fragments and join once instead of repeated string +=.
        parts = [
            "# 🍳 RecipeGenius Results\n\n",
            f"**Your ingredients:** {ingredients_input}\n\n",
            "## 📚 Similar Recipes from Database:\n\n",
        ]

        if similar_recipes:
            for i, recipe in enumerate(similar_recipes, 1):
                title = recipe['title'].replace('Title:', '').strip()
                if not title or title.lower() in placeholder_titles:
                    title = f"Recipe #{i}"

                ingredients_display = recipe['ingredients'].translate(list_markup)
                if len(ingredients_display) > 200:
                    ingredients_display = ingredients_display[:200] + "..."

                directions_display = recipe['directions']
                if len(directions_display) > 300:
                    directions_display = directions_display[:300] + "..."

                parts.extend([
                    f"### {i}. {title}\n",
                    f"**🥘 Ingredients:** {ingredients_display}\n\n",
                    f"**👨‍🍳 Instructions:** {directions_display}\n\n",
                    f"**📊 Similarity Score:** {recipe['similarity']:.3f}\n\n",
                    "---\n\n",
                ])
        else:
            parts.append("⚠️ No similar recipes found. Try different ingredients!\n\n")

        gen_title = generated_recipe['title'].replace('Title:', '').strip()
        if not gen_title:
            gen_title = "Creative AI Recipe"

        parts.extend([
            "## 🎨 AI-Generated Recipe:\n\n",
            f"### 🤖 {gen_title}\n\n",
            f"**🥘 Using your ingredients:** {generated_recipe['ingredients']}\n\n",
            f"**👨‍🍳 AI Instructions:** {generated_recipe['directions']}\n\n",
            "---\n\n",
            "*💡 Tip: Try different ingredient combinations for more variety!*\n\n",
            f"*🔬 Powered by {embedding_engine.best_model} embeddings and GPT-2 generation*",
        ])

        print("✅ Request processed successfully")
        return "".join(parts)

    except Exception as e:
        # UI boundary: log the failure and show a readable message instead
        # of letting the exception escape to Gradio.
        print(f"❌ Error in interface: {e}")
        return "".join([
            f"❌ **Error occurred:** {str(e)}\n\n",
            "**Please try:**\n",
            "- Using common ingredients (chicken, pasta, tomato, etc.)\n",
            "- Checking your spelling\n",
            "- Using simpler ingredient names\n\n",
            "*If the problem persists, try refreshing the page.*",
        ])
# Sample ingredient lists handed to the Gradio interface as `examples`.
example_ingredients = [
    ", ".join(items)
    for items in (
        ("chicken", "garlic", "tomatoes", "basil", "olive oil"),
        ("pasta", "cheese", "mushrooms", "cream", "herbs"),
        ("salmon", "lemon", "dill", "potatoes", "butter"),
        ("eggs", "bacon", "cheese", "spinach", "onion"),
        ("beef", "soy sauce", "ginger", "vegetables", "rice"),
        ("chocolate", "vanilla", "flour", "eggs", "sugar"),
        ("avocado", "lime", "cilantro", "onion", "tomato"),
    )
]
def create_gradio_interface():
    """Build the Gradio interface for the recipe recommender.

    The interface is only constructed here; the caller launches it.

    Returns:
        A ``gr.Interface`` wired to ``recipe_recommendation_interface`` with
        one textbox input, one Markdown output and the sample inputs from
        ``example_ingredients``.
    """
    print("🚀 Creating enhanced Gradio interface...")

    # Custom CSS for better styling
    css = """
    .gradio-container {
        max-width: 1000px !important;
    }
    .gr-button {
        background: linear-gradient(45deg, #ff6b6b, #ee5a24) !important;
        border: none !important;
        color: white !important;
    }
    .gr-button:hover {
        background: linear-gradient(45deg, #ee5a24, #ff6b6b) !important;
    }
    """

    # Markdown is assembled line by line so blank lines are explicit: a
    # "---" directly under a paragraph would turn it into a heading.
    description = "\n".join([
        "**Transform your ingredients into amazing recipes!** 🎯",
        "",
        "Enter the ingredients you have at home and get:",
        "",
        "• **3 similar recipes** from our database of real recipes",
        "• **1 AI-generated recipe** created just for your ingredients",
        "",
        "**How it works:** Advanced AI embeddings analyze ingredient combinations and GPT-2 generates creative new recipes.",
        "",
        "---",
        "",
    ])

    article = "\n".join([
        "---",
        "",
        "### 🔬 **Technical Details:**",
        "",
        "- **Dataset**: Kaggle Food Recipes (up to 2,000 recipes)",
        "- **Embeddings**: Multi-model comparison (SBERT, Multi-QA, TF-IDF)",
        "- **Generation**: GPT-2 (pretrained `gpt2` checkpoint)",
        "- **Similarity**: Cosine similarity matching",
        "",
        "### 🔍 **Tips for better results:**",
        "",
        "- Use common ingredient names (e.g., \"tomato\" instead of \"heirloom tomato\")",
        "- Include 3-7 ingredients for optimal results",
        "- Try different combinations for variety",
        "",
        "**Made with ❤️ for food lovers and AI enthusiasts!**",
        "",
    ])

    interface = gr.Interface(
        fn=recipe_recommendation_interface,
        inputs=gr.Textbox(
            label="🥘 Enter your ingredients (comma-separated)",
            placeholder="e.g., chicken, garlic, tomatoes, basil, olive oil",
            lines=3,
            info="List the ingredients you have available. Be specific for better results!"
        ),
        outputs=gr.Markdown(
            label="🍽️ Recipe Recommendations",
            elem_classes=["recipe-output"]
        ),
        title="🍳 RecipeGenius - AI Recipe Recommendation System",
        description=description,
        examples=example_ingredients,
        theme=gr.themes.Soft(
            primary_hue="orange",
            secondary_hue="amber",
        ),
        css=css,
        allow_flagging="never",
        article=article
    )
    return interface
# Create and launch the interface when run as a script.
if __name__ == "__main__":
    print("\n🎉 RecipeGenius is ready!")
    print("📊 System Summary:")
    print(f" - Dataset size: {len(recipe_df)} recipes")
    print(f" - Best embedding model: {embedding_engine.best_model}")
    print(f" - Generated synthetic examples: {len(synthetic_examples)}")
    print(" - Deployment files created: ✅")
    print("\n🚀 Starting Gradio interface...")
    interface = create_gradio_interface()
    interface.launch()