Spaces:

omer43
/

RecipeGenius22

Runtime error

App Files Files Community

RecipeGenius22 / app.py

omer43

Create app.py

ec8b69f verified 3 months ago

raw

history blame contribute delete

23.6 kB

	# -*- coding: utf-8 -*-
	"""RecipeGenius - AI Recipe Recommendation System for Hugging Face Spaces"""

	import pandas as pd
	import numpy as np
	from datasets import load_dataset
	from sentence_transformers import SentenceTransformer
	from sklearn.metrics.pairwise import cosine_similarity
	from sklearn.feature_extraction.text import TfidfVectorizer
	from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
	import gradio as gr
	import pickle
	import os
	from huggingface_hub import login

	# Startup banner: one print call, one line of output per argument.
	print(
	    "🍳 RecipeGenius - AI Recipe Recommendation System",
	    "=" * 70,
	    "Modality: Text",
	    "Use Case: Recipe recommendation based on ingredients",
	    "System Goal: Input ingredients → 3 similar recipes + 1 AI-generated recipe",
	    "Dataset: Hieu-Pham/kaggle_food_recipes",
	    sep="\n",
	)

	# ===================================================================
	# 1. INTRODUCTION & DATASET
	# ===================================================================

	# Console section header (blank line, rule, title, rule).
	print("\n" + "=" * 50, "1. INTRODUCTION & DATASET", "=" * 50, sep="\n")

	def _prepare_recipe_frame(raw_df):
	    """Return a cleaned copy of *raw_df* with the derived text columns added.

	    Rows missing ``Ingredients`` or ``Instructions`` are dropped. The added
	    columns are ``ingredients_text`` (list punctuation ``[``, ``]``, ``'``
	    removed), ``directions_text``, ``combined_text`` and ``title``.
	    """
	    # Copy so the column assignments below never write into a view of raw_df.
	    df = raw_df.dropna(subset=['Ingredients', 'Instructions']).copy()

	    # Nulls are already gone, so a plain str() conversion is sufficient here.
	    df['ingredients_text'] = df['Ingredients'].map(
	        lambda raw: str(raw).replace('[', '').replace(']', '').replace("'", '')
	    )
	    df['directions_text'] = df['Instructions'].map(str)

	    # Combined text kept for callers that want ingredients + instructions together.
	    df['combined_text'] = df['ingredients_text'] + " " + df['directions_text']

	    # Title is only used for display; synthesize one when the column is absent.
	    if 'Title' in df.columns:
	        df['title'] = df['Title'].fillna('Untitled Recipe')
	    else:
	        df['title'] = 'Recipe ' + df.index.astype(str)

	    return df


	def load_recipe_dataset():
	    """Load the first 2000 rows of 'Hieu-Pham/kaggle_food_recipes' as a DataFrame.

	    The dataset is fetched from Hugging Face, cleaned, and summarized on
	    stdout. Per the original author's notes the dataset exposes the columns
	    Title, Ingredients, Instructions, Image_Name and Cleaned_Ingredients;
	    only ``Ingredients`` and ``Instructions`` are required here.

	    Returns:
	        pandas.DataFrame with the source columns plus ``ingredients_text``,
	        ``directions_text``, ``combined_text`` and ``title``.
	    """
	    print("🔥 Loading recipe dataset from Hugging Face...")

	    dataset = load_dataset("Hieu-Pham/kaggle_food_recipes", split="train[:2000]")
	    raw_df = pd.DataFrame(dataset)

	    # Report the raw schema before any cleaning so mismatches are easy to spot.
	    print(f"Dataset columns: {raw_df.columns.tolist()}")
	    print(f"Dataset shape: {raw_df.shape}")

	    df = _prepare_recipe_frame(raw_df)

	    print("✅ Dataset loaded successfully!")
	    print("📊 Dataset info:")
	    print(" - Source: Hugging Face 'Hieu-Pham/kaggle_food_recipes' dataset")
	    print(f" - Size: {len(df)} recipes")
	    print(" - Key features: Title, Ingredients, Instructions, Image_Name, Cleaned_Ingredients")
	    print(" - Why it fits: Perfect for ingredient-based recipe recommendation")

	    # Preview one recipe; skip it when cleaning left no rows (iloc[0] would raise).
	    if df.empty:
	        print("\n⚠️ No recipes left after dropping rows with missing ingredients/instructions.")
	    else:
	        sample_recipe = df.iloc[0]
	        print("\n📋 Sample recipe:")
	        print(f" Title: {sample_recipe.get('title', 'N/A')}")
	        print(f" Ingredients: {sample_recipe['ingredients_text'][:100]}...")
	        print(f" Instructions: {sample_recipe['directions_text'][:100]}...")

	    return df

	# Load the dataset once at import time. The resulting module-level frame is
	# read by the embedding engine setup, by
	# RecipeGenerator.generate_synthetic_dataset, and by the startup summary in
	# the __main__ block.
	recipe_df = load_recipe_dataset()

	# ===================================================================
	# 2. EMBEDDING & RECOMMENDATION ENGINE
	# ===================================================================

	# Console section header for the embedding stage.
	print("\n" + "="*50)
	print("2. EMBEDDING & RECOMMENDATION ENGINE")
	print("="*50)

	class RecipeEmbeddingEngine:
	    """Embed recipe ingredient text with several models and serve nearest-recipe lookups.

	    Call order used by the module-level setup code:
	    ``initialize_embedding_models`` -> ``compute_embeddings`` ->
	    ``evaluate_models`` -> ``build_recommendation_engine`` ->
	    ``find_similar_recipes``.

	    Attributes:
	        recipe_df: DataFrame of recipes; ``ingredients_text`` and
	            ``directions_text`` columns are read, ``title`` is optional.
	        models: Mapping of model name to embedding model.
	        embeddings: Mapping of model name to an array with one row per recipe.
	        best_model: Name chosen by ``evaluate_models`` (``None`` before that).
	        best_embeddings: Embeddings of ``best_model`` (``None`` until
	            ``build_recommendation_engine`` runs).
	    """

	    # Deletes the list-literal punctuation that leaks into ingredient strings.
	    _STRIP_TABLE = str.maketrans('', '', "[]'")

	    def __init__(self, recipe_df):
	        self.recipe_df = recipe_df
	        self.models = {}
	        self.embeddings = {}
	        self.best_model = None
	        self.best_embeddings = None

	    @classmethod
	    def _clean_text(cls, text):
	        """Return ``str(text)`` without ``[``, ``]``, ``'`` and surrounding whitespace."""
	        return str(text).translate(cls._STRIP_TABLE).strip()

	    @staticmethod
	    def _truncate(text, limit):
	        """Return *text* cut to *limit* characters, with '...' appended when cut."""
	        return text[:limit] + '...' if len(text) > limit else text

	    def initialize_embedding_models(self):
	        """Create the three embedding models that will be compared."""
	        print("🤖 Initializing embedding models...")

	        # Model 1: Sentence-BERT (general purpose, good for recipe text)
	        self.models['sbert_mini'] = SentenceTransformer('all-MiniLM-L6-v2')

	        # Model 2: Multi-QA model (good for ingredient matching)
	        self.models['sbert_multi'] = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

	        # Model 3: TF-IDF baseline for comparison
	        self.models['tfidf'] = TfidfVectorizer(
	            max_features=2000,
	            stop_words='english',
	            ngram_range=(1, 2),  # Include bigrams for better ingredient matching
	            lowercase=True,
	        )

	        print("✅ Models initialized successfully!")

	    def compute_embeddings(self):
	        """Embed every recipe's ``ingredients_text`` with each model in ``self.models``.

	        Results are stored in ``self.embeddings`` under the model's name.
	        The TF-IDF vectorizer is fitted here as a side effect.
	        """
	        print("📊 Computing embeddings...")

	        # Only the ingredient text is embedded; instructions are not used for matching.
	        cleaned_texts = [
	            self._clean_text(text)
	            for text in self.recipe_df['ingredients_text'].tolist()
	        ]

	        for model_name, model in self.models.items():
	            print(f" Computing embeddings with {model_name}...")

	            if model_name == 'tfidf':
	                # Densify so every model yields a plain array for cosine_similarity.
	                embeddings = model.fit_transform(cleaned_texts).toarray()
	            else:
	                embeddings = model.encode(cleaned_texts, show_progress_bar=True)

	            self.embeddings[model_name] = embeddings
	            print(f" ✓ {model_name}: Shape {embeddings.shape}")

	        print("✅ Embeddings computed for all models!")

	    def evaluate_models(self, sample_size=50, top_n=10):
	        """Score each model and store the winner's name in ``self.best_model``.

	        For a random sample of recipes, the score is the mean cosine
	        similarity to the ``top_n`` most similar *other* recipes, averaged
	        over the sample. The highest score wins. The sample is drawn with
	        ``np.random.choice`` and no fixed seed, so scores vary between runs.

	        Args:
	            sample_size: Number of recipes to sample (capped at the dataset size).
	            top_n: Number of nearest neighbours averaged per sampled recipe
	                (capped at the number of other recipes available).

	        Returns:
	            dict mapping model name to its score as a float. A model scores
	            0.0 when no neighbours could be evaluated.

	        Raises:
	            ValueError: If no models have been initialized.
	        """
	        if not self.models:
	            raise ValueError(
	                "No embedding models to evaluate; call initialize_embedding_models() first."
	            )

	        print("📈 Evaluating embedding models...")

	        results = {}
	        n_recipes = len(self.recipe_df)
	        sample_size = min(sample_size, n_recipes)

	        # The same sampled recipes are reused for every model so scores are comparable.
	        sample_indices = np.random.choice(n_recipes, sample_size, replace=False)

	        for model_name in self.models:
	            print(f" Evaluating {model_name}...")
	            all_embeddings = self.embeddings[model_name]
	            similarities = []

	            for idx in sample_indices:
	                query_embedding = all_embeddings[idx:idx + 1]
	                sim_scores = cosine_similarity(query_embedding, all_embeddings)[0]

	                # Drop the recipe's similarity to itself before ranking.
	                other_scores = np.delete(sim_scores, idx)

	                # Cap k so np.partition never receives an out-of-range kth
	                # on datasets with fewer than top_n other recipes.
	                k = min(top_n, other_scores.size)
	                if k <= 0:
	                    continue
	                top_scores = np.partition(other_scores, -k)[-k:]
	                similarities.append(np.mean(top_scores))

	            results[model_name] = float(np.mean(similarities)) if similarities else 0.0
	            print(f" {model_name}: Average top-10 similarity = {results[model_name]:.4f}")

	        # Select best model (highest average similarity is treated as better clustering)
	        self.best_model = max(results, key=results.get)
	        print(f"🏆 Best model selected: {self.best_model}")
	        print("📝 Model performance ranking:")
	        ranking = sorted(results.items(), key=lambda item: item[1], reverse=True)
	        for position, (model, score) in enumerate(ranking, 1):
	            print(f" {position}. {model}: {score:.4f}")

	        return results

	    def build_recommendation_engine(self):
	        """Select the embeddings of ``self.best_model`` for use by ``find_similar_recipes``.

	        Raises:
	            KeyError: If ``self.best_model`` has no entry in ``self.embeddings``
	                (for example when ``evaluate_models`` has not been run).
	        """
	        print("🔧 Building recommendation engine...")

	        self.best_embeddings = self.embeddings[self.best_model]
	        print(f"✅ Recommendation engine ready with {self.best_model}!")
	        print(f" Embedding dimensions: {self.best_embeddings.shape}")

	    def find_similar_recipes(self, ingredients_input, top_k=3, similarity_threshold=0.7):
	        """Return the ``top_k`` recipes most similar to *ingredients_input*.

	        Args:
	            ingredients_input: Free-text ingredient list from the user.
	            top_k: Maximum number of recipes to return; values <= 0 yield ``[]``.
	            similarity_threshold: Accepted for backward compatibility but not
	                applied; results are never filtered by score.

	        Returns:
	            list of dicts with keys ``title``, ``ingredients`` (at most 300
	            characters plus '...'), ``directions`` (at most 400 characters
	            plus '...') and ``similarity`` (Python float), best match first.
	        """
	        # A slice of [-0:] would select every recipe, so handle this up front.
	        if top_k <= 0:
	            return []

	        clean_input = self._clean_text(ingredients_input)

	        # Embed the query with the same model that produced best_embeddings.
	        model = self.models[self.best_model]
	        if self.best_model == 'tfidf':
	            input_embedding = model.transform([clean_input]).toarray()
	        else:
	            input_embedding = model.encode([clean_input])
	        input_embedding = np.asarray(input_embedding).reshape(1, -1)

	        similarities = cosine_similarity(input_embedding, self.best_embeddings)[0]

	        # Highest scores first; nothing is excluded because the query is not in the dataset.
	        top_indices = np.argsort(similarities)[-top_k:][::-1]

	        similar_recipes = []
	        for idx in top_indices:
	            recipe_data = self.recipe_df.iloc[idx]
	            similar_recipes.append({
	                'title': recipe_data.get('title', f'Recipe {idx}'),
	                'ingredients': self._truncate(recipe_data['ingredients_text'], 300),
	                'directions': self._truncate(recipe_data['directions_text'], 400),
	                'similarity': float(similarities[idx]),  # Convert to Python float
	            })

	        return similar_recipes

	# Initialize and set up the embedding engine.
	# The calls below are order-dependent: models must exist before embeddings
	# are computed, embeddings before evaluation, and evaluation (which sets
	# best_model) before the recommendation engine is built.
	print("🚀 Starting embedding engine setup...")
	embedding_engine = RecipeEmbeddingEngine(recipe_df)
	embedding_engine.initialize_embedding_models()
	embedding_engine.compute_embeddings()
	# Per-model scores are kept at module level; nothing else in this file reads them.
	model_results = embedding_engine.evaluate_models()
	embedding_engine.build_recommendation_engine()
	print("✅ Embedding engine setup complete!")

	# ===================================================================
	# 3. SYNTHETIC GENERATION
	# ===================================================================

	# Console section header for the generation stage.
	print("\n" + "="*50)
	print("3. SYNTHETIC GENERATION")
	print("="*50)

	class RecipeGenerator:
	    """Generate recipe text from an ingredient list with a GPT-2 text-generation pipeline.

	    Attributes:
	        generator: The transformers pipeline, or ``None`` until
	            ``initialize_generator`` is called.
	        embedding_engine: Reserved slot; never assigned or read in this class.
	    """

	    def __init__(self):
	        self.generator = None
	        self.embedding_engine = None

	    def initialize_generator(self):
	        """Create the GPT-2 text-generation pipeline and store it in ``self.generator``."""
	        print("🤖 Initializing recipe generation model...")

	        # Use GPT-2 for recipe generation with optimized settings
	        model_name = "gpt2"

	        self.generator = pipeline(
	            "text-generation",
	            model=model_name,
	            tokenizer=model_name,
	            max_length=300,
	            temperature=0.8,  # Higher temperature for more creativity
	            do_sample=True,
	            pad_token_id=50256,
	            eos_token_id=50256,
	        )

	        print("✅ Generation model initialized!")

	    def generate_recipe(self, ingredients_input):
	        """Generate one recipe for *ingredients_input*.

	        Three candidates are sampled. The longest one under 600 characters is
	        kept, falling back to the first candidate when none qualifies. The
	        first line becomes the title and the remainder the directions
	        (truncated to 400 characters plus '...').

	        This method never raises. If generation fails for any reason,
	        including an uninitialized generator, the error is printed and a
	        generic fallback recipe is returned.

	        Returns:
	            dict with keys ``title``, ``ingredients``, ``directions`` and
	            ``similarity`` (always 1.0).
	        """
	        print("🎯 Generating new recipe...")

	        # Strip list-literal punctuation so the prompt reads as plain text.
	        clean_ingredients = (
	            str(ingredients_input).replace('[', '').replace(']', '').replace("'", "").strip()
	        )

	        # Structured prompt nudging the model toward Title/Ingredients/Instructions.
	        prompt = (
	            f"Write a detailed recipe in English that uses ALL of the following ingredients: {clean_ingredients}.\n"
	            f"You MUST include every ingredient listed exactly.\n"
	            f"Title:\nIngredients:\nInstructions:\n"
	            f"Please make the recipe clear and easy to follow."
	        )

	        try:
	            # NOTE(review): max_length limits prompt + generated tokens together,
	            # so a long ingredient list leaves little or no room for output.
	            # Consider max_new_tokens once the desired output budget is agreed.
	            generated_texts = self.generator(
	                prompt,
	                max_length=300,
	                num_return_sequences=3,
	                temperature=0.8,
	                do_sample=True,
	                pad_token_id=50256,
	            )

	            # The pipeline echoes the prompt; remove it from every candidate.
	            candidates = [
	                gen['generated_text'].replace(prompt, "").strip()
	                for gen in generated_texts
	            ]

	            best_generation = ""
	            for text in candidates:
	                if len(best_generation) < len(text) < 600:
	                    best_generation = text

	            if not best_generation:
	                best_generation = candidates[0]

	            lines = best_generation.split('\n')
	            title = lines[0] if lines else "AI-Generated Recipe"
	            instructions = '\n'.join(lines[1:]) if len(lines) > 1 else best_generation

	            # The model often echoes the "Title:" label from the prompt.
	            if title.lower().startswith("title:"):
	                title = title[len("title:"):].strip()

	            if len(instructions) > 400:
	                instructions = instructions[:400] + "..."

	            return {
	                'title': title[:100] if title else 'Creative AI Recipe',
	                'ingredients': clean_ingredients,
	                'directions': instructions,
	                'similarity': 1.0,
	            }

	        except Exception as e:
	            # Best-effort by design: the UI should always get a recipe back.
	            # The error is reported here so failures are not invisible.
	            print(f" ⚠️ Recipe generation failed, using fallback recipe: {e}")
	            return {
	                'title': 'Simple AI Recipe',
	                'ingredients': clean_ingredients,
	                'directions': f"Create a delicious dish using {clean_ingredients}. Mix ingredients well, cook to preference, and enjoy!",
	                'similarity': 1.0,
	            }

	    def generate_synthetic_dataset(self, num_examples=10, source_df=None):
	        """Generate recipes seeded with real ingredient lists.

	        Args:
	            num_examples: Number of recipes requested (capped at the number
	                of rows available).
	            source_df: DataFrame with an ``ingredients_text`` column to sample
	                from. Defaults to the module-level ``recipe_df``.

	        Returns:
	            list of recipe dicts in the format of ``generate_recipe``.
	        """
	        print(f"📄 Generating {num_examples} synthetic recipes...")

	        frame = recipe_df if source_df is None else source_df

	        # Sample real ingredients from the dataset for more realistic generation
	        sample_recipes = frame.sample(min(num_examples, len(frame)))
	        total = len(sample_recipes)

	        synthetic_recipes = []
	        for i, (_, recipe) in enumerate(sample_recipes.iterrows(), 1):
	            try:
	                # Use real ingredients but generate new instructions
	                ingredients = recipe['ingredients_text'][:100]  # Limit length

	                print(f" Generating recipe {i}/{total} with: {ingredients[:50]}...")

	                synthetic_recipes.append(self.generate_recipe(ingredients))

	            except Exception as e:
	                print(f" Error generating recipe {i}: {e}")
	                # Add a simple fallback so the output length matches the sample.
	                synthetic_recipes.append({
	                    'title': f'Recipe Variation {i}',
	                    'ingredients': str(recipe.get('ingredients_text', ''))[:100],
	                    'directions': 'A creative variation of this classic recipe...',
	                    'similarity': 1.0,
	                })

	        print(f"✅ Generated {len(synthetic_recipes)} synthetic recipes!")

	        # Show sample generated recipe
	        if synthetic_recipes:
	            sample = synthetic_recipes[0]
	            print("\n🍽 Sample generated recipe:")
	            print(f" Title: {sample['title']}")
	            print(f" Ingredients: {sample['ingredients'][:80]}...")
	            print(f" Instructions: {sample['directions'][:100]}...")

	        return synthetic_recipes

	    def test_generation_quality(self, test_ingredients_list):
	        """Print generated titles and instructions for the first three ingredient lists."""
	        print("🧪 Testing generation quality...")

	        for i, ingredients in enumerate(test_ingredients_list[:3], 1):
	            print(f"\n--- Test {i}: {ingredients} ---")
	            recipe = self.generate_recipe(ingredients)
	            print(f"Generated Title: {recipe['title']}")
	            print(f"Generated Instructions: {recipe['directions'][:150]}...")

	# Initialize the generator (creates the GPT-2 pipeline at import time).
	print("🚀 Starting recipe generator setup...")
	recipe_generator = RecipeGenerator()
	recipe_generator.initialize_generator()

	# Test with sample ingredients.
	# NOTE: this smoke test runs on every startup; each entry triggers one
	# generate_recipe call, and each call requests 3 sampled sequences.
	test_ingredients = [
	"chicken, garlic, tomatoes, basil",
	"pasta, cheese, mushrooms",
	"salmon, lemon, herbs"
	]
	recipe_generator.test_generation_quality(test_ingredients)

	# Generate synthetic examples for testing.
	# The result is only used for its length in the __main__ startup summary.
	synthetic_examples = recipe_generator.generate_synthetic_dataset(5) # Reduced for speed
	print("✅ Recipe generator setup complete!")

	# ===================================================================
	# 4. USER INTERFACE – HUGGING FACE SPACE
	# ===================================================================

	# Console section header for the UI stage.
	print("\n" + "="*50)
	print("4. USER INTERFACE – HUGGING FACE SPACE")
	print("="*50)

	def recipe_recommendation_interface(ingredients_input):
	    """Gradio callback: turn an ingredient string into a Markdown report.

	    The report lists the three closest recipes from ``embedding_engine``
	    followed by one recipe from ``recipe_generator``. Inputs with fewer
	    than three non-blank characters get a short warning instead. Any
	    exception is caught and returned as a user-facing error message.
	    """
	    try:
	        print(f"🔍 Processing request: {ingredients_input}")

	        # Guard clause: reject empty or near-empty input before doing any work.
	        has_enough_text = bool(ingredients_input) and len(ingredients_input.strip()) >= 3
	        if not has_enough_text:
	            return "⚠️ Please enter at least 3 characters for ingredients (e.g., 'chicken, garlic, tomato')"

	        # Retrieval first, then generation.
	        matches = embedding_engine.find_similar_recipes(ingredients_input, top_k=3)
	        ai_recipe = recipe_generator.generate_recipe(ingredients_input)

	        # Collect Markdown fragments and join them once at the end.
	        parts = [
	            "# 🍳 RecipeGenius Results\n\n",
	            f"Your ingredients: {ingredients_input}\n\n",
	            "## 📚 Similar Recipes from Database:\n\n",
	        ]

	        if matches:
	            for rank, match in enumerate(matches, 1):
	                # Drop a stray "Title:" label and replace placeholder titles.
	                heading = match['title'].replace('Title:', '').strip()
	                if heading.lower() in ('', 'untitled', 'recipe'):
	                    heading = f"Recipe #{rank}"

	                # Remove list punctuation and cap the ingredient preview.
	                ingredient_preview = match['ingredients']
	                for unwanted in ('[', ']', "'"):
	                    ingredient_preview = ingredient_preview.replace(unwanted, '')
	                if len(ingredient_preview) > 200:
	                    ingredient_preview = ingredient_preview[:200] + "..."

	                # Cap the directions preview.
	                steps_preview = match['directions']
	                if len(steps_preview) > 300:
	                    steps_preview = steps_preview[:300] + "..."

	                parts.extend([
	                    f"### {rank}. {heading}\n",
	                    f"🥘 Ingredients: {ingredient_preview}\n\n",
	                    f"👨‍🍳 Instructions: {steps_preview}\n\n",
	                    f"📊 Similarity Score: {match['similarity']:.3f}\n\n",
	                    "---\n\n",
	                ])
	        else:
	            parts.append("⚠️ No similar recipes found. Try different ingredients!\n\n")

	        # AI-generated recipe section.
	        ai_heading = ai_recipe['title'].replace('Title:', '').strip() or "Creative AI Recipe"
	        parts.extend([
	            "## 🎨 AI-Generated Recipe:\n\n",
	            f"### 🤖 {ai_heading}\n\n",
	            f"🥘 Using your ingredients: {ai_recipe['ingredients']}\n\n",
	            f"👨‍🍳 AI Instructions: {ai_recipe['directions']}\n\n",
	        ])

	        # Footer.
	        parts.extend([
	            "---\n\n",
	            "💡 Tip: Try different ingredient combinations for more variety!\n\n",
	            f"🔬 Powered by {embedding_engine.best_model} embeddings and GPT-2 generation",
	        ])

	        report = "".join(parts)
	        print("✅ Request processed successfully")
	        return report

	    except Exception as e:
	        # Top-level UI boundary: report the problem to the user instead of crashing.
	        print(f"❌ Error in interface: {e}")
	        return (
	            f"❌ Error occurred: {str(e)}\n\n"
	            "Please try:\n"
	            "- Using common ingredients (chicken, pasta, tomato, etc.)\n"
	            "- Checking your spelling\n"
	            "- Using simpler ingredient names\n\n"
	            "If the problem persists, try refreshing the page."
	        )

	# Example inputs offered by the Gradio UI (passed as `examples=` when the
	# interface is built). Each entry is a comma-separated ingredient list,
	# matching the format the input textbox asks for.
	example_ingredients = [
	"chicken, garlic, tomatoes, basil, olive oil",
	"pasta, cheese, mushrooms, cream, herbs",
	"salmon, lemon, dill, potatoes, butter",
	"eggs, bacon, cheese, spinach, onion",
	"beef, soy sauce, ginger, vegetables, rice",
	"chocolate, vanilla, flour, eggs, sugar",
	"avocado, lime, cilantro, onion, tomato"
	]

	def create_gradio_interface():
	    """Build and return the Gradio ``Interface`` for the app (it is not launched here).

	    The interface wires ``recipe_recommendation_interface`` to a textbox
	    input and a Markdown output, with ``example_ingredients`` as examples.
	    """
	    print("🚀 Creating enhanced Gradio interface...")

	    # Stylesheet overrides: container width and gradient buttons.
	    custom_css = """
	.gradio-container {
	max-width: 1000px !important;
	}
	.gr-button {
	background: linear-gradient(45deg, #ff6b6b, #ee5a24) !important;
	border: none !important;
	color: white !important;
	}
	.gr-button:hover {
	background: linear-gradient(45deg, #ee5a24, #ff6b6b) !important;
	}
	"""

	    # Input component: free-text ingredient list.
	    ingredients_box = gr.Textbox(
	        label="🥘 Enter your ingredients (comma-separated)",
	        placeholder="e.g., chicken, garlic, tomatoes, basil, olive oil",
	        lines=3,
	        info="List the ingredients you have available. Be specific for better results!",
	    )

	    # Output component: the Markdown report produced by the callback.
	    results_panel = gr.Markdown(
	        label="🍽️ Recipe Recommendations",
	        elem_classes=["recipe-output"],
	    )

	    # Text shown above the form.
	    intro_text = """
	Transform your ingredients into amazing recipes! 🎯

	Enter the ingredients you have at home and get:
	• 3 similar recipes from our database of real recipes
	• 1 AI-generated recipe created just for your ingredients

	How it works: Advanced AI embeddings analyze ingredient combinations and GPT-2 generates creative new recipes.

	---
	"""

	    colour_theme = gr.themes.Soft(
	        primary_hue="orange",
	        secondary_hue="amber",
	    )

	    # Text shown below the form.
	    footer_text = """
	---
	### 🔬 Technical Details:
	- Dataset: Kaggle Food Recipes (2000+ recipes)
	- Embeddings: Multi-model comparison (SBERT, Multi-QA, TF-IDF)
	- Generation: GPT-2 fine-tuned for recipe creation
	- Similarity: Cosine similarity matching

	### 🔍 Tips for better results:
	- Use common ingredient names (e.g., "tomato" instead of "heirloom tomato")
	- Include 3-7 ingredients for optimal results
	- Try different combinations for variety

	Made with ❤️ for food lovers and AI enthusiasts!
	"""

	    return gr.Interface(
	        fn=recipe_recommendation_interface,
	        inputs=ingredients_box,
	        outputs=results_panel,
	        title="🍳 RecipeGenius - AI Recipe Recommendation System",
	        description=intro_text,
	        examples=example_ingredients,
	        theme=colour_theme,
	        css=custom_css,
	        allow_flagging="never",
	        article=footer_text,
	    )

	# Script entry point: print a startup summary, then build and launch the UI.
	if __name__ == "__main__":
	    print(
	        "\n🎉 RecipeGenius is ready!",
	        "📊 System Summary:",
	        f" - Dataset size: {len(recipe_df)} recipes",
	        f" - Best embedding model: {embedding_engine.best_model}",
	        f" - Generated synthetic examples: {len(synthetic_examples)}",
	        " - Deployment files created: ✅",
	        "\n🚀 Starting Gradio interface...",
	        sep="\n",
	    )

	    interface = create_gradio_interface()
	    interface.launch()