from typing import List

from src.embedding.embedding_service import EmbeddingService


def test_embedding_service_initialization():
    """Test EmbeddingService initialization"""
    service = EmbeddingService()

    assert service is not None
    assert service.model_name == "sentence-transformers/all-MiniLM-L6-v2"
    assert service.device == "cpu"


def test_embedding_service_with_custom_config():
    """Test EmbeddingService initialization with custom configuration"""
    service = EmbeddingService(model_name="all-MiniLM-L12-v2", device="cpu", batch_size=16)

    assert service.model_name == "all-MiniLM-L12-v2"
    assert service.device == "cpu"
    assert service.batch_size == 16


def test_single_text_embedding():
    """Test embedding generation for a single text"""
    service = EmbeddingService()
    text = "This is a test document about company policies."

    embedding = service.embed_text(text)

    # Should return a list of floats (embedding vector)
    assert isinstance(embedding, list)
    assert len(embedding) == 384  # all-MiniLM-L6-v2 embedding dimension
    assert all(isinstance(x, (float, int)) for x in embedding)


def test_batch_text_embedding():
    """Test embedding generation for multiple texts"""
    service = EmbeddingService()
    texts = [
        "This is the first document about remote work policy.",
        "This is the second document about employee benefits.",
        "This is the third document about code of conduct.",
    ]

    embeddings = service.embed_texts(texts)

    # Should return a list of embeddings
    assert isinstance(embeddings, list)
    assert len(embeddings) == 3

    # Each embedding should have the correct dimension
    for embedding in embeddings:
        assert isinstance(embedding, list)
        assert len(embedding) == 384
        assert all(isinstance(x, (float, int)) for x in embedding)


def test_embedding_consistency():
    """Test that the same text produces the same embedding"""
    service = EmbeddingService()
    text = "Consistent embedding test text."

    embedding1 = service.embed_text(text)
    embedding2 = service.embed_text(text)

    # Should be identical (deterministic)
    assert embedding1 == embedding2


def test_different_texts_different_embeddings():
    """Test that different texts produce different embeddings"""
    service = EmbeddingService()
    text1 = "This is about remote work policy."
    text2 = "This is about employee benefits and healthcare."

    embedding1 = service.embed_text(text1)
    embedding2 = service.embed_text(text2)

    # Should be different
    assert embedding1 != embedding2
    # But should have the same dimension
    assert len(embedding1) == len(embedding2) == 384


def test_empty_text_handling():
    """Test handling of empty or whitespace-only text"""
    service = EmbeddingService()

    # Empty string
    embedding_empty = service.embed_text("")
    assert isinstance(embedding_empty, list)
    assert len(embedding_empty) == 384

    # Whitespace only
    embedding_whitespace = service.embed_text(" \n\t ")
    assert isinstance(embedding_whitespace, list)
    assert len(embedding_whitespace) == 384


def test_very_long_text_handling():
    """Test handling of very long texts"""
    service = EmbeddingService()
    # Create a very long text (should exercise tokenization limits)
    long_text = "This is a very long document. " * 1000  # ~30,000 characters

    embedding = service.embed_text(long_text)

    assert isinstance(embedding, list)
    assert len(embedding) == 384


def test_batch_size_handling():
    """Test that batch processing works correctly"""
    service = EmbeddingService(batch_size=2)  # Small batch for testing
    texts = [
        "Text one about policy",
        "Text two about procedures",
        "Text three about guidelines",
        "Text four about regulations",
        "Text five about rules",
    ]

    embeddings = service.embed_texts(texts)

    # Should process all texts despite the small batch size
    assert len(embeddings) == 5

    # All embeddings should be valid
    for embedding in embeddings:
        assert len(embedding) == 384


def test_special_characters_handling():
    """Test handling of special characters and unicode"""
    service = EmbeddingService()
    texts_with_special_chars = [
        "Policy with emojis 😀 and úñicode",
        "Text with numbers: 123,456.78 and symbols @#$%",
        "Markdown: # Header\n## Subheader\n- List item",
        "Mixed: Policy-2024 (v1.2) — updated 12/01/2025",
    ]

    embeddings = service.embed_texts(texts_with_special_chars)

    assert len(embeddings) == 4
    for embedding in embeddings:
        assert len(embedding) == 384


def test_similarity_makes_sense():
    """Test that semantically similar texts have similar embeddings"""
    service = EmbeddingService()

    # Similar texts
    text1 = "Employee remote work policy guidelines"
    text2 = "Guidelines for working from home policies"
    # Different text
    text3 = "Financial expense reimbursement procedures"

    embed1 = service.embed_text(text1)
    embed2 = service.embed_text(text2)
    embed3 = service.embed_text(text3)

    # Simple cosine similarity helper (for validation only)
    def cosine_similarity(a: List[float], b: List[float]) -> float:
        import numpy as np

        a_np = np.array(a)
        b_np = np.array(b)
        return float(np.dot(a_np, b_np) / (np.linalg.norm(a_np) * np.linalg.norm(b_np)))

    sim_1_2 = cosine_similarity(embed1, embed2)  # Similar texts
    sim_1_3 = cosine_similarity(embed1, embed3)  # Different texts

    # Similar texts should score higher than different texts
    assert sim_1_2 > sim_1_3
    assert sim_1_2 > 0.5  # Should be reasonably similar


def test_model_loading_performance():
    """Test that model loading doesn't happen repeatedly"""
    # This test ensures the model is cached after the first load
    import time

    start_time = time.time()
    EmbeddingService()  # First service
    first_load_time = time.time() - start_time

    start_time = time.time()
    EmbeddingService()  # Second service
    second_load_time = time.time() - start_time

    # Second initialization should be faster (model already cached).
    # Note: this may not always hold depending on the implementation,
    # but it checks the general behavior.
    assert second_load_time <= first_load_time * 2  # Allow some variance