Spaces:

sethmcknight
/

msse-ai-engineering

Sleeping

File size: 4,596 Bytes

ffa0f3d
 
7793bb6
ffa0f3d
 
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
7793bb6
 
ffa0f3d
 
 
7793bb6
 
ffa0f3d
 
 
 
7793bb6
 
 
ffa0f3d
 
7793bb6
ffa0f3d
7793bb6
 
 
 
 
ffa0f3d
 
 
 
 
7793bb6
ffa0f3d
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
7793bb6
 
ffa0f3d
 
 
 
7793bb6
ffa0f3d
 
 
7793bb6
ffa0f3d
 
 
7793bb6
 
ffa0f3d
 
 
 
7793bb6
ffa0f3d
89aa2b4
ffa0f3d
 
 
 
 
 
 
 
 
89aa2b4
ffa0f3d
89aa2b4
ffa0f3d
 
 
 
89aa2b4
 
 
7793bb6
ffa0f3d
7793bb6
 
 
ffa0f3d
7793bb6
ffa0f3d
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
7793bb6
 
 
 
 
 
ffa0f3d
 
89aa2b4
7793bb6
 
ffa0f3d
 
 
 
7793bb6
ffa0f3d
7793bb6
 
 
 
ffa0f3d
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
7793bb6
 
 
 
 
 
ffa0f3d
7793bb6

from src.ingestion.document_chunker import DocumentChunker


def test_chunk_by_characters():
    """Check that character-based chunking splits text and overlaps chunks.

    A 250-character text is chunked with ``chunk_size=50`` and ``overlap=10``.
    The test requires more than one chunk, no chunk longer than 50
    characters, and the last 10 characters of the first chunk to appear
    within the first 20 characters of the second chunk.
    """
    chunker = DocumentChunker(chunk_size=50, overlap=10)

    text = "This is a test document. " * 10  # 25 characters x 10 = 250
    chunks = chunker.chunk_text(text)

    # Requiring multiple chunks also guarantees chunks[1] exists for the
    # overlap check below, so no additional length guard is needed.
    assert len(chunks) > 1
    assert all(len(chunk["content"]) <= 50 for chunk in chunks)

    # The tail of the first chunk must reappear near the start of the second.
    first_tail = chunks[0]["content"][-10:]
    second_head = chunks[1]["content"][:20]
    assert first_tail in second_head


def test_chunk_with_metadata():
    """Check that every chunk from ``chunk_document`` carries document metadata.

    Each chunk's metadata must keep the source ``filename`` and ``file_type``
    and must contain ``chunk_id`` and ``chunk_index`` entries.
    """
    source_meta = {"filename": "test.txt", "file_type": "txt", "source_id": "doc_001"}
    body = "Content that will be chunked. " * 20

    chunker = DocumentChunker(chunk_size=100, overlap=20)
    pieces = chunker.chunk_document(body, source_meta)

    for piece in pieces:
        meta = piece["metadata"]
        # Values copied from the document-level metadata.
        assert meta["filename"] == "test.txt"
        assert meta["file_type"] == "txt"
        # Keys expected to be added per chunk.
        assert "chunk_id" in meta
        assert "chunk_index" in meta


def test_reproducible_chunking():
    """Check that two chunkers built with the same seed produce equal chunks.

    Both chunkers use ``chunk_size=100``, ``overlap=20`` and ``seed=42`` and
    are run over the same text; the chunk counts and the per-chunk
    ``content`` values must match.
    """
    settings = {"chunk_size": 100, "overlap": 20, "seed": 42}
    sample = "This text will be chunked reproducibly. " * 30

    # Build both chunkers before using either, as a pair of fresh instances.
    first_chunker = DocumentChunker(**settings)
    second_chunker = DocumentChunker(**settings)

    first_run = first_chunker.chunk_text(sample)
    second_run = second_chunker.chunk_text(sample)

    assert len(first_run) == len(second_run)

    first_contents = [piece["content"] for piece in first_run]
    second_contents = [piece["content"] for piece in second_run]
    assert first_contents == second_contents


def test_empty_text_chunking():
    """Check chunking of an empty string and of a very short string.

    With ``chunk_size=100`` the empty string must give zero chunks, and the
    five-character text ``"Short"`` must give exactly one chunk whose
    ``content`` is the text itself.
    """
    chunker = DocumentChunker(chunk_size=100, overlap=20)

    # Nothing in, nothing out.
    from_empty = chunker.chunk_text("")
    assert len(from_empty) == 0

    # Input far below chunk_size: one chunk holding the whole text.
    from_short = chunker.chunk_text("Short")
    assert len(from_short) == 1
    assert from_short[0]["content"] == "Short"


def test_chunk_real_policy_content():
    """Check chunking of markdown text shaped like a policy document.

    A handbook-style snippet (intro plus a code-of-conduct section, repeated
    three times) is chunked with ``chunk_size=500``, ``overlap=100`` and
    ``seed=42``. The test requires more than one chunk, correct inherited
    and per-chunk metadata on every chunk, no chunk longer than 500
    characters, and the last 100 characters of the first chunk to appear
    within the first 200 characters of the second chunk.
    """
    chunker = DocumentChunker(chunk_size=500, overlap=100, seed=42)

    # Use content that resembles our policy documents
    policy_intro = """# HR-POL-001: Employee Handbook

**Effective Date:** 2025-01-01
**Revision:** 1.1
**Owner:** Human Resources

## 1. Introduction

### 1.1. A Message from Our CEO

Welcome to Innovate Inc.! We are thrilled to have you as part of our team."""

    policy_conduct = """
## 2. Company Policies

### 2.1. Code of Conduct

All employees must adhere to our code of conduct."""

    # Repeat the snippet so the text is long enough to need several chunks.
    policy_content = (policy_intro + policy_conduct) * 3

    doc_metadata = {
        "filename": "employee_handbook.md",
        "file_type": "md",
        "file_path": "/path/to/employee_handbook.md",
    }

    chunks = chunker.chunk_document(policy_content, doc_metadata)

    # Requiring multiple chunks also guarantees chunks[1] exists for the
    # overlap check at the end, so no additional length guard is needed.
    assert len(chunks) > 1

    # Verify all chunks have proper metadata and respect the size limit
    for i, chunk in enumerate(chunks):
        metadata = chunk["metadata"]
        assert metadata["filename"] == "employee_handbook.md"
        assert metadata["file_type"] == "md"
        # chunk_index must follow the order in which chunks are returned.
        assert metadata["chunk_index"] == i
        assert "chunk_id" in metadata
        assert len(chunk["content"]) <= 500

    # Verify overlap exists between the first two consecutive chunks
    first_tail = chunks[0]["content"][-100:]
    second_head = chunks[1]["content"][:200]
    assert first_tail in second_head


def test_chunk_metadata_inheritance():
    """Check that chunks keep document metadata and gain chunk-level keys.

    Every chunk's metadata must hold the original ``filename``,
    ``file_type``, ``file_size`` and ``file_path`` values, and must also
    contain ``chunk_index``, ``chunk_id``, ``start_pos`` and ``end_pos``.
    """
    doc_metadata = {
        "filename": "test_policy.md",
        "file_type": "md",
        "file_size": 1500,
        "file_path": "/absolute/path/to/test_policy.md",
    }
    # Snapshot the expected values before the chunker sees the dict.
    inherited = dict(doc_metadata)
    chunk_level_keys = ("chunk_index", "chunk_id", "start_pos", "end_pos")

    chunker = DocumentChunker(chunk_size=100, overlap=20)
    body = "Policy content goes here. " * 20
    pieces = chunker.chunk_document(body, doc_metadata)

    for piece in pieces:
        meta = piece["metadata"]

        # Document-level entries must come through unchanged.
        for key, expected in inherited.items():
            assert meta[key] == expected

        # Chunk-specific entries must have been added.
        for key in chunk_level_keys:
            assert key in meta