msse-ai-engineering / tests /test_ingestion /test_document_chunker.py
Tobias Pasquale
feat: Add comprehensive local CI/CD testing infrastructure
89aa2b4
from src.ingestion.document_chunker import DocumentChunker
def test_chunk_by_characters():
    """Check that character chunking yields several bounded, overlapping chunks."""
    max_chars = 50
    shared_chars = 10
    sample = "This is a test document. " * 10  # 25 characters x 10 = 250

    pieces = DocumentChunker(chunk_size=max_chars, overlap=shared_chars).chunk_text(
        sample
    )

    # The sample is longer than one chunk, so it must be split.
    assert len(pieces) > 1

    # No chunk may be longer than the configured chunk size.
    for piece in pieces:
        assert len(piece["content"]) <= max_chars

    # The tail of the first chunk must reappear near the start of the second.
    # (At least two chunks are guaranteed by the length assertion above.)
    assert (
        pieces[0]["content"][-shared_chars:]
        in pieces[1]["content"][: 2 * shared_chars]
    )
def test_chunk_with_metadata():
    """Test that chunks preserve document metadata.

    Chunks a 600-character text together with a metadata dict and checks that
    every resulting chunk carries the document's ``filename`` and
    ``file_type`` values plus the chunk-level ``chunk_id`` and
    ``chunk_index`` keys.
    """
    chunker = DocumentChunker(chunk_size=100, overlap=20)
    doc_metadata = {"filename": "test.txt", "file_type": "txt", "source_id": "doc_001"}
    text = "Content that will be chunked. " * 20  # 30 characters x 20 = 600

    chunks = chunker.chunk_document(text, doc_metadata)

    # Guard against a vacuous pass: with an empty result the loop below would
    # run zero times and the test would succeed without checking anything.
    assert len(chunks) > 0, "chunk_document returned no chunks for non-empty text"

    for chunk in chunks:
        # Document-level values are copied onto each chunk.
        assert chunk["metadata"]["filename"] == "test.txt"
        assert chunk["metadata"]["file_type"] == "txt"
        # Chunk-level keys are added by the chunker.
        assert "chunk_id" in chunk["metadata"]
        assert "chunk_index" in chunk["metadata"]
def test_reproducible_chunking():
    """Check that two identically configured, same-seed chunkers agree."""
    settings = {"chunk_size": 100, "overlap": 20, "seed": 42}

    # Build both chunkers before either one processes the text.
    first_chunker = DocumentChunker(**settings)
    second_chunker = DocumentChunker(**settings)

    sample = "This text will be chunked reproducibly. " * 30
    first_run = first_chunker.chunk_text(sample)
    second_run = second_chunker.chunk_text(sample)

    # Same number of chunks, and the same content at every position.
    assert len(first_run) == len(second_run)
    for position in range(len(first_run)):
        assert first_run[position]["content"] == second_run[position]["content"]
def test_empty_text_chunking():
    """Check chunking of an empty string and of text shorter than one chunk."""
    chunker = DocumentChunker(chunk_size=100, overlap=20)

    # An empty string gives no chunks at all.
    empty_result = chunker.chunk_text("")
    assert len(empty_result) == 0

    # Text shorter than the chunk size comes back as a single, unchanged chunk.
    brief = "Short"
    brief_result = chunker.chunk_text(brief)
    assert len(brief_result) == 1
    only_chunk = brief_result[0]
    assert only_chunk["content"] == brief
def test_chunk_real_policy_content():
    """Chunk handbook-style markdown and check sizes, metadata, and overlap."""
    # Markdown fragments that resemble the project's policy documents.
    intro_section = """# HR-POL-001: Employee Handbook
**Effective Date:** 2025-01-01
**Revision:** 1.1
**Owner:** Human Resources
## 1. Introduction
### 1.1. A Message from Our CEO
Welcome to Innovate Inc.! We are thrilled to have you as part of our team."""
    conduct_section = """
## 2. Company Policies
### 2.1. Code of Conduct
All employees must adhere to our code of conduct."""
    # Repeated three times so the text spans more than one 500-character chunk.
    handbook_text = (intro_section + conduct_section) * 3

    handbook_meta = {
        "filename": "employee_handbook.md",
        "file_type": "md",
        "file_path": "/path/to/employee_handbook.md",
    }

    chunker = DocumentChunker(chunk_size=500, overlap=100, seed=42)
    pieces = chunker.chunk_document(handbook_text, handbook_meta)

    # The document must have been split into several chunks.
    assert len(pieces) > 1

    # Each chunk keeps the document metadata, is indexed by position, has an
    # identifier, and stays within the configured size.
    for position, piece in enumerate(pieces):
        assert piece["metadata"]["filename"] == "employee_handbook.md"
        assert piece["metadata"]["file_type"] == "md"
        assert piece["metadata"]["chunk_index"] == position
        assert "chunk_id" in piece["metadata"]
        assert len(piece["content"]) <= 500

    # The tail of the first chunk must reappear near the start of the second.
    # (At least two chunks are guaranteed by the length assertion above.)
    assert pieces[0]["content"][-100:] in pieces[1]["content"][:200]
def test_chunk_metadata_inheritance():
    """Test that document metadata is properly inherited by chunks.

    Chunks a 520-character text with a four-key metadata dict and checks that
    every chunk keeps all four original values and additionally exposes the
    chunk-specific ``chunk_index``, ``chunk_id``, ``start_pos`` and
    ``end_pos`` keys.
    """
    chunker = DocumentChunker(chunk_size=100, overlap=20)
    doc_metadata = {
        "filename": "test_policy.md",
        "file_type": "md",
        "file_size": 1500,
        "file_path": "/absolute/path/to/test_policy.md",
    }
    text = "Policy content goes here. " * 20  # 26 characters x 20 = 520

    chunks = chunker.chunk_document(text, doc_metadata)

    # Guard against a vacuous pass: with an empty result the loop below would
    # run zero times and the test would succeed without checking anything.
    assert len(chunks) > 0, "chunk_document returned no chunks for non-empty text"

    for chunk in chunks:
        # Original metadata should be preserved
        assert chunk["metadata"]["filename"] == "test_policy.md"
        assert chunk["metadata"]["file_type"] == "md"
        assert chunk["metadata"]["file_size"] == 1500
        expected_path = "/absolute/path/to/test_policy.md"
        assert chunk["metadata"]["file_path"] == expected_path

        # New chunk-specific metadata should be added
        assert "chunk_index" in chunk["metadata"]
        assert "chunk_id" in chunk["metadata"]
        assert "start_pos" in chunk["metadata"]
        assert "end_pos" in chunk["metadata"]