Spaces:
Sleeping
Sleeping
File size: 4,596 Bytes
ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 89aa2b4 ffa0f3d 89aa2b4 ffa0f3d 89aa2b4 ffa0f3d 89aa2b4 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 89aa2b4 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
from src.ingestion.document_chunker import DocumentChunker
def test_chunk_by_characters():
    """Character-based chunking yields several size-bounded, overlapping chunks."""
    splitter = DocumentChunker(chunk_size=50, overlap=10)
    # Ten repetitions of a 25-character sentence: 250 characters in total
    sample = "This is a test document. " * 10
    pieces = splitter.chunk_text(sample)

    # The sample is much longer than one chunk, so more than one is expected
    assert len(pieces) > 1

    # No chunk may be longer than the configured chunk size
    for piece in pieces:
        assert len(piece["content"]) <= 50

    # The end of the first chunk should show up near the start of the second
    if len(pieces) > 1:
        tail_of_first = pieces[0]["content"][-10:]
        head_of_second = pieces[1]["content"][:20]
        assert tail_of_first in head_of_second
def test_chunk_with_metadata():
    """Test that chunks preserve document metadata.

    Chunks a 600-character text together with a metadata dict and checks
    that every resulting chunk carries the document's ``filename`` and
    ``file_type`` values and has ``chunk_id`` and ``chunk_index`` entries.
    """
    chunker = DocumentChunker(chunk_size=100, overlap=20)
    doc_metadata = {"filename": "test.txt", "file_type": "txt", "source_id": "doc_001"}
    text = "Content that will be chunked. " * 20
    chunks = chunker.chunk_document(text, doc_metadata)

    # Without this guard the loop below would pass vacuously if no chunks
    # were produced, hiding a chunker that silently drops its input.
    assert chunks, "chunk_document returned no chunks for non-empty text"

    for chunk in chunks:
        metadata = chunk["metadata"]
        # Document-level fields are carried over unchanged
        assert metadata["filename"] == "test.txt"
        assert metadata["file_type"] == "txt"
        # Chunk-level fields are present on every chunk
        assert "chunk_id" in metadata
        assert "chunk_index" in metadata
def test_reproducible_chunking():
    """Two chunkers configured identically (same seed) produce the same chunks."""
    # Both chunkers are built before either one is used
    first_chunker = DocumentChunker(chunk_size=100, overlap=20, seed=42)
    second_chunker = DocumentChunker(chunk_size=100, overlap=20, seed=42)

    sample = "This text will be chunked reproducibly. " * 30
    first_run = first_chunker.chunk_text(sample)
    second_run = second_chunker.chunk_text(sample)

    # Same number of chunks, and pairwise identical content
    assert len(first_run) == len(second_run)
    assert all(
        left["content"] == right["content"]
        for left, right in zip(first_run, second_run)
    )
def test_empty_text_chunking():
    """Empty text gives no chunks; text shorter than a chunk gives exactly one."""
    splitter = DocumentChunker(chunk_size=100, overlap=20)

    # An empty string leaves nothing to chunk
    from_empty = splitter.chunk_text("")
    assert len(from_empty) == 0

    # Text well under the chunk size comes back as a single unmodified chunk
    from_short = splitter.chunk_text("Short")
    assert len(from_short) == 1
    assert from_short[0]["content"] == "Short"
def test_chunk_real_policy_content():
    """Chunk policy-style markdown and check chunk size, metadata and overlap."""
    splitter = DocumentChunker(chunk_size=500, overlap=100, seed=42)

    # Markdown resembling the project's policy documents
    handbook_header = """# HR-POL-001: Employee Handbook
**Effective Date:** 2025-01-01
**Revision:** 1.1
**Owner:** Human Resources
## 1. Introduction
### 1.1. A Message from Our CEO
Welcome to Innovate Inc.! We are thrilled to have you as part of our team."""
    conduct_section = """
## 2. Company Policies
### 2.1. Code of Conduct
All employees must adhere to our code of conduct."""
    # Repeat the text so it is long enough to span several chunks
    handbook_text = (handbook_header + conduct_section) * 3

    source_info = {
        "filename": "employee_handbook.md",
        "file_type": "md",
        "file_path": "/path/to/employee_handbook.md",
    }
    pieces = splitter.chunk_document(handbook_text, source_info)

    # The document must have been split into more than one chunk
    assert len(pieces) > 1

    # Each chunk keeps the document metadata, is indexed in order,
    # and stays within the configured size
    for position, piece in enumerate(pieces):
        meta = piece["metadata"]
        assert meta["filename"] == "employee_handbook.md"
        assert meta["file_type"] == "md"
        assert meta["chunk_index"] == position
        assert "chunk_id" in meta
        assert len(piece["content"]) <= 500

    # The end of the first chunk should reappear near the start of the second
    if len(pieces) > 1:
        trailing = pieces[0]["content"][-100:]
        leading = pieces[1]["content"][:200]
        assert trailing in leading
def test_chunk_metadata_inheritance():
    """Test that document metadata is properly inherited by chunks.

    Chunks a 520-character text together with a metadata dict and checks
    that every chunk keeps all four document-level fields unchanged and
    additionally has ``chunk_index``, ``chunk_id``, ``start_pos`` and
    ``end_pos`` entries.
    """
    chunker = DocumentChunker(chunk_size=100, overlap=20)
    doc_metadata = {
        "filename": "test_policy.md",
        "file_type": "md",
        "file_size": 1500,
        "file_path": "/absolute/path/to/test_policy.md",
    }
    # Loop-invariant expectation, defined once instead of on every iteration
    expected_path = "/absolute/path/to/test_policy.md"

    text = "Policy content goes here. " * 20
    chunks = chunker.chunk_document(text, doc_metadata)

    # Without this guard the loop below would pass vacuously if no chunks
    # were produced, hiding a chunker that silently drops its input.
    assert chunks, "chunk_document returned no chunks for non-empty text"

    for chunk in chunks:
        metadata = chunk["metadata"]

        # Original metadata should be preserved
        assert metadata["filename"] == "test_policy.md"
        assert metadata["file_type"] == "md"
        assert metadata["file_size"] == 1500
        assert metadata["file_path"] == expected_path

        # New chunk-specific metadata should be added
        for key in ("chunk_index", "chunk_id", "start_pos", "end_pos"):
            assert key in metadata
|