| import re | |
| from typing import List | |
| from base_chunker import BaseChunker | |
| class Chunker(BaseChunker): | |
| def __init__(self, chunk_size: int = 200, overlap: int = 500): | |
| if overlap >= chunk_size: | |
| raise ValueError("overlap must be smaller than chunk size") | |
| self.chunk_size = chunk_size | |
| self.overlap = overlap | |
| def split_text(self, text: str) -> List[str]: | |
| tokens = text.strip().split() | |
| if not tokens: | |
| return [] | |
| chunks: List[str] = [] | |
| start = 0 | |
| while start < len(tokens): | |
| end = min(start + self.chunk_size, len(tokens)) | |
| chunk = " ".join(tokens[start:end]) | |
| chunks.append(chunk) | |
| start += self.chunk_size - self.overlap | |
| return chunks | |