Skip to content

Chunkers

Bases: Protocol

Splits a Document into a list of Chunks.

Contract — chunk.content is the exact text that will be embedded.

Implementations that prepend contextual information (e.g. heading hierarchy in a MarkdownChunker, code-block fences in a CodeChunker) MUST include that context in chunk.content, not only in chunk.metadata. The embedding cache keys off (model_id, sha256(chunk.content)); two chunks with the same body but different context would collide and return the wrong vector.

The companion chunk.content_hash is sha256(chunk.content) and is set by the implementation. Callers must not mutate chunk.content after the chunker returns.

Source code in src/cenote/chunkers/base.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
class Chunker(Protocol):
    """Splits a Document into a list of Chunks.

    This is a structural interface: any object with a compatible `chunk`
    method satisfies it.

    Contract — `chunk.content` is the exact text that will be embedded.

    Implementations that prepend contextual information (e.g. heading hierarchy
    in a MarkdownChunker, code-block fences in a CodeChunker) MUST include that
    context in `chunk.content`, not only in `chunk.metadata`. The embedding
    cache keys off `(model_id, sha256(chunk.content))`; if the context lived
    only in metadata, two chunks with the same body but different context
    would share a cache key, collide, and return the wrong vector.

    The companion `chunk.content_hash` is `sha256(chunk.content)` and is set
    by the implementation. Callers must not mutate `chunk.content` after the
    chunker returns, since that would leave `content_hash` stale.
    """

    def chunk(self, document: Document) -> list[Chunk]:
        """Return the document split into ordered Chunks.

        Args:
            document: The Document to split.

        Returns:
            The Chunks produced from `document`, in order. Each chunk's
            `content` must follow the class-level contract above.
        """
        ...

chunk(document: Document) -> list[Chunk]

Return the document split into ordered Chunks.

Source code in src/cenote/chunkers/base.py
27
28
29
def chunk(self, document: Document) -> list[Chunk]:
    """Return the document split into ordered Chunks.

    Args:
        document: The Document to split.

    Returns:
        The Chunks produced from `document`, as a list, in order.
    """
    ...

Recursively splits a Document using separators in priority order.

Algorithm:

1. Try to split the text on the highest-priority separator.
2. For each resulting piece, if it fits under chunk_size, keep it.
3. Otherwise, recurse on it with the next separator.
4. After all pieces fit, glue adjacent pieces back together up to chunk_size, producing the final chunk list with chunk_overlap characters of overlap between consecutive chunks.

Source code in src/cenote/chunkers/recursive.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
class RecursiveCharacterChunker:
    """Recursively splits a Document using separators in priority order.

    Algorithm:
    1. Try to split the text on the highest-priority separator.
    2. For each resulting piece, if it fits under `chunk_size`, keep it.
    3. Otherwise, recurse on it with the next separator.
    4. After all pieces fit, glue adjacent pieces back together up to
       `chunk_size`, producing the final chunk list with up to `chunk_overlap`
       characters of overlap between consecutive chunks.

    All sizes are measured in characters (`len()` of the text). The overlap
    carried into a chunk is shortened when necessary so that no emitted chunk
    is longer than `chunk_size`.
    """

    def __init__(
        self,
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        separators: tuple[str, ...] = DEFAULT_SEPARATORS,
    ) -> None:
        """Validate and store the chunking configuration.

        Args:
            chunk_size: Maximum length of a chunk, in characters. Must be
                positive.
            chunk_overlap: Number of trailing characters of one chunk to
                repeat at the start of the next. Must satisfy
                `0 <= chunk_overlap < chunk_size`.
            separators: Separator strings tried in priority order. Must be
                non-empty; empty-string entries are skipped when splitting.

        Raises:
            ConfigurationError: If any of the constraints above is violated.
        """
        if chunk_size <= 0:
            raise ConfigurationError("chunk_size must be positive")
        if chunk_overlap < 0 or chunk_overlap >= chunk_size:
            raise ConfigurationError("chunk_overlap must be in [0, chunk_size)")
        if not separators:
            raise ConfigurationError("separators must be non-empty")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators = separators

    def chunk(self, document: Document) -> list[Chunk]:
        """Return the document split into ordered Chunks.

        Args:
            document: The Document to split. A document with empty content
                yields no chunks.

        Returns:
            One Chunk per glued text span, with `position` numbered from 0.
            Each chunk gets its own deep copy of `document.metadata`, and
            `content_hash` is the SHA-256 hex digest of the chunk text.
        """
        if not document.content:
            logger.debug("Empty document %s, returning no chunks", document.id)
            return []
        pieces = self._split_text(document.content, list(self.separators))
        glued = self._glue(pieces)
        logger.debug("Chunked document %s into %d chunks", document.id, len(glued))
        return [
            Chunk(
                id=Chunk.make_id(document.id, i),
                document_id=document.id,
                content=text,
                position=i,
                # Deep copy so chunks never share mutable metadata with the
                # document or with each other.
                metadata=deepcopy(document.metadata),
                content_hash=hashlib.sha256(text.encode()).hexdigest(),
            )
            for i, text in enumerate(glued)
        ]

    def _split_text(self, text: str, separators: list[str]) -> list[str]:
        """Break `text` into pieces that are each at most `chunk_size` long.

        Args:
            text: The text to split.
            separators: Remaining separators to try, highest priority first.

        Returns:
            Non-empty pieces in document order. Each separator stays attached
            to the end of the piece that preceded it, so concatenating the
            pieces reproduces `text`.
        """
        if len(text) <= self.chunk_size:
            return [text]
        if not separators:
            # Fall back to hard slice — keeps pieces under chunk_size.
            return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size)]
        sep = separators[0]
        remaining = separators[1:]
        if sep == "":
            # str.split("") raises ValueError; skip to the next separator.
            return self._split_text(text, remaining)
        parts = text.split(sep)
        result: list[str] = []
        for idx, part in enumerate(parts):
            # Re-attach the separator to every part except the last so that
            # no characters are lost.
            piece = part + (sep if idx < len(parts) - 1 else "")
            if len(piece) <= self.chunk_size:
                result.append(piece)
            else:
                result.extend(self._split_text(piece, remaining))
        return [p for p in result if p]

    def _glue(self, pieces: list[str]) -> list[str]:
        """Merge adjacent pieces into chunks of at most `chunk_size` characters.

        Args:
            pieces: Text pieces in document order, each expected to be at
                most `chunk_size` long (as produced by `_split_text`).

        Returns:
            The final chunk texts. Each chunk after the first starts with up
            to `chunk_overlap` trailing characters of the previous chunk.
        """
        chunks: list[str] = []
        current = ""
        for piece in pieces:
            if not current:
                current = piece
                continue
            if len(current) + len(piece) <= self.chunk_size:
                current += piece
                continue
            chunks.append(current)
            # Cap the carried-over tail at the room left beside the incoming
            # piece; otherwise `tail + piece` could be as long as
            # chunk_size + chunk_overlap and break the size limit.
            room = max(self.chunk_size - len(piece), 0)
            overlap = min(self.chunk_overlap, room)
            # Guard against overlap == 0: current[-0:] is the whole string.
            tail = current[-overlap:] if overlap else ""
            current = tail + piece
        if current:
            chunks.append(current)
        return chunks

chunk(document: Document) -> list[Chunk]

Return the document split into ordered Chunks.

Source code in src/cenote/chunkers/recursive.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def chunk(self, document: Document) -> list[Chunk]:
    """Split `document` into Chunks numbered in document order.

    Args:
        document: The Document to split. Empty content yields an empty list.

    Returns:
        One Chunk per glued text span. `position` counts up from 0, each
        chunk receives its own deep copy of `document.metadata`, and
        `content_hash` is the SHA-256 hex digest of the UTF-8 encoded text.
    """
    if not document.content:
        logger.debug("Empty document %s, returning no chunks", document.id)
        return []

    # Split down to size-bounded fragments, then merge neighbours back
    # together into the final chunk texts.
    fragments = self._split_text(document.content, list(self.separators))
    bodies = self._glue(fragments)
    logger.debug("Chunked document %s into %d chunks", document.id, len(bodies))

    built: list[Chunk] = []
    for position, body in enumerate(bodies):
        built.append(
            Chunk(
                id=Chunk.make_id(document.id, position),
                document_id=document.id,
                content=body,
                position=position,
                metadata=deepcopy(document.metadata),
                content_hash=hashlib.sha256(body.encode("utf-8")).hexdigest(),
            )
        )
    return built