Skip to content

Schema

Document

Bases: Document

Base document class, mostly inherited from Document class from llama-index.

This class accepts one positional argument, content, of an arbitrary type, which stores the raw content of the document. If specified, the class uses content to initialize the base llama_index class.

Attributes:

Name Type Description
content Any

raw content of the document, can be anything

source Optional[str]

id of the source of the Document. Optional.

channel Optional[Literal['chat', 'info', 'index', 'debug', 'plot']]

the channel in which to show the document. Optional. Documented values: chat (show in chat message), info (show in information panel), index (show in index panel), debug (show in debug panel).

Source code in libs/kotaemon/kotaemon/base/schema.py
class Document(BaseDocument):
    """Core document type, mostly inherited from llama-index's Document class.

    The constructor takes one positional argument, `content`, of arbitrary
    type. It holds the raw payload of the document and, when given, is also
    used to initialize the fields of the base llama_index class.

    Attributes:
        content: raw content of the document, can be anything
        source: id of the source of the Document. Optional.
        channel: the channel in which to show the document. Optional. One of:
            - chat: show in chat message
            - info: show in information panel
            - index: show in index panel
            - debug: show in debug panel
            - plot: accepted by the type annotation (display target is not
              described in this file)
    """

    content: Any = None
    source: Optional[str] = None
    channel: Optional[Literal["chat", "info", "index", "debug", "plot"]] = None

    def __init__(self, content: Optional[Any] = None, *args, **kwargs):
        """Build the keyword arguments for the base class from `content`.

        Args:
            content: raw content; may be another `Document`, any other value,
                or None (in which case `text` / `embedding` keywords are used).
            *args: forwarded unchanged to the base class constructor.
            **kwargs: field values; they take precedence over the fields copied
                from `content` when `content` is a `Document`.
        """
        if isinstance(content, Document):
            # Copy-construct: start from the other document's fields and let
            # explicitly passed keywords override them.
            # TODO: simplify the Document class
            merged = content.dict()
            merged.update(kwargs)
            kwargs = merged
        elif content is not None:
            kwargs["content"] = content
            # Falsy content (e.g. "", 0, []) is represented by an empty text.
            kwargs["text"] = str(content) if content else ""
        elif kwargs.get("text") is not None:
            # No content given: fall back to the text as the raw content.
            kwargs["content"] = kwargs["text"]
        elif kwargs.get("embedding") is not None:
            kwargs["content"] = kwargs["embedding"]
            # default text indicating this document only contains embedding
            kwargs["text"] = "<EMBEDDING>"

        super().__init__(*args, **kwargs)

    def __bool__(self):
        """Return the truthiness of the raw content."""
        return bool(self.content)

    @classmethod
    def example(cls) -> "Document":
        """Return a sample `Document` built from `SAMPLE_TEXT`."""
        return Document(
            text=SAMPLE_TEXT,
            metadata={"filename": "README.md", "category": "codebase"},
        )

    def to_haystack_format(self) -> "HaystackDocument":
        """Convert struct to Haystack document format."""
        # Imported lazily so haystack is only needed when this is called.
        from haystack.schema import Document as HaystackDocument

        # Substitute an empty dict when the metadata is falsy.
        meta = self.metadata or {}
        return HaystackDocument(content=self.text, meta=meta)

    def __str__(self):
        """Return the string form of the raw content."""
        return str(self.content)

to_haystack_format

to_haystack_format()

Convert struct to Haystack document format.

Source code in libs/kotaemon/kotaemon/base/schema.py
def to_haystack_format(self) -> "HaystackDocument":
    """Convert struct to Haystack document format.

    Returns:
        A Haystack document whose `content` is this document's text and whose
        `meta` is this document's metadata (an empty dict when it is falsy).
    """
    # Imported lazily so haystack is only needed when this is called.
    from haystack.schema import Document as HaystackDocument

    meta = self.metadata or {}
    return HaystackDocument(content=self.text, meta=meta)

DocumentWithEmbedding

Bases: Document

Subclass of Document which must contain an embedding

Use this if you want to enforce that a component's inputs/outputs contain an embedding.

Source code in libs/kotaemon/kotaemon/base/schema.py
class DocumentWithEmbedding(Document):
    """Subclass of Document which must contain an embedding.

    Use this if you want to enforce that a component's inputs/outputs contain
    an embedding.
    """

    def __init__(self, embedding: list[float], *args, **kwargs):
        """Create the document with a mandatory `embedding`.

        Args:
            embedding: the embedding vector; overrides any `embedding` keyword.
            *args: forwarded unchanged to `Document.__init__`.
            **kwargs: forwarded to `Document.__init__` along with `embedding`.
        """
        # Pass the required embedding on as a regular keyword argument.
        kwargs.update(embedding=embedding)
        super().__init__(*args, **kwargs)

RetrievedDocument

Bases: Document

Subclass of Document with retrieval-related information

Attributes:

Name Type Description
score float

score of the document (from 0.0 to 1.0)

retrieval_metadata dict

metadata from the retrieval process, can be used by different components in a retrieval pipeline to communicate with each other

Source code in libs/kotaemon/kotaemon/base/schema.py
class RetrievedDocument(Document):
    """Subclass of Document with retrieval-related information.

    Attributes:
        score (float): score of the document (from 0.0 to 1.0); defaults to 0.0
        retrieval_metadata (dict): metadata from the retrieval process, can be
            used by different components in a retrieval pipeline to communicate
            with each other; defaults to an empty dict
    """

    # NOTE(review): the documented 0.0-1.0 range is not enforced here.
    score: float = Field(default=0.0)
    # NOTE(review): mutable default; presumably copied per instance by the
    # model library that provides `Field` - confirm.
    retrieval_metadata: dict = Field(default={})

ExtractorOutput

Bases: Document

Represents the output of an extractor.

Source code in libs/kotaemon/kotaemon/base/schema.py
class ExtractorOutput(Document):
    """
    Represents the output of an extractor.
    """

    # Required field (no default). Presumably the strings matched by the
    # extractor - confirm against the extractor implementations.
    matches: list[str]