Skip to content

Docstores

BaseDocumentStore

Bases: ABC

A document store is in charge of storing and managing documents

Source code in libs\kotaemon\kotaemon\storages\docstores\base.py
class BaseDocumentStore(ABC):
    """A document store is in charge of storing and managing documents

    This abstract base class only declares the interface: every method below
    is abstract and has to be implemented by a concrete document store.
    """

    @abstractmethod
    def __init__(self, *args, **kwargs):
        # Abstract: each concrete store defines its own constructor arguments.
        ...

    @abstractmethod
    def add(
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
        **kwargs,
    ):
        """Add document into document store

        Args:
            docs: Document or list of documents
            ids: Id or list of ids of the documents. Optional, if not set will
                use doc.doc_id
            **kwargs: implementation-specific options
        """
        ...

    @abstractmethod
    def get(self, ids: Union[List[str], str]) -> List[Document]:
        """Get documents by id

        Args:
            ids: a single document id or a list of document ids

        Returns:
            List of documents
        """
        ...

    @abstractmethod
    def get_all(self) -> List[Document]:
        """Get all documents

        Returns:
            List of documents
        """
        ...

    @abstractmethod
    def count(self) -> int:
        """Count number of documents

        Returns:
            Number of documents
        """
        ...

    @abstractmethod
    def delete(self, ids: Union[List[str], str]):
        """Delete documents by id

        Args:
            ids: a single document id or a list of document ids
        """
        ...

    @abstractmethod
    def drop(self):
        """Drop the document store"""
        ...

add abstractmethod

add(docs, ids=None, **kwargs)

Add document into document store

Parameters:

Name Type Description Default
docs Union[Document, List[Document]]

Document or list of documents

required
ids Optional[Union[List[str], str]]

List of ids of the documents. Optional, if not set will use doc.doc_id

None
Source code in libs\kotaemon\kotaemon\storages\docstores\base.py
@abstractmethod
def add(
    self,
    docs: Union[Document, List[Document]],
    ids: Optional[Union[List[str], str]] = None,
    **kwargs,
):
    """Add document into document store

    Args:
        docs: Document or list of documents
        ids: Id or list of ids of the documents. Optional, if not set will
            use doc.doc_id
        **kwargs: implementation-specific options
    """
    ...

get abstractmethod

get(ids)

Get document by id

Source code in libs\kotaemon\kotaemon\storages\docstores\base.py
@abstractmethod
def get(self, ids: Union[List[str], str]) -> List[Document]:
    """Get documents by id

    Args:
        ids: a single document id or a list of document ids

    Returns:
        List of documents
    """
    ...

get_all abstractmethod

get_all()

Get all documents

Source code in libs\kotaemon\kotaemon\storages\docstores\base.py
@abstractmethod
def get_all(self) -> List[Document]:
    """Get all documents

    Returns:
        List of documents
    """
    ...

count abstractmethod

count()

Count number of documents

Source code in libs\kotaemon\kotaemon\storages\docstores\base.py
@abstractmethod
def count(self) -> int:
    """Count number of documents

    Returns:
        Number of documents
    """
    ...

delete abstractmethod

delete(ids)

Delete document by id

Source code in libs\kotaemon\kotaemon\storages\docstores\base.py
@abstractmethod
def delete(self, ids: Union[List[str], str]):
    """Delete documents by id

    Args:
        ids: a single document id or a list of document ids
    """
    ...

drop abstractmethod

drop()

Drop the document store

Source code in libs\kotaemon\kotaemon\storages\docstores\base.py
@abstractmethod
def drop(self):
    """Drop the document store"""
    # Abstract: what "dropping" means is defined by each implementation.
    ...

ElasticsearchDocumentStore

Bases: BaseDocumentStore

Document store backed by an Elasticsearch index, searchable with BM25

Source code in libs\kotaemon\kotaemon\storages\docstores\elasticsearch.py
class ElasticsearchDocumentStore(BaseDocumentStore):
    """Document store backed by an Elasticsearch index, searchable with BM25

    Each document is indexed with its text in the `content` field and its
    metadata in the `metadata` field; the document id is used as the
    Elasticsearch `_id`.
    """

    def __init__(
        self,
        collection_name: str = "docstore",
        elasticsearch_url: str = "http://localhost:9200",
        k1: float = 2.0,
        b: float = 0.75,
        **kwargs,
    ):
        """Connect to Elasticsearch and create the index if it does not exist

        Args:
            collection_name: name of the Elasticsearch index to use
            elasticsearch_url: URL of the Elasticsearch server
            k1: `k1` parameter of the custom BM25 similarity
            b: `b` parameter of the custom BM25 similarity
            **kwargs: extra keyword arguments forwarded to the
                `Elasticsearch` client constructor

        Raises:
            ImportError: if the `elasticsearch` package is not installed
        """
        try:
            from elasticsearch import Elasticsearch
            from elasticsearch.helpers import bulk
        except ImportError as e:
            raise ImportError(
                "To use ElasticsearchDocumentStore please install "
                "`pip install elasticsearch`"
            ) from e

        self.elasticsearch_url = elasticsearch_url
        self.index_name = collection_name
        self.k1 = k1
        self.b = b

        # Create an Elasticsearch client instance
        self.client = Elasticsearch(elasticsearch_url, **kwargs)
        # Keep a handle on the bulk helper so `add` does not re-import it
        self.es_bulk = bulk
        # Define the index settings and mappings
        settings = {
            "analysis": {"analyzer": {"default": {"type": "standard"}}},
            "similarity": {
                "custom_bm25": {
                    "type": "BM25",
                    "k1": k1,
                    "b": b,
                }
            },
        }
        mappings = {
            "properties": {
                "content": {
                    "type": "text",
                    "similarity": "custom_bm25",  # Use the custom BM25 similarity
                }
            }
        }

        # Create the index with the specified settings and mappings
        if not self.client.indices.exists(index=self.index_name):
            self.client.indices.create(
                index=self.index_name, mappings=mappings, settings=settings
            )

    def add(
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
        refresh_indices: bool = True,
        **kwargs,
    ):
        """Add document into document store

        Args:
            docs: document or list of documents to add
            ids: specify the ids of documents to add or use existing doc.doc_id
            refresh_indices: request Elasticsearch to update its index (default to True)
        """
        if ids and not isinstance(ids, list):
            ids = [ids]
        if not isinstance(docs, list):
            docs = [docs]
        doc_ids = ids if ids else [doc.doc_id for doc in docs]

        # One bulk "index" action per document
        requests = [
            {
                "_op_type": "index",
                "_index": self.index_name,
                "content": doc.text,
                "metadata": doc.metadata,
                "_id": doc_id,
            }
            for doc_id, doc in zip(doc_ids, docs)
        ]
        self.es_bulk(self.client, requests)

        if refresh_indices:
            self.client.indices.refresh(index=self.index_name)

    def query_raw(self, query: dict) -> List[Document]:
        """Query Elasticsearch store using query format of ES client

        Args:
            query (dict): Elasticsearch query format

        Returns:
            List[Document]: List of result documents
        """
        res = self.client.search(index=self.index_name, body=query)
        return [
            Document(
                id_=r["_id"],
                text=r["_source"]["content"],
                metadata=r["_source"]["metadata"],
            )
            for r in res["hits"]["hits"]
        ]

    def query(
        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None
    ) -> List[Document]:
        """Search Elasticsearch docstore using search query (BM25)

        Args:
            query (str): query text
            top_k (int, optional): number of
                top documents to return. Defaults to 10.
            doc_ids (list, optional): if set and non-empty, restrict the
                search to documents having these ids.

        Returns:
            List[Document]: List of result documents
        """
        es_query: dict = {"match": {"content": query}}
        if doc_ids:
            # A `match` clause accepts a single field, so the id restriction
            # has to be a separate clause; `filter` leaves BM25 scores intact.
            es_query = {
                "bool": {
                    "must": [es_query],
                    "filter": [{"terms": {"_id": doc_ids}}],
                }
            }
        return self.query_raw({"query": es_query, "size": top_k})

    def get(self, ids: Union[List[str], str]) -> List[Document]:
        """Get document by id

        Args:
            ids: a single document id or a list of document ids

        Returns:
            List of documents found for the given ids
        """
        if not isinstance(ids, list):
            ids = [ids]
        # Elasticsearch returns only 10 hits by default; ask for as many hits
        # as there are requested ids so that no document is silently dropped.
        query_dict = {"query": {"terms": {"_id": ids}}, "size": len(ids)}
        return self.query_raw(query_dict)

    def count(self) -> int:
        """Count number of documents"""
        count = int(
            self.client.cat.count(index=self.index_name, format="json")[0]["count"]
        )
        return count

    def get_all(self) -> List[Document]:
        """Get all documents (at most MAX_DOCS_TO_GET of them)"""
        query_dict = {"query": {"match_all": {}}, "size": MAX_DOCS_TO_GET}
        return self.query_raw(query_dict)

    def delete(self, ids: Union[List[str], str]):
        """Delete document by id

        Args:
            ids: a single document id or a list of document ids
        """
        if not isinstance(ids, list):
            ids = [ids]

        query = {"query": {"terms": {"_id": ids}}}
        self.client.delete_by_query(index=self.index_name, body=query)
        self.client.indices.refresh(index=self.index_name)

    def drop(self):
        """Drop the document store by deleting its Elasticsearch index"""
        # No refresh afterwards: the index no longer exists, so refreshing it
        # would fail with an index-not-found error.
        self.client.indices.delete(index=self.index_name)

    def __persist_flow__(self):
        """Return the parameters describing this store for persistence"""
        # NOTE(review): the key is `index_name` while the constructor parameter
        # is `collection_name` -- confirm the consumer of this dict maps it.
        return {
            "index_name": self.index_name,
            "elasticsearch_url": self.elasticsearch_url,
            "k1": self.k1,
            "b": self.b,
        }

add

add(docs, ids=None, refresh_indices=True, **kwargs)

Add document into document store

Parameters:

Name Type Description Default
docs Union[Document, List[Document]]

list of documents to add

required
ids Optional[Union[List[str], str]]

specify the ids of documents to add or use existing doc.doc_id

None
refresh_indices bool

request Elasticsearch to update its index (default to True)

True
Source code in libs\kotaemon\kotaemon\storages\docstores\elasticsearch.py
def add(
    self,
    docs: Union[Document, List[Document]],
    ids: Optional[Union[List[str], str]] = None,
    refresh_indices: bool = True,
    **kwargs,
):
    """Index one or several documents into the Elasticsearch index

    Args:
        docs: document or list of documents to add
        ids: id or list of ids to store the documents under; when not set
            (or empty), each document's own doc.doc_id is used
        refresh_indices: request Elasticsearch to refresh the index after
            the bulk request (default to True)
    """
    # Normalise both arguments to lists
    if not isinstance(docs, list):
        docs = [docs]
    if ids and not isinstance(ids, list):
        ids = [ids]
    keys = ids or [item.doc_id for item in docs]

    # One bulk "index" action per (id, document) pair
    actions = [
        {
            "_op_type": "index",
            "_index": self.index_name,
            "content": item.text,
            "metadata": item.metadata,
            "_id": key,
        }
        for key, item in zip(keys, docs)
    ]
    self.es_bulk(self.client, actions)

    if refresh_indices:
        self.client.indices.refresh(index=self.index_name)

query_raw

query_raw(query)

Query Elasticsearch store using query format of ES client

Parameters:

Name Type Description Default
query dict

Elasticsearch query format

required

Returns:

Type Description
List[Document]

List[Document]: List of result documents

Source code in libs\kotaemon\kotaemon\storages\docstores\elasticsearch.py
def query_raw(self, query: dict) -> List[Document]:
    """Run a raw Elasticsearch query and convert the hits to documents

    Args:
        query (dict): request body in Elasticsearch query format

    Returns:
        List[Document]: one document per hit, built from the hit's `_id`
            and the `content` and `metadata` fields of its `_source`
    """
    response = self.client.search(index=self.index_name, body=query)
    return [
        Document(
            id_=hit["_id"],
            text=hit["_source"]["content"],
            metadata=hit["_source"]["metadata"],
        )
        for hit in response["hits"]["hits"]
    ]

query

query(query, top_k=10, doc_ids=None)

Search Elasticsearch docstore using search query (BM25)

Parameters:

Name Type Description Default
query str

query text

required
top_k int

number of top documents to return. Defaults to 10.

10

Returns:

Type Description
List[Document]

List[Document]: List of result documents

Source code in libs\kotaemon\kotaemon\storages\docstores\elasticsearch.py
def query(
    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None
) -> List[Document]:
    """Search Elasticsearch docstore using search query (BM25)

    Args:
        query (str): query text
        top_k (int, optional): number of
            top documents to return. Defaults to 10.
        doc_ids (list, optional): if set and non-empty, restrict the
            search to documents having these ids.

    Returns:
        List[Document]: List of result documents
    """
    es_query: dict = {"match": {"content": query}}
    if doc_ids:
        # A `match` clause accepts a single field, so the id restriction has
        # to be a separate clause; `filter` leaves the BM25 scores intact.
        es_query = {
            "bool": {
                "must": [es_query],
                "filter": [{"terms": {"_id": doc_ids}}],
            }
        }
    return self.query_raw({"query": es_query, "size": top_k})

get

get(ids)

Get document by id

Source code in libs\kotaemon\kotaemon\storages\docstores\elasticsearch.py
def get(self, ids: Union[List[str], str]) -> List[Document]:
    """Get document by id

    Args:
        ids: a single document id or a list of document ids

    Returns:
        List of documents found for the given ids
    """
    if not isinstance(ids, list):
        ids = [ids]
    # Elasticsearch returns only 10 hits by default; ask for as many hits as
    # there are requested ids so that no document is silently dropped.
    query_dict = {"query": {"terms": {"_id": ids}}, "size": len(ids)}
    return self.query_raw(query_dict)

count

count()

Count number of documents

Source code in libs\kotaemon\kotaemon\storages\docstores\elasticsearch.py
def count(self) -> int:
    """Return the document count reported by the cat count API for the index"""
    rows = self.client.cat.count(index=self.index_name, format="json")
    # The JSON response is a list of rows; the count is read from the first
    return int(rows[0]["count"])

get_all

get_all()

Get all documents

Source code in libs\kotaemon\kotaemon\storages\docstores\elasticsearch.py
def get_all(self) -> List[Document]:
    """Return the documents of the index, capped at MAX_DOCS_TO_GET hits"""
    return self.query_raw({"query": {"match_all": {}}, "size": MAX_DOCS_TO_GET})

delete

delete(ids)

Delete document by id

Source code in libs\kotaemon\kotaemon\storages\docstores\elasticsearch.py
def delete(self, ids: Union[List[str], str]):
    """Delete the documents having the given id(s), then refresh the index

    Args:
        ids: a single document id or a list of document ids
    """
    targets = ids if isinstance(ids, list) else [ids]
    self.client.delete_by_query(
        index=self.index_name,
        body={"query": {"terms": {"_id": targets}}},
    )
    self.client.indices.refresh(index=self.index_name)

drop

drop()

Drop the document store

Source code in libs\kotaemon\kotaemon\storages\docstores\elasticsearch.py
def drop(self):
    """Drop the document store by deleting its Elasticsearch index"""
    # No refresh afterwards: the index no longer exists, so refreshing it
    # would fail with an index-not-found error.
    self.client.indices.delete(index=self.index_name)

InMemoryDocumentStore

Bases: BaseDocumentStore

Simple in-memory document store that stores documents in a dictionary

Source code in libs\kotaemon\kotaemon\storages\docstores\in_memory.py
class InMemoryDocumentStore(BaseDocumentStore):
    """Simple in-memory document store that keeps documents in a dictionary"""

    def __init__(self):
        # Maps document id -> document
        self._store = dict()

    def add(
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
        **kwargs,
    ):
        """Store one or several documents

        Args:
            docs: document or list of documents to add
            ids: id or list of ids to store the documents under; when not set
                (or empty), each document's own doc.doc_id is used
            exist_ok: when False (the default), raise an error if a document
                id is already in the docstore; when True, overwrite it

        Raises:
            ValueError: if a document id already exists and exist_ok is False
        """
        exist_ok: bool = kwargs.pop("exist_ok", False)

        # Normalise both arguments to lists
        if not isinstance(docs, list):
            docs = [docs]
        if ids and not isinstance(ids, list):
            ids = [ids]
        keys = ids or [item.doc_id for item in docs]

        for key, item in zip(keys, docs):
            if key in self._store and not exist_ok:
                raise ValueError(f"Document with id {key} already exist")
            self._store[key] = item

    def get(self, ids: Union[List[str], str]) -> List[Document]:
        """Return the documents stored under the given id(s)"""
        wanted = ids if isinstance(ids, list) else [ids]
        found = []
        for doc_id in wanted:
            found.append(self._store[doc_id])
        return found

    def get_all(self) -> List[Document]:
        """Return every stored document"""
        return [*self._store.values()]

    def count(self) -> int:
        """Return how many documents are stored"""
        total = len(self._store)
        return total

    def delete(self, ids: Union[List[str], str]):
        """Remove the documents stored under the given id(s)"""
        targets = ids if isinstance(ids, list) else [ids]
        for doc_id in targets:
            del self._store[doc_id]

    def save(self, path: Union[str, Path]):
        """Write all documents to `path` as JSON"""
        payload = {}
        for doc_id, doc in self._store.items():
            payload[doc_id] = doc.to_dict()
        with open(path, "w") as fout:
            json.dump(payload, fout)

    def load(self, path: Union[str, Path]):
        """Replace the stored documents with the ones read from `path`"""
        with open(path) as fin:
            raw = json.load(fin)
        # TODO: save and load aren't lossless. A Document-subclass will lose
        # information. Need to edit the `to_dict` and `from_dict` methods in
        # the Document class.
        # For better query support, utilize SQLite as the default document store.
        # Also, for portability, use SQLAlchemy for document store.
        restored = {}
        for doc_id, payload in raw.items():
            restored[doc_id] = Document.from_dict(payload)
        self._store = restored

    def __persist_flow__(self):
        # Nothing to persist: the store takes no constructor parameters
        return dict()

    def drop(self):
        """Drop the document store by discarding every stored document"""
        self._store = dict()

add

add(docs, ids=None, **kwargs)

Add document into document store

Parameters:

Name Type Description Default
docs Union[Document, List[Document]]

list of documents to add

required
ids Optional[Union[List[str], str]]

specify the ids of documents to add or use existing doc.doc_id

None
exist_ok

if False (the default), raise an error when a duplicate doc-id is found in the docstore; if True, overwrite the existing document

False
Source code in libs\kotaemon\kotaemon\storages\docstores\in_memory.py
def add(
    self,
    docs: Union[Document, List[Document]],
    ids: Optional[Union[List[str], str]] = None,
    **kwargs,
):
    """Store one or several documents

    Args:
        docs: document or list of documents to add
        ids: id or list of ids to store the documents under; when not set
            (or empty), each document's own doc.doc_id is used
        exist_ok: when False (the default), raise an error if a document
            id is already in the docstore; when True, overwrite it

    Raises:
        ValueError: if a document id already exists and exist_ok is False
    """
    exist_ok: bool = kwargs.pop("exist_ok", False)

    # Normalise both arguments to lists
    if not isinstance(docs, list):
        docs = [docs]
    if ids and not isinstance(ids, list):
        ids = [ids]
    keys = ids or [item.doc_id for item in docs]

    for key, item in zip(keys, docs):
        if key in self._store and not exist_ok:
            raise ValueError(f"Document with id {key} already exist")
        self._store[key] = item

get

get(ids)

Get document by id

Source code in libs\kotaemon\kotaemon\storages\docstores\in_memory.py
def get(self, ids: Union[List[str], str]) -> List[Document]:
    """Return the documents stored under the given id(s)

    Args:
        ids: a single document id or a list of document ids
    """
    wanted = ids if isinstance(ids, list) else [ids]
    found = []
    for doc_id in wanted:
        found.append(self._store[doc_id])
    return found

get_all

get_all()

Get all documents

Source code in libs\kotaemon\kotaemon\storages\docstores\in_memory.py
def get_all(self) -> List[Document]:
    """Return every stored document as a new list"""
    return [*self._store.values()]

count

count()

Count number of documents

Source code in libs\kotaemon\kotaemon\storages\docstores\in_memory.py
def count(self) -> int:
    """Return how many documents are stored"""
    total = len(self._store)
    return total

delete

delete(ids)

Delete document by id

Source code in libs\kotaemon\kotaemon\storages\docstores\in_memory.py
def delete(self, ids: Union[List[str], str]):
    """Remove the documents stored under the given id(s)

    Args:
        ids: a single document id or a list of document ids
    """
    targets = ids if isinstance(ids, list) else [ids]
    for doc_id in targets:
        del self._store[doc_id]

save

save(path)

Save document to path

Source code in libs\kotaemon\kotaemon\storages\docstores\in_memory.py
def save(self, path: Union[str, Path]):
    """Write all documents to `path` as JSON

    Each document is serialised with its `to_dict` method and stored under
    its document id.
    """
    payload = {}
    for doc_id, doc in self._store.items():
        payload[doc_id] = doc.to_dict()
    with open(path, "w") as fout:
        json.dump(payload, fout)

load

load(path)

Load document store from path

Source code in libs\kotaemon\kotaemon\storages\docstores\in_memory.py
def load(self, path: Union[str, Path]):
    """Replace the stored documents with the ones read from the JSON at `path`"""
    with open(path) as fin:
        raw = json.load(fin)
    # TODO: save and load aren't lossless. A Document-subclass will lose
    # information. Need to edit the `to_dict` and `from_dict` methods in
    # the Document class.
    # For better query support, utilize SQLite as the default document store.
    # Also, for portability, use SQLAlchemy for document store.
    restored = {}
    for doc_id, payload in raw.items():
        restored[doc_id] = Document.from_dict(payload)
    self._store = restored

drop

drop()

Drop the document store

Source code in libs\kotaemon\kotaemon\storages\docstores\in_memory.py
def drop(self):
    """Drop the document store by discarding every stored document"""
    self._store = dict()

SimpleFileDocumentStore

Bases: InMemoryDocumentStore

Improve InMemoryDocumentStore by auto saving whenever the corpus is changed

Source code in libs\kotaemon\kotaemon\storages\docstores\simple_file.py
class SimpleFileDocumentStore(InMemoryDocumentStore):
    """Improve InMemoryDocumentStore by auto saving whenever the corpus is changed

    Documents are kept in memory and mirrored to the JSON file
    `<path>/<collection_name>.json`.
    """

    def __init__(self, path: Union[str, Path], collection_name: str = "default"):
        """Create the storage directory and load the collection if it exists

        Args:
            path: directory holding the collection files (created if missing)
            collection_name: name of the collection, used as the file stem
        """
        super().__init__()
        self._path = path
        self._collection_name = collection_name

        Path(path).mkdir(parents=True, exist_ok=True)
        self._save_path = Path(path) / f"{collection_name}.json"
        if self._save_path.is_file():
            self.load(self._save_path)

    def get(self, ids: Union[List[str], str]) -> List[Document]:
        """Get document by id

        Args:
            ids: a single document id or a list of document ids

        Returns:
            List of documents, in the order of the requested ids

        Raises:
            KeyError: if an id is still unknown after reloading from disk
        """
        if not isinstance(ids, list):
            ids = [ids]

        # On a miss, reload from disk once (presumably the file may have been
        # updated by another store instance -- verify against callers). Only
        # reload when the file exists, so that a missing id is reported as
        # KeyError rather than FileNotFoundError (e.g. after `drop`).
        if any(doc_id not in self._store for doc_id in ids):
            if self._save_path.is_file():
                self.load(self._save_path)

        return [self._store[doc_id] for doc_id in ids]

    def add(
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
        **kwargs,
    ):
        """Add document into document store and save the store to disk

        Args:
            docs: document or list of documents to add
            ids: specify the ids of documents to add or
                use existing doc.doc_id
            exist_ok: if False (the default), raise an error when a duplicate
                doc-id is found in the docstore
        """
        super().add(docs=docs, ids=ids, **kwargs)
        self.save(self._save_path)

    def delete(self, ids: Union[List[str], str]):
        """Delete document by id and save the store to disk"""
        super().delete(ids=ids)
        self.save(self._save_path)

    def drop(self):
        """Drop the document store and remove its file from disk"""
        super().drop()
        self._save_path.unlink(missing_ok=True)

    def __persist_flow__(self):
        """Return the constructor parameters describing this store"""
        from theflow.utils.modules import serialize

        return {
            "path": serialize(self._path),
            "collection_name": self._collection_name,
        }

get

get(ids)

Get document by id

Source code in libs\kotaemon\kotaemon\storages\docstores\simple_file.py
def get(self, ids: Union[List[str], str]) -> List[Document]:
    """Get document by id

    Args:
        ids: a single document id or a list of document ids

    Returns:
        List of documents, in the order of the requested ids

    Raises:
        KeyError: if an id is still unknown after reloading from disk
    """
    if not isinstance(ids, list):
        ids = [ids]

    # On a miss, reload from disk once (presumably the file may have been
    # updated by another store instance -- verify against callers). Only
    # reload when the file exists, so that a missing id is reported as
    # KeyError rather than FileNotFoundError (e.g. after `drop`).
    if any(doc_id not in self._store for doc_id in ids):
        if self._save_path.is_file():
            self.load(self._save_path)

    return [self._store[doc_id] for doc_id in ids]

add

add(docs, ids=None, **kwargs)

Add document into document store

Parameters:

Name Type Description Default
docs Union[Document, List[Document]]

list of documents to add

required
ids Optional[Union[List[str], str]]

specify the ids of documents to add or use existing doc.doc_id

None
exist_ok

if False (the default), raise an error when a duplicate doc-id is found in the docstore; if True, overwrite the existing document

False
Source code in libs\kotaemon\kotaemon\storages\docstores\simple_file.py
def add(
    self,
    docs: Union[Document, List[Document]],
    ids: Optional[Union[List[str], str]] = None,
    **kwargs,
):
    """Add document into document store, then save the store to disk

    Args:
        docs: document or list of documents to add
        ids: specify the ids of documents to add or
            use existing doc.doc_id
        exist_ok: forwarded to the parent `add` through **kwargs
    """
    # Delegate the in-memory update to the parent class ...
    super().add(docs=docs, ids=ids, **kwargs)
    # ... then write the store to its save path.
    self.save(self._save_path)

delete

delete(ids)

Delete document by id

Source code in libs\kotaemon\kotaemon\storages\docstores\simple_file.py
def delete(self, ids: Union[List[str], str]):
    """Delete document by id, then save the store to disk

    Args:
        ids: a single document id or a list of document ids
    """
    # Delegate the in-memory removal to the parent class ...
    super().delete(ids=ids)
    # ... then write the store to its save path.
    self.save(self._save_path)

drop

drop()

Drop the document store

Source code in libs\kotaemon\kotaemon\storages\docstores\simple_file.py
def drop(self):
    """Drop the document store and remove its save file"""
    super().drop()
    # missing_ok=True: do not fail when the save file does not exist
    self._save_path.unlink(missing_ok=True)