Skip to content

Vectorstores

BaseVectorStore

Bases: ABC

Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
class BaseVectorStore(ABC):
    """Abstract interface for stores that hold vector embeddings.

    Concrete stores must implement ``add``, ``delete``, ``query`` and
    ``drop`` (and their own ``__init__``).
    """

    @abstractmethod
    def __init__(self, *args, **kwargs):
        """Set up the store; the accepted arguments are implementation-defined."""
        ...

    @abstractmethod
    def add(
        self,
        embeddings: list[list[float]] | list[DocumentWithEmbedding],
        metadatas: Optional[list[dict]] = None,
        ids: Optional[list[str]] = None,
    ) -> list[str]:
        """Add vector embeddings to the vector store.

        Args:
            embeddings: Either raw embedding vectors or
                ``DocumentWithEmbedding`` objects
            metadatas: Optional list of metadata dicts of the embeddings
            ids: Optional list of ids of the embeddings

        Returns:
            List of ids of the embeddings
        """
        ...

    @abstractmethod
    def delete(self, ids: list[str], **kwargs):
        """Delete vector embeddings from the vector store.

        Args:
            ids: List of ids of the embeddings to be deleted
            kwargs: meant for vectorstore-specific parameters
        """
        ...

    @abstractmethod
    def query(
        self,
        embedding: list[float],
        top_k: int = 1,
        ids: Optional[list[str]] = None,
        **kwargs,
    ) -> tuple[list[list[float]], list[float], list[str]]:
        """Return the top k most similar vector embeddings.

        Args:
            embedding: The query embedding (a single vector)
            top_k: Number of most similar embeddings to return
            ids: List of ids of the embeddings to be queried
            kwargs: meant for vectorstore-specific parameters

        Returns:
            A 3-tuple: the matched embeddings, the similarity scores, and
            the ids
        """
        ...

    @abstractmethod
    def drop(self):
        """Drop the vector store"""
        ...

add abstractmethod

add(embeddings, metadatas=None, ids=None)

Add vector embeddings to vector stores

Parameters:

Name Type Description Default
embeddings list[list[float]] | list[DocumentWithEmbedding]

List of embeddings

required
metadatas Optional[list[dict]]

List of metadata of the embeddings

None
ids Optional[list[str]]

List of ids of the embeddings

None
kwargs

Meant for vectorstore-specific parameters. Note that the `add` signature shown above does not currently accept `**kwargs`.

required

Returns:

Type Description
list[str]

List of ids of the embeddings

Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod
def add(
    self,
    embeddings: list[list[float]] | list[DocumentWithEmbedding],
    metadatas: Optional[list[dict]] = None,
    ids: Optional[list[str]] = None,
) -> list[str]:
    """Add vector embeddings to the vector store.

    Args:
        embeddings: Either raw embedding vectors or
            ``DocumentWithEmbedding`` objects
        metadatas: Optional list of metadata dicts of the embeddings
        ids: Optional list of ids of the embeddings

    Returns:
        List of ids of the embeddings
    """
    ...

delete abstractmethod

delete(ids, **kwargs)

Delete vector embeddings from vector stores

Parameters:

Name Type Description Default
ids list[str]

List of ids of the embeddings to be deleted

required
kwargs

meant for vectorstore-specific parameters

{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod
def delete(self, ids: list[str], **kwargs):
    """Delete vector embeddings from the vector store.

    Args:
        ids: List of ids of the embeddings to be deleted
        kwargs: meant for vectorstore-specific parameters
    """
    ...

query abstractmethod

query(embedding, top_k=1, ids=None, **kwargs)

Return the top k most similar vector embeddings

Parameters:

Name Type Description Default
embedding list[float]

The query embedding (a single vector)

required
top_k int

Number of most similar embeddings to return

1
ids Optional[list[str]]

List of ids of the embeddings to be queried

None

Returns:

Type Description
tuple[list[list[float]], list[float], list[str]]

the matched embeddings, the similarity scores, and the ids

Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod
def query(
    self,
    embedding: list[float],
    top_k: int = 1,
    ids: Optional[list[str]] = None,
    **kwargs,
) -> tuple[list[list[float]], list[float], list[str]]:
    """Return the top k most similar vector embeddings.

    Args:
        embedding: The query embedding (a single vector)
        top_k: Number of most similar embeddings to return
        ids: List of ids of the embeddings to be queried
        kwargs: meant for vectorstore-specific parameters

    Returns:
        A 3-tuple: the matched embeddings, the similarity scores, and
        the ids
    """
    ...

drop abstractmethod

drop()

Drop the vector store

Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod
def drop(self):
    """Drop the vector store.

    Abstract: each concrete store defines what dropping means for it.
    """
    ...

ChromaVectorStore

Bases: LlamaIndexVectorStore

Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
class ChromaVectorStore(LlamaIndexVectorStore):
    """Vector store backed by a Chroma collection persisted under ``path``."""

    _li_class: Type[LIChromaVectorStore] = LIChromaVectorStore

    def __init__(
        self,
        path: str = "./chroma",
        collection_name: str = "default",
        host: str = "localhost",
        port: str = "8000",
        ssl: bool = False,
        headers: Optional[Dict[str, str]] = None,
        collection_kwargs: Optional[dict] = None,
        stores_text: bool = True,
        flat_metadata: bool = True,
        **kwargs: Any,
    ):
        """Open (or create) the collection and build the wrapped client.

        Args:
            path: directory handed to ``chromadb.PersistentClient``
            collection_name: collection to get or create
            host: forwarded to the parent initializer
            port: forwarded to the parent initializer
            ssl: forwarded to the parent initializer
            headers: forwarded to the parent initializer (``{}`` when None)
            collection_kwargs: forwarded to the parent initializer
                (``{}`` when None)
            stores_text: forwarded to the parent initializer
            flat_metadata: forwarded to the parent initializer
            kwargs: extra parameters forwarded to the parent initializer

        Raises:
            ImportError: if ``chromadb`` is not installed
        """
        # Keep the constructor arguments so __persist_flow__ can replay them.
        self._path = path
        self._collection_name = collection_name
        self._host = host
        self._port = port
        self._ssl = ssl
        self._headers = headers
        self._collection_kwargs = collection_kwargs
        self._stores_text = stores_text
        self._flat_metadata = flat_metadata
        self._kwargs = kwargs

        try:
            import chromadb
        except ImportError as exc:
            raise ImportError(
                "ChromaVectorStore requires chromadb. "
                "Please install chromadb first `pip install chromadb`"
            ) from exc

        client = chromadb.PersistentClient(path=path)
        collection = client.get_or_create_collection(collection_name)

        # pass through for nice IDE support
        super().__init__(
            chroma_collection=collection,
            host=host,
            port=port,
            ssl=ssl,
            headers=headers or {},
            collection_kwargs=collection_kwargs or {},
            stores_text=stores_text,
            flat_metadata=flat_metadata,
            **kwargs,
        )
        self._client = cast(LIChromaVectorStore, self._client)

    def delete(self, ids: List[str], **kwargs):
        """Delete vector embeddings from vector stores

        Args:
            ids: List of ids of the embeddings to be deleted
            kwargs: meant for vectorstore-specific parameters (not used here)
        """
        self._client.client.delete(ids=ids)

    def drop(self):
        """Delete entire collection from vector stores"""
        self._client.client._client.delete_collection(self._client.client.name)

    def count(self) -> int:
        """Return the number of items in the collection."""
        # This class never assigns `self._collection`; reach the collection
        # the same way delete() and drop() do.
        return self._client.client.count()

    def __persist_flow__(self):
        """Return the constructor arguments needed to rebuild this store."""
        return {
            "path": self._path,
            "collection_name": self._collection_name,
            "host": self._host,
            "port": self._port,
            "ssl": self._ssl,
            "headers": self._headers,
            "collection_kwargs": self._collection_kwargs,
            "stores_text": self._stores_text,
            "flat_metadata": self._flat_metadata,
            **self._kwargs,
        }

delete

delete(ids, **kwargs)

Delete vector embeddings from vector stores

Parameters:

Name Type Description Default
ids List[str]

List of ids of the embeddings to be deleted

required
kwargs

meant for vectorstore-specific parameters

{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
def delete(self, ids: List[str], **kwargs):
    """Remove the embeddings with the given ids from the collection.

    Args:
        ids: ids of the embeddings to remove
        kwargs: meant for vectorstore-specific parameters (not used here)
    """
    collection = self._client.client
    collection.delete(ids=ids)

drop

drop()

Delete entire collection from vector stores

Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
def drop(self):
    """Remove the whole collection from the vector store."""
    collection = self._client.client
    collection._client.delete_collection(collection.name)

InMemoryVectorStore

Bases: LlamaIndexVectorStore

Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
class InMemoryVectorStore(LlamaIndexVectorStore):
    """Vector store that keeps its data in memory, with explicit save/load."""

    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore
    store_text: bool = False

    def __init__(
        self,
        data: Optional[SimpleVectorStoreData] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize params.

        Args:
            data: existing store data; a new ``SimpleVectorStoreData`` is
                created when omitted
            fs: filesystem to use; defaults to ``fsspec.filesystem("file")``
            kwargs: extra parameters forwarded to the parent initializer
        """
        self._data = data or SimpleVectorStoreData()
        self._fs = fs or fsspec.filesystem("file")
        # Kept so drop() can rebuild the client with the same options.
        self._kwargs = kwargs

        # Pass the resolved objects rather than the raw (possibly None)
        # arguments, so the wrapped client and `self._data` are one object
        # and __persist_flow__ serialises what the client actually holds.
        super().__init__(
            data=self._data,
            fs=self._fs,
            **kwargs,
        )

    def save(
        self,
        save_path: str,
        fs: Optional[fsspec.AbstractFileSystem] = None,
        **kwargs,
    ):
        """Persist the vector store to ``save_path``.

        Args:
            save_path: Path of saving vector to disk.
            fs: An abstract super-class for pythonic file-systems
            kwargs: accepted but not used here
        """
        self._client.persist(persist_path=save_path, fs=fs)

    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):
        """Replace the wrapped client with one loaded from ``load_path``.

        Args:
            load_path: Path of loading vector.
            fs: An abstract super-class for pythonic file-systems
        """
        # NOTE(review): `self._data` is not refreshed from the loaded client,
        # so __persist_flow__ may not reflect loaded content - confirm.
        self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)

    def drop(self):
        """Clear the old data"""
        self._data = SimpleVectorStoreData()
        # Rebinding `_data` alone leaves the wrapped client holding the old
        # vectors; rebuild the client around the new data object as well.
        self._client = self._li_class(data=self._data, fs=self._fs, **self._kwargs)

    def __persist_flow__(self):
        """Return the serialised data needed to rebuild this store."""
        d = self._data.to_dict()
        d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}"
        return {
            "data": d,
            # "fs": self._fs,
        }

save

save(save_path, fs=None, **kwargs)

Persist the vector store to `save_path`.

Parameters:

Name Type Description Default
save_path str

Path of saving vector to disk.

required
fs Optional[AbstractFileSystem]

An abstract super-class for pythonic file-systems

None
Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def save(
    self,
    save_path: str,
    fs: Optional[fsspec.AbstractFileSystem] = None,
    **kwargs,
):
    """Persist the wrapped vector store to ``save_path``.

    Args:
        save_path: Path of saving vector to disk.
        fs: An abstract super-class for pythonic file-systems
        kwargs: accepted but not used here
    """
    client = self._client
    client.persist(persist_path=save_path, fs=fs)

load

load(load_path, fs=None)

Replace the current vector store client with one loaded from `load_path`.

Parameters:

Name Type Description Default
load_path str

Path of loading vector.

required
fs Optional[AbstractFileSystem]

An abstract super-class for pythonic file-systems

None
Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):
    """Swap the wrapped client for one restored from ``load_path``.

    Args:
        load_path: Path of loading vector.
        fs: An abstract super-class for pythonic file-systems
    """
    restored = self._client.from_persist_path(persist_path=load_path, fs=fs)
    self._client = restored

drop

drop()

Clear the old data

Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def drop(self):
    """Clear the old data"""
    # Rebinds this wrapper's `_data` to a newly constructed
    # SimpleVectorStoreData.
    # NOTE(review): nothing here touches `self._client`; confirm whether the
    # wrapped client still holds the previously added vectors after this call.
    self._data = SimpleVectorStoreData()

LanceDBVectorStore

Bases: LlamaIndexVectorStore

Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
class LanceDBVectorStore(LlamaIndexVectorStore):
    """Vector store backed by a LanceDB table opened from ``path``."""

    _li_class: Type[LILanceDBVectorStore] = LILanceDBVectorStore

    def __init__(
        self,
        path: str = "./lancedb",
        collection_name: str = "default",
        **kwargs: Any,
    ):
        """Connect to LanceDB and build the wrapped client.

        Args:
            path: location passed to ``lancedb.connect``
            collection_name: name of the table to open
            kwargs: extra parameters forwarded to the parent initializer

        Raises:
            ImportError: if ``lancedb`` is not installed
        """
        self._path = path
        self._collection_name = collection_name

        try:
            import lancedb
        except ImportError as exc:
            # The original hint misspelled the package as "tanvity-py".
            raise ImportError(
                "Please install lancedb: 'pip install lancedb tantivy'"
            ) from exc

        db_connection = lancedb.connect(path)  # type: ignore
        try:
            table = db_connection.open_table(collection_name)
        except FileNotFoundError:
            # No table yet: pass None and let the wrapped store handle it.
            table = None

        self._kwargs = kwargs

        # pass through for nice IDE support
        super().__init__(
            uri=path,
            table_name=collection_name,
            table=table,
            **kwargs,
        )
        self._client = cast(LILanceDBVectorStore, self._client)
        self._client._metadata_keys = ["file_id"]

    def delete(self, ids: List[str], **kwargs):
        """Delete vector embeddings from vector stores

        Args:
            ids: List of ids of the embeddings to be deleted
            kwargs: meant for vectorstore-specific parameters (not used here)
        """
        self._client.delete_nodes(ids)

    def drop(self):
        """Delete entire collection from vector stores"""
        # The name is stored as `_collection_name`; this class never assigns
        # a public `collection_name` attribute.
        self._client.client.drop_table(self._collection_name)

    def count(self) -> int:
        """Not supported for this store.

        Raises:
            NotImplementedError: always
        """
        raise NotImplementedError

    def __persist_flow__(self):
        """Return the constructor arguments needed to rebuild this store."""
        return {
            "path": self._path,
            "collection_name": self._collection_name,
        }

delete

delete(ids, **kwargs)

Delete vector embeddings from vector stores

Parameters:

Name Type Description Default
ids List[str]

List of ids of the embeddings to be deleted

required
kwargs

meant for vectorstore-specific parameters

{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
def delete(self, ids: List[str], **kwargs):
    """Remove the embeddings with the given ids from the store.

    Args:
        ids: ids of the embeddings to remove
        kwargs: meant for vectorstore-specific parameters (not used here)
    """
    client = self._client
    client.delete_nodes(ids)

drop

drop()

Delete entire collection from vector stores

Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
def drop(self):
    """Delete entire collection from vector stores"""
    # The table name is stored on the instance as `_collection_name`.
    self._client.client.drop_table(self._collection_name)

MilvusVectorStore

Bases: LlamaIndexVectorStore

Source code in libs/kotaemon/kotaemon/storages/vectorstores/milvus.py
class MilvusVectorStore(LlamaIndexVectorStore):
    """Milvus-backed vector store whose wrapped client is created lazily.

    The parent initializer is only called on first use (see ``_lazy_init``),
    because it needs the vector dimension.
    """

    _li_class = None

    def _get_li_class(self):
        """Import and return the LlamaIndex Milvus vector store class.

        Raises:
            ImportError: if the integration package is not installed
        """
        try:
            from llama_index.vector_stores.milvus import (
                MilvusVectorStore as LIMilvusVectorStore,
            )
        except ImportError as exc:
            raise ImportError(
                "Please install missing package: "
                "'pip install llama-index-vector-stores-milvus'"
            ) from exc

        return LIMilvusVectorStore

    def __init__(
        self,
        uri: str = "./milvus.db",  # or "http://localhost:19530"
        collection_name: str = "default",
        token: Optional[str] = None,
        **kwargs: Any,
    ):
        """Record the connection settings; no client is created yet.

        Args:
            uri: local database file or an ``http...`` server address
            collection_name: name of the collection to use
            token: optional token forwarded to the parent initializer
            kwargs: extra parameters forwarded to the parent initializer;
                an optional ``path`` entry is used as the directory for a
                local ``uri``
        """
        self._uri = uri
        self._collection_name = collection_name
        self._token = token
        self._kwargs = kwargs
        self._path = kwargs.get("path", None)
        self._inited = False

    def _lazy_init(self, dim: Optional[int] = None):
        """
        Lazy init the client.
        Because the LlamaIndex init method requires the dim parameter,
        we need to try to get the dim from the first embedding.

        Args:
            dim: Dimension of the vectors.
        """
        if self._inited:
            return

        # `path` is optional, and os.path.isdir(None) raises TypeError, so
        # check that it is set before probing the filesystem.
        is_remote = self._uri.startswith("http")
        if self._path and os.path.isdir(self._path) and not is_remote:
            uri = os.path.join(self._path, self._uri)
        else:
            uri = self._uri

        super().__init__(
            uri=uri,
            token=self._token,
            collection_name=self._collection_name,
            dim=dim,
            **self._kwargs,
        )
        from llama_index.vector_stores.milvus import (
            MilvusVectorStore as LIMilvusVectorStore,
        )

        self._client = cast(LIMilvusVectorStore, self._client)
        self._inited = True

    def add(
        self,
        embeddings: list[list[float]] | list[DocumentWithEmbedding],
        metadatas: Optional[list[dict]] = None,
        ids: Optional[list[str]] = None,
    ):
        """Add embeddings, creating the client on first use.

        The vector dimension is taken from the first embedding.

        Args:
            embeddings: raw vectors or ``DocumentWithEmbedding`` objects
            metadatas: optional list of metadata dicts of the embeddings
            ids: optional list of ids of the embeddings

        Returns:
            Whatever the parent ``add`` returns; an empty list when called
            with no embeddings before the client exists.
        """
        if not self._inited:
            if not embeddings:
                # Nothing to insert and no vector to infer the dimension from.
                return []
            first = embeddings[0]
            if isinstance(first, list):
                dim = len(first)
            else:
                dim = len(first.embedding)
            self._lazy_init(dim)

        return super().add(embeddings=embeddings, metadatas=metadatas, ids=ids)

    def query(
        self,
        embedding: list[float],
        top_k: int = 1,
        ids: Optional[list[str]] = None,
        **kwargs,
    ) -> tuple[list[list[float]], list[float], list[str]]:
        """Return the top k most similar embeddings.

        Args:
            embedding: the query vector; its length is used as the dimension
                if the client still has to be created
            top_k: number of most similar embeddings to return
            ids: list of ids of the embeddings to be queried
            kwargs: forwarded to the parent ``query``
        """
        self._lazy_init(len(embedding))

        return super().query(embedding=embedding, top_k=top_k, ids=ids, **kwargs)

    def delete(self, ids: list[str], **kwargs):
        """Delete the embeddings with the given ids.

        Args:
            ids: list of ids of the embeddings to be deleted
            kwargs: forwarded to the parent ``delete``
        """
        self._lazy_init()
        super().delete(ids=ids, **kwargs)

    def drop(self):
        """Drop the whole collection."""
        # Like delete() and count(), make sure the client exists first;
        # otherwise `self._client` is undefined on a fresh instance.
        self._lazy_init()
        self._client.client.drop_collection(self._collection_name)

    def count(self) -> int:
        """Return the number of stored items, or 0 if the client cannot be
        initialised."""
        try:
            self._lazy_init()
        except Exception:
            # Deliberate best effort, but no longer swallows
            # KeyboardInterrupt/SystemExit as the bare `except:` did.
            return 0
        return self._client.client.query(
            collection_name=self._collection_name, output_fields=["count(*)"]
        )[0]["count(*)"]

    def __persist_flow__(self):
        """Return the constructor arguments needed to rebuild this store."""
        return {
            "uri": self._uri,
            "collection_name": self._collection_name,
            "token": self._token,
            **self._kwargs,
        }

QdrantVectorStore

Bases: LlamaIndexVectorStore

Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
class QdrantVectorStore(LlamaIndexVectorStore):
    """Vector store backed by a Qdrant collection."""

    _li_class = None

    def _get_li_class(self):
        """Import and return the LlamaIndex Qdrant vector store class.

        Raises:
            ImportError: if the integration package is not installed
        """
        try:
            from llama_index.vector_stores.qdrant import (
                QdrantVectorStore as LIQdrantVectorStore,
            )
        except ImportError:
            raise ImportError(
                "Please install missing package: "
                "'pip install llama-index-vector-stores-qdrant'"
            )

        return LIQdrantVectorStore

    def __init__(
        self,
        collection_name,
        url: Optional[str] = None,
        api_key: Optional[str] = None,
        client_kwargs: Optional[dict] = None,
        **kwargs: Any,
    ):
        """Record the settings and build the wrapped client.

        Args:
            collection_name: name of the Qdrant collection
            url: forwarded to the parent initializer
            api_key: forwarded to the parent initializer
            client_kwargs: forwarded to the parent initializer
            kwargs: extra parameters forwarded to the parent initializer
        """
        # Remember every argument so __persist_flow__ can replay them.
        self._collection_name = collection_name
        self._url = url
        self._api_key = api_key
        self._client_kwargs = client_kwargs
        self._kwargs = kwargs

        super().__init__(
            collection_name=collection_name,
            url=url,
            api_key=api_key,
            client_kwargs=client_kwargs,
            **kwargs,
        )
        from llama_index.vector_stores.qdrant import (
            QdrantVectorStore as LIQdrantVectorStore,
        )

        self._client = cast(LIQdrantVectorStore, self._client)

    def delete(self, ids: List[str], **kwargs):
        """Remove the points with the given ids from the collection.

        Args:
            ids: ids of the embeddings to remove
            kwargs: forwarded to the Qdrant client's ``delete`` call
        """
        from qdrant_client import models

        selector = models.PointIdsList(points=ids)
        self._client.client.delete(
            collection_name=self._collection_name,
            points_selector=selector,
            **kwargs,
        )

    def drop(self):
        """Remove the whole collection from the vector store."""
        qdrant = self._client.client
        qdrant.delete_collection(self._collection_name)

    def count(self) -> int:
        """Return the exact number of points in the collection."""
        result = self._client.client.count(
            collection_name=self._collection_name, exact=True
        )
        return result.count

    def __persist_flow__(self):
        """Return the constructor arguments needed to rebuild this store."""
        flow = {
            "collection_name": self._collection_name,
            "url": self._url,
            "api_key": self._api_key,
            "client_kwargs": self._client_kwargs,
        }
        # Extra kwargs go last so they override the explicit keys on clash.
        flow.update(self._kwargs)
        return flow

delete

delete(ids, **kwargs)

Delete vector embeddings from vector stores

Parameters:

Name Type Description Default
ids List[str]

List of ids of the embeddings to be deleted

required
kwargs

meant for vectorstore-specific parameters

{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
def delete(self, ids: List[str], **kwargs):
    """Remove the points with the given ids from the collection.

    Args:
        ids: ids of the embeddings to remove
        kwargs: forwarded to the Qdrant client's ``delete`` call
    """
    from qdrant_client import models

    selector = models.PointIdsList(points=ids)
    self._client.client.delete(
        collection_name=self._collection_name,
        points_selector=selector,
        **kwargs,
    )

drop

drop()

Delete entire collection from vector stores

Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
def drop(self):
    """Remove the whole collection from the vector store."""
    qdrant = self._client.client
    qdrant.delete_collection(self._collection_name)

SimpleFileVectorStore

Bases: LlamaIndexVectorStore

Similar to InMemoryVectorStore but is backed by file by default

Source code in libs/kotaemon/kotaemon/storages/vectorstores/simple_file.py
class SimpleFileVectorStore(LlamaIndexVectorStore):
    """Similar to InMemoryVectorStore but is backed by file by default"""

    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore
    store_text: bool = False

    def __init__(
        self,
        path: str | Path,
        collection_name: str = "default",
        data: Optional[SimpleVectorStoreData] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize params.

        Args:
            path: directory that holds the persisted store
            collection_name: file name of this store inside ``path``
            data: existing store data; a new ``SimpleVectorStoreData`` is
                created when omitted
            fs: filesystem to use; defaults to ``fsspec.filesystem("file")``
            kwargs: extra parameters forwarded to the parent initializer
        """
        self._data = data or SimpleVectorStoreData()
        self._fs = fs or fsspec.filesystem("file")
        self._collection_name = collection_name
        self._path = path
        self._save_path = Path(path) / collection_name
        # Kept so drop() can rebuild the client with the same options.
        self._kwargs = kwargs

        # Pass the resolved objects rather than the raw (possibly None)
        # arguments, so the wrapped client and `self._data` are one object.
        super().__init__(
            data=self._data,
            fs=self._fs,
            **kwargs,
        )

        # An existing persisted file takes precedence over `data`.
        # NOTE(review): `self._data` is not refreshed from the loaded client,
        # so __persist_flow__ may not reflect file content - confirm.
        if self._save_path.is_file():
            self._client = self._li_class.from_persist_path(
                persist_path=str(self._save_path), fs=self._fs
            )

    def add(
        self,
        embeddings: list[list[float]] | list[DocumentWithEmbedding],
        metadatas: Optional[list[dict]] = None,
        ids: Optional[list[str]] = None,
    ):
        """Add embeddings, then persist the store to its file.

        Args:
            embeddings: raw vectors or ``DocumentWithEmbedding`` objects
            metadatas: optional list of metadata dicts of the embeddings
            ids: optional list of ids of the embeddings

        Returns:
            Whatever the parent ``add`` returns.
        """
        r = super().add(embeddings, metadatas, ids)
        self._client.persist(str(self._save_path), self._fs)
        return r

    def delete(self, ids: list[str], **kwargs):
        """Delete embeddings by id, then persist the store to its file.

        Args:
            ids: list of ids of the embeddings to be deleted
            kwargs: forwarded to the parent ``delete``

        Returns:
            Whatever the parent ``delete`` returns.
        """
        r = super().delete(ids, **kwargs)
        self._client.persist(str(self._save_path), self._fs)
        return r

    def drop(self):
        """Clear the stored data and remove the persisted file."""
        self._data = SimpleVectorStoreData()
        # Rebuild the client as well; otherwise it keeps the old vectors and
        # the next add() would write them back to disk.
        self._client = self._li_class(data=self._data, fs=self._fs, **self._kwargs)
        self._save_path.unlink(missing_ok=True)

    def __persist_flow__(self):
        """Return the constructor arguments needed to rebuild this store."""
        d = self._data.to_dict()
        d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}"
        return {
            "data": d,
            "collection_name": self._collection_name,
            "path": str(self._path),
            # "fs": self._fs,
        }