Skip to content

Docx Loader

DocxReader

Bases: BaseReader

Read Docx files while preserving tables, using the python-docx library

Reader behavior
  • All paragraphs are extracted as a Document
  • Each table is extracted as a Document, rendered as a CSV string
  • The output is a list of Documents, concatenating the above (tables + paragraphs)
Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
class DocxReader(BaseReader):
    """Read Docx files while preserving tables, using the python-docx library

    Reader behavior:
        - All paragraphs are extracted as a Document
        - Each table is extracted as a Document, rendered as a CSV string
        - The output is a list of Documents, concatenating the above
        (tables + paragraphs)
    """

    def __init__(self, *args, **kwargs):
        """Check that the python-docx dependency can be imported.

        Raises:
            ImportError: if the `docx` module cannot be imported
        """
        # NOTE(review): args/kwargs are accepted but not forwarded to the
        # base class -- confirm whether BaseReader needs to be initialised.
        try:
            import docx  # noqa
        except ImportError as err:
            # Chain the original error so the real import failure stays
            # visible in the traceback.
            raise ImportError(
                "docx is not installed. "
                "Please install it using `pip install python-docx`"
            ) from err

    def _load_single_table(self, table) -> List[List[str]]:
        """Extract the cell text of a table in column-major order.

        Args:
            table: table object exposing `rows`, `columns` and, per row,
                `cells` whose items have a `text` attribute

        Returns:
            List[List[str]]: one list per column; entry `i` of a column is
                the text of that column's cell in row `i`. Some merged cells
                will share duplicated content.
        """
        n_row = len(table.rows)
        n_col = len(table.columns)

        arrays = [["" for _ in range(n_row)] for _ in range(n_col)]

        for i, row in enumerate(table.rows):
            for j, cell in enumerate(row.cells):
                # A row may report more cells than the declared column
                # count; grow the grid instead of raising IndexError.
                if j >= len(arrays):
                    arrays.append([""] * n_row)
                arrays[j][i] = cell.text

        return arrays

    def load_data(
        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Load data using Docx reader

        Args:
            file_path (Path): Path to .docx file
            extra_info (Optional[dict]): extra metadata merged into the
                metadata of every returned Document

        Returns:
            List[Document]: list of documents extracted from the .docx file;
                one Document per table (CSV text) followed by one Document
                holding all paragraph text
        """
        import docx

        file_path = Path(file_path).resolve()

        doc = docx.Document(str(file_path))
        all_text = "\n".join(
            [unicodedata.normalize("NFKC", p.text) for p in doc.paragraphs]
        )
        pages = [all_text]  # 1 page only

        tables = []
        for t in doc.tables:
            # list of columns, each a list of cell strings (first = header)
            columns = self._load_single_table(t)

            # A table without any row has no header to read; skip it
            # rather than failing on `column[0]`.
            if columns and not columns[0]:
                continue

            # Key the data by column position, then attach the header row
            # as labels. Keying by header text would silently drop columns
            # whose headers repeat (e.g. merged header cells).
            frame = pd.DataFrame(
                {pos: column[1:] for pos, column in enumerate(columns)}
            )
            frame.columns = [column[0] for column in columns]
            tables.append(frame)

        extra_info = extra_info or {}

        # create output Document with metadata from table
        documents = []
        for table in tables:
            csv_text = table.to_csv(index=False)
            documents.append(
                Document(
                    text=csv_text.strip(),
                    metadata={
                        "table_origin": csv_text,
                        "type": "table",
                        **extra_info,
                    },
                    metadata_template="",
                    metadata_seperator="",
                )
            )

        # create Document from non-table text
        documents.extend(
            Document(
                text=non_table_text.strip(),
                metadata={"page_label": 1, **extra_info},
            )
            for non_table_text in pages
        )

        return documents

load_data

load_data(file_path, extra_info=None, **kwargs)

Load data using Docx reader

Parameters:

Name Type Description Default
file_path Path

Path to .docx file

required

Returns:

Type Description
List[Document]

List[Document]: list of documents extracted from the .docx file

Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
def load_data(
    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
) -> List[Document]:
    """Load data using Docx reader

    Args:
        file_path (Path): Path to .docx file
        extra_info (Optional[dict]): extra metadata merged into the
            metadata of every returned Document

    Returns:
        List[Document]: list of documents extracted from the .docx file;
            one Document per table (CSV text) followed by one Document
            holding all paragraph text
    """
    import docx

    file_path = Path(file_path).resolve()

    doc = docx.Document(str(file_path))
    all_text = "\n".join(
        [unicodedata.normalize("NFKC", p.text) for p in doc.paragraphs]
    )
    pages = [all_text]  # 1 page only

    tables = []
    for t in doc.tables:
        # list of columns, each a list of cell strings (first = header)
        columns = self._load_single_table(t)

        # A table without any row has no header to read; skip it rather
        # than failing on `column[0]`.
        if columns and not columns[0]:
            continue

        # Key the data by column position, then attach the header row as
        # labels. Keying by header text would silently drop columns whose
        # headers repeat (e.g. merged header cells).
        frame = pd.DataFrame(
            {pos: column[1:] for pos, column in enumerate(columns)}
        )
        frame.columns = [column[0] for column in columns]
        tables.append(frame)

    extra_info = extra_info or {}

    # create output Document with metadata from table
    documents = []
    for table in tables:
        csv_text = table.to_csv(index=False)
        documents.append(
            Document(
                text=csv_text.strip(),
                metadata={
                    "table_origin": csv_text,
                    "type": "table",
                    **extra_info,
                },
                metadata_template="",
                metadata_seperator="",
            )
        )

    # create Document from non-table text
    documents.extend(
        Document(
            text=non_table_text.strip(),
            metadata={"page_label": 1, **extra_info},
        )
        for non_table_text in pages
    )

    return documents