Skip to content

Qa

CitationPipeline

Bases: BaseComponent

Citation pipeline to extract cited evidence from the source (based on the input question)

Source code in libs/kotaemon/kotaemon/indices/qa/citation.py
class CitationPipeline(BaseComponent):
    """Extract cited evidence for a question from a given context.

    The extraction is delegated to an LLM via a function/tool call whose
    parameters follow the ``CiteEvidence`` schema.
    """

    # LLM used to produce the tool call carrying the citations.
    llm: BaseLLM

    def run(self, context: str, question: str):
        """Synchronous entry point; forwards directly to :meth:`invoke`."""
        return self.invoke(context, question)

    def prepare_llm(self, context: str, question: str):
        """Build the chat messages and tool-call kwargs for the LLM.

        Returns:
            A ``(messages, llm_kwargs)`` pair ready to be handed to the
            underlying LLM's ``invoke``.
        """
        schema = CiteEvidence.schema()
        cite_function = {
            "name": schema["title"],
            "description": schema["description"],
            "parameters": schema,
        }
        llm_kwargs = {
            "tools": [{"type": "function", "function": cite_function}],
            "tool_choice": "required",
            "tools_pydantic": [CiteEvidence],
        }
        messages = [
            SystemMessage(
                content=(
                    "You are a world class algorithm to answer "
                    "questions with correct and exact citations."
                )
            ),
            HumanMessage(
                content=(
                    "Answer question using the following context. "
                    "Use the provided function CiteEvidence() to cite your sources."
                )
            ),
            HumanMessage(content=context),
            HumanMessage(content=f"Question: {question}"),
            HumanMessage(
                content=(
                    "Tips: Make sure to cite your sources, "
                    "and use the exact words from the context."
                )
            ),
        ]
        return messages, llm_kwargs

    def invoke(self, context: str, question: str):
        """Call the LLM and parse its first tool call into a ``CiteEvidence``.

        Returns ``None`` when the LLM emits no tool call or when any error
        occurs (best-effort: the exception is printed, never raised).
        """
        messages, llm_kwargs = self.prepare_llm(context, question)
        try:
            print("CitationPipeline: invoking LLM")
            llm_output = self.get_from_path("llm").invoke(messages, **llm_kwargs)
            print("CitationPipeline: finish invoking LLM")

            tool_calls = llm_output.additional_kwargs.get("tool_calls")
            if not tool_calls:
                return None

            first_call = tool_calls[0]
            # OpenAI/Cohere nest the payload under "function"; the other
            # (Anthropic-style) format exposes it directly under "args".
            if "function" in first_call:
                payload = first_call["function"]["arguments"]
            else:
                payload = first_call["args"]

            print("CitationPipeline:", payload)

            # String payloads are raw JSON; mappings are parsed as objects.
            if isinstance(payload, str):
                return CiteEvidence.parse_raw(payload)
            return CiteEvidence.parse_obj(payload)
        except Exception as e:
            print(e)
            return None

    async def ainvoke(self, context: str, question: str):
        """Async variant — not implemented yet."""
        raise NotImplementedError()

CitationQAPipeline

Bases: BaseComponent

Answering a question from a text corpus, with citations

Source code in libs/kotaemon/kotaemon/indices/qa/text_based.py
class CitationQAPipeline(BaseComponent):
    """Answer a question from a text corpus, optionally with citations."""

    # Prompt used to generate the answer from the retrieved context.
    qa_prompt_template: PromptTemplate = PromptTemplate(
        'Answer the following question: "{question}". '
        "The context is: \n{context}\nAnswer: "
    )
    # Default LLM: a placeholder Azure OpenAI deployment; real deployments
    # are expected to override this.
    llm: BaseLLM = LCAzureChatOpenAI.withx(
        azure_endpoint="https://bleh-dummy.openai.azure.com/",
        openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
        openai_api_version="2023-07-01-preview",
        deployment_name="dummy-q2-16k",
        temperature=0,
        request_timeout=60,
    )
    # Built lazily, reusing the same LLM that generates the answer.
    citation_pipeline: CitationPipeline = Node(
        default_callback=lambda self: CitationPipeline(llm=self.llm)
    )

    def _format_doc_text(self, text: str) -> str:
        """Collapse newlines so a document's text renders as a single line."""
        return text.replace("\n", " ")

    def _format_retrieved_context(self, documents: list[RetrievedDocument]) -> str:
        """Join every formatted document text into one context string."""
        return "\n\n".join(
            self._format_doc_text(doc.text) for doc in documents
        )

    def run(
        self,
        question: str,
        documents: list[RetrievedDocument],
        use_citation: bool = False,
        **kwargs
    ) -> Document:
        """Answer ``question`` using ``documents`` as the retrieved context.

        When ``use_citation`` is True, the citation pipeline is run as well
        and its result is stored under the ``"citation"`` metadata key
        (``None`` otherwise).
        """
        # Retrieve relevant documents as context.
        context = self._format_retrieved_context(documents)
        self.log_progress(".context", context=context)

        # Generate the answer.
        prompt = self.qa_prompt_template.populate(
            context=context,
            question=question,
        )
        self.log_progress(".prompt", prompt=prompt)

        answer_text = self.llm(prompt).text
        citation = (
            self.citation_pipeline(context=context, question=question)
            if use_citation
            else None
        )

        return Document(text=answer_text, metadata={"citation": citation})