# ragflow_python/sdk/python/ragflow/modules/dataset.py

from typing import Optional, List

from .document import Document

from .base import Base


class DataSet(Base):
    """Client-side view of a single dataset.

    Instances are built from a response dictionary. Only keys that match one
    of the default attributes declared in ``__init__`` are forwarded to
    ``Base``; every other key is ignored.
    """

    class ParserConfig(Base):
        """Parser settings for a dataset, seeded with default values."""

        def __init__(self, rag, res_dict):
            """Set the default parser settings, then hand ``res_dict`` to ``Base``.

            Args:
                rag: Client object forwarded unchanged to ``Base.__init__``.
                res_dict: Parser settings forwarded unchanged to
                    ``Base.__init__``.
            """
            # Defaults are assigned before delegating to Base.
            # NOTE(review): presumably Base overrides them with the values
            # found in res_dict -- confirm against Base.__init__.
            self.chunk_token_count = 128
            self.layout_recognize = True
            self.delimiter = '\n!?。；！？'
            self.task_page_size = 12
            super().__init__(rag, res_dict)

    def __init__(self, rag, res_dict):
        """Initialise default attributes and apply the recognised keys of ``res_dict``.

        Args:
            rag: Client object forwarded unchanged to ``Base.__init__``.
            res_dict: Dataset fields. Keys that do not match a default
                attribute below are dropped before delegating to ``Base``.
                The dictionary passed in is left untouched.
        """
        self.id = ""
        self.name = ""
        self.avatar = ""
        self.tenant_id = None
        self.description = ""
        self.language = "English"
        self.embedding_model = ""
        self.permission = "me"
        self.document_count = 0
        self.chunk_count = 0
        self.parse_method = "naive"
        self.parser_config = None
        # Filter into a new dict rather than popping from res_dict, so the
        # caller's response payload is not modified as a side effect.
        known_fields = {
            key: value
            for key, value in res_dict.items()
            if key in self.__dict__
        }
        super().__init__(rag, known_fields)

    def update(self, update_message: dict) -> None:
        """Send ``update_message`` to the server for this dataset.

        Args:
            update_message: Payload passed to ``self.put`` for the
                ``/dataset/{id}`` path.

        Raises:
            Exception: If the response's ``code`` field is not ``0``. The
                exception carries the response's ``message`` field, or
                ``None`` when the response has no such field.
        """
        res = self.put(f'/dataset/{self.id}', update_message)
        res = res.json()
        if res.get("code") != 0:
            # .get keeps a response without "message" from surfacing as a
            # KeyError that would hide the actual failure.
            raise Exception(res.get("message"))

    def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int = -1) -> List[Document]:
        """
        List the documents in the dataset, optionally filtering by keywords, with pagination support.

        Args:
            keywords (Optional[str]): A string of keywords to filter the documents. Defaults to None.
            offset (int): The starting point for pagination. Defaults to 0.
            limit (int): The maximum number of documents to return. Defaults to -1
                (intended as "no limit"; the value is sent to the server as-is).

        Returns:
            List[Document]: A list of Document objects.

        Raises:
            Exception: If the response's ``retmsg`` field is not ``"success"``.
        """
        # Construct the request payload for listing documents
        payload = {
            "knowledgebase_id": self.id,
            "keywords": keywords,
            "offset": offset,
            "limit": limit,
        }

        # Send the request to the server to list documents
        res = self.get(f'/doc/dataset/{self.id}/documents', payload)
        res_json = res.json()

        # Handle response and error checking
        if res_json.get("retmsg") != "success":
            raise Exception(res_json.get("retmsg"))

        # Wrap each returned document record; a missing "docs" entry yields
        # an empty list.
        return [
            Document(self.rag, doc_data)
            for doc_data in res_json["data"].get("docs", [])
        ]