# ragflow_python/sdk/python/ragflow/modules/dataset.py

from typing import Optional, List

from .document import Document

from .base import Base


class DataSet(Base):
    """Client-side representation of one dataset.

    Instances carry the dataset's metadata as plain attributes and offer
    ``save``, ``delete`` and ``list_docs`` operations that are sent through
    the ``post`` / ``rm`` / ``get`` helpers inherited from ``Base``.
    """

    class ParserConfig(Base):
        """Parsing options attached to a dataset."""

        def __init__(self, rag, res_dict):
            # Defaults are assigned before delegating to the base class.
            # NOTE(review): presumably Base.__init__ overlays ``res_dict``
            # onto these attributes — confirm against base.py.
            self.chunk_token_count = 128
            self.layout_recognize = True
            self.delimiter = '\n!?。；！？'
            self.task_page_size = 12
            super().__init__(rag, res_dict)

    def __init__(self, rag, res_dict):
        """Initialise the dataset from a server-side dictionary.

        Args:
            rag: Client object handed on to ``Base.__init__``.
            res_dict: Mapping of field names to values. Keys that do not
                match one of the attributes declared below are ignored.
                The mapping itself is left unmodified.
        """
        self.id = ""
        self.name = ""
        self.avatar = ""
        self.tenant_id = None
        self.description = ""
        self.language = "English"
        self.embedding_model = ""
        self.permission = "me"
        self.document_count = 0
        self.chunk_count = 0
        self.parse_method = "naive"
        self.parser_config = None
        # Build a filtered copy rather than popping from ``res_dict`` so the
        # caller's dictionary is not altered as a side effect.
        known_fields = {key: value for key, value in res_dict.items() if key in self.__dict__}
        super().__init__(rag, known_fields)

    @staticmethod
    def _raise_unless_success(res_json) -> None:
        """Raise ``Exception`` carrying ``retmsg`` unless it equals "success"."""
        message = res_json.get("retmsg")
        if message != "success":
            # ``.get`` keeps a response without "retmsg" from surfacing as a
            # KeyError instead of the intended error.
            raise Exception(message)

    def save(self) -> bool:
        """Send the current attribute values to ``/dataset/save``.

        Returns:
            bool: ``True`` when the response's ``retmsg`` is "success".

        Raises:
            Exception: With the response's ``retmsg`` otherwise.
        """
        # ``parser_config`` defaults to None; only call ``to_json`` when the
        # current value actually provides it.
        to_json = getattr(self.parser_config, "to_json", None)
        parser_config = to_json() if callable(to_json) else self.parser_config

        payload = {
            "id": self.id,
            "name": self.name,
            "avatar": self.avatar,
            "tenant_id": self.tenant_id,
            "description": self.description,
            "language": self.language,
            "embedding_model": self.embedding_model,
            "permission": self.permission,
            "document_count": self.document_count,
            "chunk_count": self.chunk_count,
            "parse_method": self.parse_method,
            "parser_config": parser_config,
        }
        res_json = self.post('/dataset/save', payload).json()
        self._raise_unless_success(res_json)
        return True

    def delete(self) -> bool:
        """Request deletion of this dataset via ``/dataset/delete``.

        Returns:
            bool: ``True`` when the response's ``retmsg`` is "success".

        Raises:
            Exception: With the response's ``retmsg`` otherwise.
        """
        res_json = self.rm('/dataset/delete', {"id": self.id}).json()
        self._raise_unless_success(res_json)
        return True

    def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int = -1) -> List[Document]:
        """
        List the documents in the dataset, optionally filtering by keywords, with pagination support.

        Args:
            keywords (Optional[str]): A string of keywords to filter the documents. Defaults to None.
            offset (int): The starting point for pagination. Defaults to 0.
            limit (int): The maximum number of documents to return. Defaults to -1 (no limit).

        Returns:
            List[Document]: A list of Document objects; empty when the
            response carries no documents.

        Raises:
            Exception: With the response's ``retmsg`` when it is not "success".
        """
        payload = {
            "kb_id": self.id,
            "keywords": keywords,
            "offset": offset,
            "limit": limit
        }

        res_json = self.get(f'/doc/dataset/{self.id}/documents', payload).json()
        self._raise_unless_success(res_json)

        # Treat a missing or null "data" / "docs" entry as "no documents"
        # instead of failing on the lookup.
        data = res_json.get("data") or {}
        docs = data.get("docs") or []
        return [Document(self.rag, doc_data) for doc_data in docs]