from typing import Optional, List from .document import Document from .base import Base class DataSet(Base): class ParserConfig(Base): def __init__(self, rag, res_dict): self.chunk_token_count = 128 self.layout_recognize = True self.delimiter = '\n!?。;!?' self.task_page_size = 12 super().__init__(rag, res_dict) def __init__(self, rag, res_dict): self.id = "" self.name = "" self.avatar = "" self.tenant_id = None self.description = "" self.language = "English" self.embedding_model = "" self.permission = "me" self.document_count = 0 self.chunk_count = 0 self.parse_method = "naive" self.parser_config = None for k in list(res_dict.keys()): if k not in self.__dict__: res_dict.pop(k) super().__init__(rag, res_dict) def update(self, update_message: dict): res = self.put(f'/dataset/{self.id}', update_message) res = res.json() if res.get("code") != 0: raise Exception(res["message"]) def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int = -1) -> List[Document]: """ List the documents in the dataset, optionally filtering by keywords, with pagination support. Args: keywords (Optional[str]): A string of keywords to filter the documents. Defaults to None. offset (int): The starting point for pagination. Defaults to 0. limit (int): The maximum number of documents to return. Defaults to -1 (no limit). Returns: List[Document]: A list of Document objects. """ # Construct the request payload for listing documents payload = { "knowledgebase_id": self.id, "keywords": keywords, "offset": offset, "limit": limit } # Send the request to the server to list documents res = self.get(f'/doc/dataset/{self.id}/documents', payload) res_json = res.json() # Handle response and error checking if res_json.get("retmsg") != "success": raise Exception(res_json.get("retmsg")) # Parse the document data from the response documents = [] for doc_data in res_json["data"].get("docs", []): doc = Document(self.rag, doc_data) documents.append(doc) return documents