Refactor Chunk API (#2855)

### What problem does this PR solve?

Refactor Chunk API
#2846
### Type of change


- [x] Refactoring

---------

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
liuhua
2024-10-16 18:41:24 +08:00
committed by GitHub
parent b9fa00f341
commit dab92ac1e8
11 changed files with 760 additions and 791 deletions


@@ -244,42 +244,117 @@ File management inside knowledge base
## Upload document
```python
DataSet.upload_documents(document_list: List[dict])
```
### Parameters
#### document_list:`List[dict]`
A list of dicts, each containing a `name` and a `blob`.
### Returns
No return value.
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.create_dataset(name="kb_1")
ds.upload_documents([{"name": "1.txt", "blob": b"123"}, ...])
```
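For files on disk, a minimal sketch of building `document_list` from real file contents (the paths below are placeholders):
```python
from pathlib import Path
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.create_dataset(name="kb_1")

# Hypothetical paths; each dict carries the file name and its raw bytes.
paths = [Path("./test_data/test1.txt"), Path("./test_data/test2.txt")]
ds.upload_documents([{"name": p.name, "blob": p.read_bytes()} for p in paths])
```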
---
## Update document
```python
Document.update(update_message:dict)
```
### Parameters
#### update_message:`dict`
Only `name`, `parser_config`, and `parser_method` can be changed.
### Returns
No return value.
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="id")
ds = ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
doc.update({"parser_method": "manual"})
```
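Since `name` is also an accepted key, a sketch of renaming a document with the same call (the new name is a placeholder):
```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="id")[0]
doc = ds.list_documents(id="wdfxb5t547d")[0]

# "renamed_1.txt" is a hypothetical new name.
doc.update({"name": "renamed_1.txt"})
```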
---
## Download document
```python
Document.download() -> bytes
```
### Returns
bytes of the document.
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds=rag.list_datasets(id="id")
ds=ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
open("~/ragflow.txt", "wb+").write(doc.download())
print(doc)
```
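Note that `open()` does not expand `~`; a sketch that writes into the home directory explicitly:
```python
import os
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="id")[0]
doc = ds.list_documents(id="wdfxb5t547d")[0]

# Expand "~" before opening the target file for binary writing.
with open(os.path.expanduser("~/ragflow.txt"), "wb") as f:
    f.write(doc.download())
```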
---
## List documents
```python
Dataset.list_documents(id: str = None, keywords: str = None, offset: int = 0, limit: int = 1024, orderby: str = "create_time", desc: bool = True) -> List[Document]
```
### Parameters
#### id: `str`
ID of the document to retrieve. Defaults to `None`.
#### keywords: `str`
List documents whose name has the given keywords. Defaults to `None`.
#### offset: `int`
The beginning number of records for paging. Defaults to `0`.
#### limit: `int`
Number of records to return; `-1` means all of them. Defaults to `1024`.
#### orderby: `str`
The field by which the records are sorted. Defaults to `"create_time"`.
#### desc: `bool`
Whether to sort in descending order. Defaults to `True`.
### Returns
List[Document]
A list of `Document` objects, each containing the following attributes:
#### id: `str`
@@ -352,98 +427,14 @@ Duration of the processing in seconds or minutes. Defaults to `0.0`.
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.create_dataset(name="kb_1")
filename1 = "~/ragflow.txt"
blob = open(filename1, "rb").read()
list_files = [{"name": filename1, "blob": blob}]
ds.upload_documents(list_files)
for d in ds.list_documents(keywords="rag", offset=0, limit=12):
    print(d)
```
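For large datasets, a paging sketch using `offset` and `limit` (this assumes an empty list marks the last page):
```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(name="kb_1")[0]

# Walk the documents ten at a time; the page size is arbitrary.
offset, page_size = 0, 10
while True:
    page = ds.list_documents(offset=offset, limit=page_size)
    if not page:
        break
    for d in page:
        print(d)
    offset += page_size
```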
@@ -452,12 +443,11 @@ for d in ds.list_docs(keywords="rag", offset=0, limit=12):
## Delete documents
```python
DataSet.delete_documents(ids: List[str] = None)
```
### Returns
No return value.
### Examples
@@ -465,119 +455,87 @@ description: delete success or not
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(name="kb_1")
ds = ds[0]
ds.delete_documents(ids=["id_1","id_2"])
```
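When the IDs are not known up front, a sketch that collects them with `list_documents` first (the keyword is a placeholder):
```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(name="kb_1")[0]

# Delete every document whose name matches the (hypothetical) keyword.
ids = [d.id for d in ds.list_documents(keywords="obsolete")]
ds.delete_documents(ids=ids)
```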
---
## Parse and stop parsing document
```python
DataSet.async_parse_documents(document_ids:List[str]) -> None
DataSet.async_cancel_parse_documents(document_ids: List[str]) -> None
```
### Parameters
#### document_ids: `List[str]`
The IDs of the documents to parse.
### Returns
No return value.
### Examples
```python
#documents parse and cancel
rag = RAGFlow(API_KEY, HOST_ADDRESS)
ds = rag.create_dataset(name="God5")
documents = [
    {'name': 'test1.txt', 'blob': open('./test_data/test1.txt', "rb").read()},
    {'name': 'test2.txt', 'blob': open('./test_data/test2.txt', "rb").read()},
    {'name': 'test3.txt', 'blob': open('./test_data/test3.txt', "rb").read()}
]
ds.upload_documents(documents)
documents = ds.list_documents(keywords="test")
ids = []
for document in documents:
    ids.append(document.id)
ds.async_parse_documents(ids)
print("Async bulk parsing initiated")
ds.async_cancel_parse_documents(ids)
print("Async bulk parsing cancelled")
```
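Since parsing is asynchronous, a minimal polling sketch; it assumes the `Document` attributes `progress` (0.0 to 1.0) and `progress_msg`, which are not shown in this excerpt:
```python
import time
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(name="God5")[0]
ids = [d.id for d in ds.list_documents(keywords="test")]
ds.async_parse_documents(ids)

# Re-fetch the documents until all of them report completion.
while True:
    docs = [ds.list_documents(id=i)[0] for i in ids]
    for d in docs:
        print(f"{d.name}: {d.progress} {d.progress_msg}")
    if all(d.progress >= 1.0 for d in docs):
        break
    time.sleep(5)
```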
---
## List chunks
```python
Document.list_chunks(keywords: str = None, offset: int = 0, limit: int = -1, id: str = None) -> List[Chunk]
```
### Parameters
#### keywords: `str`
List chunks whose content has the given keywords. Defaults to `None`.
#### offset: `int`
The beginning number of records for paging. Defaults to `0`.
#### limit: `int`
Number of records to return; `-1` means all of them. Defaults to `-1`.
#### id: `str`
The ID of the chunk to retrieve. Defaults to `None`.
### Returns
List[Chunk]
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets("123")
ds = ds[0]
ds.async_parse_documents(["wdfxb5t547d"])
for c in doc.list_chunks(keywords="rag", offset=0, limit=12):
print(c)
```
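A single chunk can also be fetched via the `id` parameter (the ID below is a placeholder):
```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")[0]
doc = ds.list_documents(id="wdfxb5t547d")[0]

# "chunk_id_1" is a hypothetical chunk ID.
chunks = doc.list_chunks(id="chunk_id_1")
```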
## Add chunk
```python
@@ -587,6 +545,9 @@ Document.add_chunk(content:str) -> Chunk
### Parameters
#### content: `str`, *Required*
Contains the main text or information of the chunk.
#### important_keywords: `List[str]`
Key terms or phrases that are significant or central to the chunk's content.
### Returns
@@ -598,7 +559,10 @@ chunk
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")
ds = ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
chunk = doc.add_chunk(content="xxxxxxx")
```
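A sketch that also supplies `important_keywords` at creation time (the values are placeholders):
```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")[0]
doc = ds.list_documents(id="wdfxb5t547d")[0]

# Hypothetical content and key terms for the new chunk.
chunk = doc.add_chunk(content="RAGFlow is a RAG engine.",
                      important_keywords=["RAGFlow", "RAG"])
```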
@@ -607,12 +571,15 @@ chunk = doc.add_chunk(content="xxxxxxx")
## Delete chunk
```python
Document.delete_chunks(chunk_ids: List[str])
```
### Parameters
#### chunk_ids: `List[str]`
A list of chunk IDs.
### Returns
No return value.
### Examples
@@ -620,22 +587,34 @@ bool
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")
ds = ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
chunk = doc.add_chunk(content="xxxxxxx")
doc.delete_chunks(["id_1","id_2"])
```
---
## Update chunk
```python
Chunk.update(update_message: dict)
```
### Parameters
#### update_message: `dict`
Only the following keys can be included:
- `content`: `str`
The main text or information of the chunk.
- `important_keywords`: `List[str]`
Key terms or phrases that are significant or central to the chunk's content.
- `available`: `int`
The availability status: `0` means unavailable and `1` means available.
### Returns
No return value.
### Examples
@@ -643,10 +622,12 @@ bool
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")
ds = ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
chunk = doc.add_chunk(content="xxxxxxx")
chunk.content = "sdfx"
chunk.save()
chunk.update({"content":"sdfx...})
```
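A sketch that updates all three documented keys in one call (the values are placeholders):
```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")[0]
doc = ds.list_documents(id="wdfxb5t547d")[0]
chunk = doc.add_chunk(content="xxxxxxx")

# Hypothetical replacement content, key terms, and availability flag.
chunk.update({
    "content": "updated text",
    "important_keywords": ["rag", "retrieval"],
    "available": 0,
})
```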
---
@@ -654,7 +635,7 @@ chunk.save()
## Retrieval
```python
RAGFlow.retrieve(question: str = "", datasets: List[str] = None, documents: List[str] = None, offset: int = 1, limit: int = 30, similarity_threshold: float = 0.2, vector_similarity_weight: float = 0.3, top_k: int = 1024, rerank_id: str = None, keyword: bool = False, highlight: bool = False) -> List[Chunk]
```
### Parameters
@@ -691,6 +672,15 @@ The weight of vector cosine similarity, 1 - x is the term similarity weight. Def
Number of records engaged in vector cosine computation. Defaults to `1024`.
#### rerank_id:`str`
ID of the rerank model. Defaults to `None`.
#### keyword: `bool`
Whether keyword-based matching is enabled. Defaults to `False`.
#### highlight: `bool`
Whether to highlight matched terms in the results. Defaults to `False`.
### Returns
List[Chunk]
@@ -701,18 +691,17 @@ List[Chunk]
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(name="ragflow")
ds = ds[0]
name = 'ragflow_test.txt'
path = './test_data/ragflow_test.txt'
ds.upload_documents([{"name": name, "blob": open(path, "rb").read()}])
doc = ds.list_documents(name=name)
doc = doc[0]
ds.async_parse_documents([doc.id])
for c in rag.retrieve(question="What's ragflow?",
                      datasets=[ds.id], documents=[doc.id],
                      offset=1, limit=30, similarity_threshold=0.2,
                      vector_similarity_weight=0.3,
                      top_k=1024
                      ):