Refactor Chunk API (#2855)

### What problem does this PR solve?

Refactor Chunk API
#2846
### Type of change


- [x] Refactoring

---------

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
liuhua
2024-10-16 18:41:24 +08:00
committed by GitHub
parent b9fa00f341
commit dab92ac1e8
11 changed files with 760 additions and 791 deletions


@@ -244,42 +244,117 @@ File management inside knowledge base
## Upload document
```python
DataSet.upload_documents(document_list: List[dict])
```
### Parameters
#### document_list:`List[dict]`
A list of dicts, each containing a `name` and a `blob`.
### Returns
No return value.
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.create_dataset(name="kb_1")
ds.upload_documents([{"name": "1.txt", "blob": b"123"}, ...])
```
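For files on disk, a minimal sketch of building `document_list` from real file contents (the paths below are placeholders):
```python
from pathlib import Path
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.create_dataset(name="kb_1")

# Hypothetical paths; each dict carries the file name and its raw bytes.
paths = [Path("./test_data/test1.txt"), Path("./test_data/test2.txt")]
ds.upload_documents([{"name": p.name, "blob": p.read_bytes()} for p in paths])
```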
---
## Update document
```python
Document.update(update_message:dict)
```
### Parameters
#### update_message:`dict`
Only `name`, `parser_config`, and `parser_method` can be changed.
### Returns
No return value.
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="id")
ds = ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
doc.update({"parser_method": "manual"})
```
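Since `name` is also an accepted key, a sketch of renaming a document with the same call (the new name is a placeholder):
```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="id")[0]
doc = ds.list_documents(id="wdfxb5t547d")[0]

# "renamed_1.txt" is a hypothetical new name.
doc.update({"name": "renamed_1.txt"})
```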
---
## Download document
```python
Document.download() -> bytes
```
### Returns
bytes of the document.
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds=rag.list_datasets(id="id")
ds=ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
open("~/ragflow.txt", "wb+").write(doc.download())
print(doc)
```
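Note that `open()` does not expand `~`; a sketch that writes into the home directory explicitly:
```python
import os
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="id")[0]
doc = ds.list_documents(id="wdfxb5t547d")[0]

# Expand "~" before opening the target file for binary writing.
with open(os.path.expanduser("~/ragflow.txt"), "wb") as f:
    f.write(doc.download())
```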
---
## List documents
```python
Dataset.list_documents(id: str = None, keywords: str = None, offset: int = 0, limit: int = 1024, orderby: str = "create_time", desc: bool = True) -> List[Document]
```
### Parameters
#### id: `str`
ID of the document to retrieve. Defaults to `None`.
#### keywords: `str`
List documents whose name has the given keywords. Defaults to `None`.
#### offset: `int`
The beginning number of records for paging. Defaults to `0`.
#### limit: `int`
Number of records to return; `-1` means all of them. Defaults to `1024`.
#### orderby: `str`
The field by which the records are sorted. Defaults to `"create_time"`.
#### desc: `bool`
Whether to sort in descending order. Defaults to `True`.
### Returns
List[Document]
A list of `Document` objects, each containing the following attributes:
#### id: `str`
@@ -352,98 +427,14 @@ Duration of the processing in seconds or minutes. Defaults to `0.0`.
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.create_dataset(name="kb_1")
filename1 = "~/ragflow.txt"
blob = open(filename1, "rb").read()
list_files = [{"name": filename1, "blob": blob}]
ds.upload_documents(list_files)
for d in ds.list_documents(keywords="rag", offset=0, limit=12):
    print(d)
```
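For large datasets, a paging sketch using `offset` and `limit` (this assumes an empty list marks the last page):
```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(name="kb_1")[0]

# Walk the documents ten at a time; the page size is arbitrary.
offset, page_size = 0, 10
while True:
    page = ds.list_documents(offset=offset, limit=page_size)
    if not page:
        break
    for d in page:
        print(d)
    offset += page_size
```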
@@ -452,12 +443,11 @@ for d in ds.list_docs(keywords="rag", offset=0, limit=12):
## Delete documents
```python
DataSet.delete_documents(ids: List[str] = None)
```
### Returns
No return value.
### Examples
@@ -465,119 +455,87 @@ description: delete success or not
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(name="kb_1")
ds = ds[0]
ds.delete_documents(ids=["id_1","id_2"])
```
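When the IDs are not known up front, a sketch that collects them with `list_documents` first (the keyword is a placeholder):
```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(name="kb_1")[0]

# Delete every document whose name matches the (hypothetical) keyword.
ids = [d.id for d in ds.list_documents(keywords="obsolete")]
ds.delete_documents(ids=ids)
```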
---
## Parse and stop parsing document
```python
DataSet.async_parse_documents(document_ids:List[str]) -> None
DataSet.async_cancel_parse_documents(document_ids: List[str]) -> None
```
### Parameters
#### document_ids: `List[str]`
The IDs of the documents to parse.
### Returns
No return value.
### Examples
```python
#documents parse and cancel
rag = RAGFlow(API_KEY, HOST_ADDRESS)
ds = rag.create_dataset(name="God5")
documents = [
    {'name': 'test1.txt', 'blob': open('./test_data/test1.txt', "rb").read()},
    {'name': 'test2.txt', 'blob': open('./test_data/test2.txt', "rb").read()},
    {'name': 'test3.txt', 'blob': open('./test_data/test3.txt', "rb").read()}
]
ds.upload_documents(documents)
documents = ds.list_documents(keywords="test")
ids = []
for document in documents:
    ids.append(document.id)
ds.async_parse_documents(ids)
print("Async bulk parsing initiated")
ds.async_cancel_parse_documents(ids)
print("Async bulk parsing cancelled")
```
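Since parsing is asynchronous, a minimal polling sketch; it assumes the `Document` attributes `progress` (0.0 to 1.0) and `progress_msg`, which are not shown in this excerpt:
```python
import time
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(name="God5")[0]
ids = [d.id for d in ds.list_documents(keywords="test")]
ds.async_parse_documents(ids)

# Re-fetch the documents until all of them report completion.
while True:
    docs = [ds.list_documents(id=i)[0] for i in ids]
    for d in docs:
        print(f"{d.name}: {d.progress} {d.progress_msg}")
    if all(d.progress >= 1.0 for d in docs):
        break
    time.sleep(5)
```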
---
## List chunks
```python
Document.list_chunks(keywords: str = None, offset: int = 0, limit: int = -1, id: str = None) -> List[Chunk]
```
### Parameters
#### keywords: `str`
List chunks whose content has the given keywords. Defaults to `None`.
#### offset: `int`
The beginning number of records for paging. Defaults to `0`.
#### limit: `int`
Number of records to return; `-1` means all of them. Defaults to `-1`.
#### id: `str`
The ID of the chunk to retrieve. Defaults to `None`.
### Returns
List[Chunk]
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets("123")
ds = ds[0]
ds.async_parse_documents(["wdfxb5t547d"])
for c in doc.list_chunks(keywords="rag", offset=0, limit=12):
print(c)
```
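A single chunk can also be fetched via the `id` parameter (the ID below is a placeholder):
```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")[0]
doc = ds.list_documents(id="wdfxb5t547d")[0]

# "chunk_id_1" is a hypothetical chunk ID.
chunks = doc.list_chunks(id="chunk_id_1")
```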
## Add chunk
```python
@@ -587,6 +545,9 @@ Document.add_chunk(content:str) -> Chunk
### Parameters
#### content: `str`, *Required*
Contains the main text or information of the chunk.
#### important_keywords: `List[str]`
Key terms or phrases that are significant or central to the chunk's content.
### Returns
@@ -598,7 +559,10 @@ chunk
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")
ds = ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
chunk = doc.add_chunk(content="xxxxxxx")
```
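A sketch that also supplies `important_keywords` at creation time (the values are placeholders):
```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")[0]
doc = ds.list_documents(id="wdfxb5t547d")[0]

# Hypothetical content and key terms for the new chunk.
chunk = doc.add_chunk(content="RAGFlow is a RAG engine.",
                      important_keywords=["RAGFlow", "RAG"])
```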
@@ -607,12 +571,15 @@ chunk = doc.add_chunk(content="xxxxxxx")
## Delete chunk
```python
Document.delete_chunks(chunk_ids: List[str])
```
### Parameters
#### chunk_ids: `List[str]`
A list of chunk IDs.
### Returns
No return value.
### Examples
@@ -620,22 +587,34 @@ bool
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")
ds = ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
chunk = doc.add_chunk(content="xxxxxxx")
doc.delete_chunks(["id_1","id_2"])
```
---
## Update chunk
```python
Chunk.update(update_message: dict)
```
### Parameters
#### update_message: `dict`
Only the following keys can be included:
- `content`: `str`
The main text or information of the chunk.
- `important_keywords`: `List[str]`
Key terms or phrases that are significant or central to the chunk's content.
- `available`: `int`
The availability status: `0` means unavailable and `1` means available.
### Returns
No return value.
### Examples
@@ -643,10 +622,12 @@ bool
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")
ds = ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
chunk = doc.add_chunk(content="xxxxxxx")
chunk.content = "sdfx"
chunk.save()
chunk.update({"content":"sdfx...})
```
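A sketch that updates all three documented keys in one call (the values are placeholders):
```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")[0]
doc = ds.list_documents(id="wdfxb5t547d")[0]
chunk = doc.add_chunk(content="xxxxxxx")

# Hypothetical replacement content, key terms, and availability flag.
chunk.update({
    "content": "updated text",
    "important_keywords": ["rag", "retrieval"],
    "available": 0,
})
```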
---
@@ -654,7 +635,7 @@ chunk.save()
## Retrieval
```python
RAGFlow.retrieve(question: str = "", datasets: List[str] = None, documents: List[str] = None, offset: int = 1, limit: int = 30, similarity_threshold: float = 0.2, vector_similarity_weight: float = 0.3, top_k: int = 1024, rerank_id: str = None, keyword: bool = False, highlight: bool = False) -> List[Chunk]
```
### Parameters
@@ -691,6 +672,15 @@ The weight of vector cosine similarity, 1 - x is the term similarity weight. Def
Number of records engaged in vector cosine computation. Defaults to `1024`.
#### rerank_id:`str`
ID of the rerank model. Defaults to `None`.
#### keyword: `bool`
Whether keyword-based matching is enabled. Defaults to `False`.
#### highlight: `bool`
Whether to highlight matched terms in the results. Defaults to `False`.
### Returns
List[Chunk]
@@ -701,18 +691,17 @@ List[Chunk]
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(name="ragflow")
ds = ds[0]
name = 'ragflow_test.txt'
path = './test_data/ragflow_test.txt'
ds.upload_documents([{"name": name, "blob": open(path, "rb").read()}])
doc = ds.list_documents(name=name)
doc = doc[0]
ds.async_parse_documents([doc.id])
for c in rag.retrieve(question="What's ragflow?",
                      datasets=[ds.id], documents=[doc.id],
                      offset=1, limit=30, similarity_threshold=0.2,
                      vector_similarity_weight=0.3,
                      top_k=1024
                      ):