diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py index 6ab62d82..6b7c94d7 100644 --- a/api/apps/kb_app.py +++ b/api/apps/kb_app.py @@ -162,9 +162,9 @@ def rm(): message="Database error (Document removal)!") f2d = File2DocumentService.get_by_document_id(doc.id) FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id]) - FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name]) File2DocumentService.delete_by_document_id(doc.id) - + FileService.filter_delete( + [File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name]) if not KnowledgebaseService.delete_by_id(req["kb_id"]): return get_data_error_result( message="Database error (Knowledgebase removal)!") diff --git a/api/apps/sdk/dataset.py b/api/apps/sdk/dataset.py index 0dffdf29..6a26819a 100644 --- a/api/apps/sdk/dataset.py +++ b/api/apps/sdk/dataset.py @@ -252,9 +252,9 @@ def delete(tenant_id): File.id == f2d[0].file_id, ] ) - FileService.filter_delete( - [File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name]) File2DocumentService.delete_by_document_id(doc.id) + FileService.filter_delete( + [File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name]) if not KnowledgebaseService.delete_by_id(id): return get_error_data_result(message="Delete dataset error.(Database error)") return get_result(code=settings.RetCode.SUCCESS) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index ce1ec151..27233874 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -28,6 +28,8 @@ from cn2an import cn2an from PIL import Image import json +import chardet + all_codecs = [ 'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437', @@ -43,12 +45,17 @@ all_codecs = [ 'iso8859_14', 'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', 'mac_roman', 'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213', - 'utf_32', 'utf_32_be', 'utf_32_le''utf_16_be', 'utf_16_le', 'utf_7' + 'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16_be', 'utf_16_le', 'utf_7', 'windows-1250', 'windows-1251', + 'windows-1252', 'windows-1253', 'windows-1254', 'windows-1255', 'windows-1256', + 'windows-1257', 'windows-1258', 'latin-2' ] def find_codec(blob): - global all_codecs + detected = chardet.detect(blob[:1024]) + if detected['confidence'] > 0.5: + return detected['encoding'] + for c in all_codecs: try: blob[:1024].decode(c)