Supports obtaining PDF documents from web pages (#1107)

### What problem does this PR solve? Knowledge base management supports crawling information from web pages and generating PDF documents ### Type of change - [x] New Feature (Support document from web pages)
2024-06-11 10:45:19 +08:00
parent 68a698655a
commit 7eb69fe6d9
14 changed files with 336 additions and 17 deletions
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -39,6 +39,7 @@ from api.settings import RetCode
 from api.utils.api_utils import get_json_result
 from rag.utils.minio_conn import MINIO
 from api.utils.file_utils import filename_type, thumbnail
+from api.utils.web_utils import html2pdf, is_valid_url


@manager.route('/upload', methods=['POST'])
@@ -289,7 +290,7 @@ def run():
                return get_data_error_result(retmsg="Tenant not found!")
            ELASTICSEARCH.deleteByQuery(
                Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
-            
+
            if str(req["run"]) == TaskStatus.RUNNING.value:
                TaskService.filter_delete([Task.doc_id == id])
                e, doc = DocumentService.get_by_id(id)
@@ -416,3 +417,69 @@ def get_image(image_id):
        return response
    except Exception as e:
        return server_error_response(e)
+
+
+@manager.route('/web_crawl', methods=['POST'])
+@login_required
+def web_crawl():
+    kb_id = request.form.get("kb_id")
+    if not kb_id:
+        return get_json_result(
+            data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
+    name = request.form.get("name")
+    url = request.form.get("url")
+    if not name:
+        return get_json_result(
+            data=False, retmsg='Lack of "name"', retcode=RetCode.ARGUMENT_ERROR)
+    if not url:
+        return get_json_result(
+            data=False, retmsg='Lack of "url"', retcode=RetCode.ARGUMENT_ERROR)
+    if not is_valid_url(url):
+        return get_json_result(
+            data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR)
+    e, kb = KnowledgebaseService.get_by_id(kb_id)
+    if not e:
+        raise LookupError("Can't find this knowledgebase!")
+
+    root_folder = FileService.get_root_folder(current_user.id)
+    pf_id = root_folder["id"]
+    FileService.init_knowledgebase_docs(pf_id, current_user.id)
+    kb_root_folder = FileService.get_kb_folder(current_user.id)
+    kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
+
+    try:
+        filename = duplicate_name(
+            DocumentService.query,
+            name=name+".pdf",
+            kb_id=kb.id)
+        filetype = filename_type(filename)
+        if filetype == FileType.OTHER.value:
+            raise RuntimeError("This type of file has not been supported yet!")
+
+        location = filename
+        while MINIO.obj_exist(kb_id, location):
+            location += "_"
+        blob = html2pdf(url)
+        MINIO.put(kb_id, location, blob)
+        doc = {
+            "id": get_uuid(),
+            "kb_id": kb.id,
+            "parser_id": kb.parser_id,
+            "parser_config": kb.parser_config,
+            "created_by": current_user.id,
+            "type": filetype,
+            "name": filename,
+            "location": location,
+            "size": len(blob),
+            "thumbnail": thumbnail(filename, blob)
+        }
+        if doc["type"] == FileType.VISUAL:
+            doc["parser_id"] = ParserType.PICTURE.value
+        if re.search(r"\.(ppt|pptx|pages)$", filename):
+            doc["parser_id"] = ParserType.PRESENTATION.value
+        DocumentService.insert(doc)
+        FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
+    except Exception as e:
+        return get_json_result(
+            data=False, retmsg=e, retcode=RetCode.SERVER_ERROR)
+    return get_json_result(data=True)
--- a/api/utils/web_utils.py
+++ b/api/utils/web_utils.py
@@ -0,0 +1,82 @@
+import re
+import json
+import base64
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support.expected_conditions import staleness_of
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.common.by import By
+
+
+def html2pdf(
+        source: str,
+        timeout: int = 2,
+        install_driver: bool = True,
+        print_options: dict = {},
+):
+    result = __get_pdf_from_html(source, timeout, install_driver, print_options)
+    return result
+
+
+def __send_devtools(driver, cmd, params={}):
+    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
+    url = driver.command_executor._url + resource
+    body = json.dumps({"cmd": cmd, "params": params})
+    response = driver.command_executor._request("POST", url, body)
+
+    if not response:
+        raise Exception(response.get("value"))
+
+    return response.get("value")
+
+
+def __get_pdf_from_html(
+        path: str,
+        timeout: int,
+        install_driver: bool,
+        print_options: dict
+):
+    webdriver_options = Options()
+    webdriver_prefs = {}
+    webdriver_options.add_argument("--headless")
+    webdriver_options.add_argument("--disable-gpu")
+    webdriver_options.add_argument("--no-sandbox")
+    webdriver_options.add_argument("--disable-dev-shm-usage")
+    webdriver_options.experimental_options["prefs"] = webdriver_prefs
+
+    webdriver_prefs["profile.default_content_settings"] = {"images": 2}
+
+    if install_driver:
+        service = Service(ChromeDriverManager().install())
+        driver = webdriver.Chrome(service=service, options=webdriver_options)
+    else:
+        driver = webdriver.Chrome(options=webdriver_options)
+
+    driver.get(path)
+
+    try:
+        WebDriverWait(driver, timeout).until(
+            staleness_of(driver.find_element(by=By.TAG_NAME, value="html"))
+        )
+    except TimeoutException:
+        calculated_print_options = {
+            "landscape": False,
+            "displayHeaderFooter": False,
+            "printBackground": True,
+            "preferCSSPageSize": True,
+        }
+        calculated_print_options.update(print_options)
+        result = __send_devtools(
+            driver, "Page.printToPDF", calculated_print_options)
+        driver.quit()
+        return base64.b64decode(result["data"])
+
+
+def is_valid_url(url: str) -> bool:
+    return bool(re.match(r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url))
+
+