Supports obtaining PDF documents from web pages (#1107)
### What problem does this PR solve?

Knowledge base management now supports crawling a web page and saving the result as a PDF document.

### Type of change

- [x] New Feature (support creating documents from web pages)
This commit is contained in:
@@ -39,6 +39,7 @@ from api.settings import RetCode
|
||||
from api.utils.api_utils import get_json_result
|
||||
from rag.utils.minio_conn import MINIO
|
||||
from api.utils.file_utils import filename_type, thumbnail
|
||||
from api.utils.web_utils import html2pdf, is_valid_url
|
||||
|
||||
|
||||
@manager.route('/upload', methods=['POST'])
|
||||
@@ -289,7 +290,7 @@ def run():
|
||||
return get_data_error_result(retmsg="Tenant not found!")
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
|
||||
|
||||
|
||||
if str(req["run"]) == TaskStatus.RUNNING.value:
|
||||
TaskService.filter_delete([Task.doc_id == id])
|
||||
e, doc = DocumentService.get_by_id(id)
|
||||
@@ -416,3 +417,69 @@ def get_image(image_id):
|
||||
return response
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route('/web_crawl', methods=['POST'])
@login_required
def web_crawl():
    """Render a web page to PDF and register it as a knowledge-base document.

    Form fields read from the request:
        kb_id: ID of the target knowledge base.
        name: document name; ".pdf" is appended before de-duplication.
        url: address of the page handed to ``html2pdf``.

    Returns:
        ``get_json_result(data=True)`` on success. A result carrying
        ``RetCode.ARGUMENT_ERROR`` when a field is missing or the URL is
        rejected by ``is_valid_url``. A result carrying
        ``RetCode.SERVER_ERROR`` when rendering or storing fails.

    Raises:
        LookupError: if ``KnowledgebaseService.get_by_id`` finds nothing.
    """
    kb_id = request.form.get("kb_id")
    if not kb_id:
        return get_json_result(
            data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
    name = request.form.get("name")
    url = request.form.get("url")
    if not name:
        return get_json_result(
            data=False, retmsg='Lack of "name"', retcode=RetCode.ARGUMENT_ERROR)
    if not url:
        return get_json_result(
            data=False, retmsg='Lack of "url"', retcode=RetCode.ARGUMENT_ERROR)
    # NOTE(review): `url` is user-supplied and is opened by a server-side
    # browser. is_valid_url also admits file:// and does not filter internal
    # hosts - confirm this exposure (SSRF / local file disclosure) is intended.
    if not is_valid_url(url):
        return get_json_result(
            data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR)
    e, kb = KnowledgebaseService.get_by_id(kb_id)
    if not e:
        raise LookupError("Can't find this knowledgebase!")

    # Make sure the user's folder hierarchy for knowledge-base files exists.
    root_folder = FileService.get_root_folder(current_user.id)
    pf_id = root_folder["id"]
    FileService.init_knowledgebase_docs(pf_id, current_user.id)
    kb_root_folder = FileService.get_kb_folder(current_user.id)
    kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])

    try:
        filename = duplicate_name(
            DocumentService.query,
            name=name + ".pdf",
            kb_id=kb.id)
        filetype = filename_type(filename)
        if filetype == FileType.OTHER.value:
            raise RuntimeError("This type of file has not been supported yet!")

        # Append underscores until MINIO reports no existing object of that name.
        location = filename
        while MINIO.obj_exist(kb_id, location):
            location += "_"
        blob = html2pdf(url)
        MINIO.put(kb_id, location, blob)
        doc = {
            "id": get_uuid(),
            "kb_id": kb.id,
            "parser_id": kb.parser_id,
            "parser_config": kb.parser_config,
            "created_by": current_user.id,
            "type": filetype,
            "name": filename,
            "location": location,
            "size": len(blob),
            "thumbnail": thumbnail(filename, blob)
        }
        # Compare against the enum's value, matching the FileType.OTHER check
        # above (filetype is compared as a value there).
        if doc["type"] == FileType.VISUAL.value:
            doc["parser_id"] = ParserType.PICTURE.value
        if re.search(r"\.(ppt|pptx|pages)$", filename):
            doc["parser_id"] = ParserType.PRESENTATION.value
        DocumentService.insert(doc)
        FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
    except Exception as e:
        # retmsg must be text: every other call passes a string, and an
        # exception object cannot be serialised into the JSON response.
        return get_json_result(
            data=False, retmsg=str(e), retcode=RetCode.SERVER_ERROR)
    return get_json_result(data=True)
|
||||
|
||||
82
api/utils/web_utils.py
Normal file
82
api/utils/web_utils.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import re
|
||||
import json
|
||||
import base64
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support.expected_conditions import staleness_of
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
|
||||
def html2pdf(
    source: str,
    timeout: int = 2,
    install_driver: bool = True,
    print_options: dict = None,
):
    """Render the page at ``source`` to PDF and return the document bytes.

    Thin public wrapper around ``__get_pdf_from_html``.

    Args:
        source: URL of the page to render.
        timeout: value forwarded as the wait timeout before printing.
        install_driver: forwarded flag selecting whether chromedriver is
            installed through webdriver_manager.
        print_options: optional overrides for the PDF print parameters;
            ``None`` (the default) means no overrides.

    Returns:
        Whatever ``__get_pdf_from_html`` returns for these arguments.
    """
    # A None default replaces the former shared ``{}`` default so that no
    # dict object is reused between calls.
    if print_options is None:
        print_options = {}
    return __get_pdf_from_html(source, timeout, install_driver, print_options)
|
||||
|
||||
|
||||
def __send_devtools(driver, cmd, params=None):
    """Send a DevTools command through chromedriver and return its value.

    Args:
        driver: Selenium Chrome driver; its ``session_id`` and
            ``command_executor`` are used to build and send the request.
        cmd: DevTools command name, e.g. ``"Page.printToPDF"``.
        params: command parameters; ``None`` is sent as an empty object.

    Returns:
        The ``"value"`` entry of the response.

    Raises:
        RuntimeError: if the executor returns an empty response.
    """
    if params is None:
        params = {}

    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
    # NOTE(review): _url and _request are private attributes of the command
    # executor and may change between Selenium releases.
    url = driver.command_executor._url + resource
    body = json.dumps({"cmd": cmd, "params": params})
    response = driver.command_executor._request("POST", url, body)

    if not response:
        # An empty response has no "value" to report (and None has no .get),
        # so raise a descriptive error instead of dereferencing it.
        raise RuntimeError("Empty response from DevTools command %r" % cmd)

    return response.get("value")
|
||||
|
||||
|
||||
def __get_pdf_from_html(
    path: str,
    timeout: int,
    install_driver: bool,
    print_options: dict
):
    """Load ``path`` in headless Chrome and return the page printed as PDF.

    Args:
        path: URL passed to ``driver.get``.
        timeout: seconds given to the post-load wait before printing.
        install_driver: when true, chromedriver is obtained through
            ``ChromeDriverManager().install()``; otherwise
            ``webdriver.Chrome`` is created without an explicit service.
        print_options: overrides merged over the default
            ``Page.printToPDF`` parameters.

    Returns:
        The base64-decoded ``"data"`` field of the print result, as bytes.
    """
    webdriver_options = Options()
    webdriver_options.add_argument("--headless")
    webdriver_options.add_argument("--disable-gpu")
    webdriver_options.add_argument("--no-sandbox")
    webdriver_options.add_argument("--disable-dev-shm-usage")
    # presumably content-setting 2 blocks image loading - TODO confirm
    webdriver_options.experimental_options["prefs"] = {
        "profile.default_content_settings": {"images": 2},
    }

    if install_driver:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=webdriver_options)
    else:
        driver = webdriver.Chrome(options=webdriver_options)

    # Everything after the browser is started runs under try/finally so the
    # Chrome process is always shut down, including when loading or printing
    # raises.
    try:
        driver.get(path)

        try:
            WebDriverWait(driver, timeout).until(
                staleness_of(driver.find_element(by=By.TAG_NAME, value="html"))
            )
        except TimeoutException:
            # The wait acts as a bounded pause after loading; timing out
            # simply means the root element was still attached, so carry on.
            pass

        # Print whether or not the wait timed out, so the function always
        # yields a PDF rather than falling through without a result.
        calculated_print_options = {
            "landscape": False,
            "displayHeaderFooter": False,
            "printBackground": True,
            "preferCSSPageSize": True,
        }
        calculated_print_options.update(print_options or {})
        result = __send_devtools(
            driver, "Page.printToPDF", calculated_print_options)
        return base64.b64decode(result["data"])
    finally:
        driver.quit()
|
||||
|
||||
|
||||
def is_valid_url(url: str) -> bool:
    """Report whether ``url`` begins with an http(s), ftp or file URL.

    The pattern is applied with ``re.match``, which anchors only at the
    start of the string: a string passes as soon as a leading portion
    matches, regardless of what follows it.
    """
    url_pattern = r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
    found = re.match(url_pattern, url)
    return found is not None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user