add redis to accelerate access of minio (#482)
### What problem does this PR solve?

Adds a Redis cache in front of MinIO so that files fetched at dispatch time can be read back by the task executor without another round-trip to object storage.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
```diff
@@ -32,6 +32,7 @@ from api.db.services.document_service import DocumentService
 from api.settings import database_logger
 from api.utils import get_format_time, get_uuid
 from api.utils.file_utils import get_project_base_directory
+from rag.utils.redis_conn import REDIS_CONN


 def collect(tm):
```
```diff
@@ -84,10 +85,16 @@ def dispatch():

         tsks = []
         try:
+            file_bin = MINIO.get(r["kb_id"], r["location"])
+            if REDIS_CONN.is_alive():
+                try:
+                    REDIS_CONN.set("{}/{}".format(r["kb_id"], r["location"]), file_bin, 12*60)
+                except Exception as e:
+                    cron_logger.warning("Put into redis[EXCEPTION]:" + str(e))
+
             if r["type"] == FileType.PDF.value:
                 do_layout = r["parser_config"].get("layout_recognize", True)
-                pages = PdfParser.total_page_number(
-                    r["name"], MINIO.get(r["kb_id"], r["location"]))
+                pages = PdfParser.total_page_number(r["name"], file_bin)
                 page_size = r["parser_config"].get("task_page_size", 12)
                 if r["parser_id"] == "paper":
                     page_size = r["parser_config"].get("task_page_size", 22)
```
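In this hunk the dispatcher pulls the raw file from MinIO once, reuses the bytes for page counting, and parks them in Redis under the key `"{kb_id}/{location}"` with an expiry of `12*60` (presumably seconds, i.e. twelve minutes, depending on how `redis_conn` interprets its third argument). Redis failures are deliberately non-fatal: the exception is only logged and dispatch continues, since the executor can always fall back to MinIO. As a rough illustration of what the `REDIS_CONN.set(...)` call likely amounts to when backed by redis-py (the real `rag/utils/redis_conn.py` is not part of this diff, so the connection details and helper name below are assumptions):

```python
import redis  # assumption: REDIS_CONN is ultimately backed by the redis-py client

# Hypothetical connection details; the real ones would come from rag.settings.
r = redis.Redis(host="127.0.0.1", port=6379, db=1)


def cache_file_bin(kb_id: str, location: str, file_bin: bytes, ttl_sec: int = 12 * 60) -> None:
    """Best-effort cache of a MinIO object body; a Redis failure must not break dispatch."""
    try:
        # Same key format the dispatcher uses: "<kb_id>/<location>".
        r.set("{}/{}".format(kb_id, location), file_bin, ex=ttl_sec)
    except Exception as e:
        print("Put into redis[EXCEPTION]:" + str(e))  # the real code logs via cron_logger.warning
```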
```diff
@@ -110,8 +117,7 @@ def dispatch():

             elif r["parser_id"] == "table":
                 rn = HuExcelParser.row_number(
-                    r["name"], MINIO.get(
-                        r["kb_id"], r["location"]))
+                    r["name"], file_bin)
                 for i in range(0, rn, 3000):
                     task = new_task()
                     task["from_page"] = i
```
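The dispatcher hunks above and the executor hunks below both import `REDIS_CONN` from `rag.utils.redis_conn`, which is not itself part of this diff. From the calls the change relies on, `is_alive()`, `get(key)`, and `set(key, value, expiry)`, a minimal sketch of what such a wrapper might look like follows; the host, port, database index, and error handling are assumptions, not the project's actual module:

```python
import logging

import redis  # assumption: the wrapper is a thin layer over redis-py


class RedisConn:
    """Minimal sketch of the interface rag/utils/redis_conn.REDIS_CONN would need."""

    def __init__(self, host="127.0.0.1", port=6379, db=1, password=None):
        self._client = None
        try:
            # decode_responses stays False so binary file bodies round-trip untouched.
            self._client = redis.StrictRedis(host=host, port=port, db=db,
                                             password=password, decode_responses=False)
        except Exception:
            logging.warning("Redis can't be connected.")

    def is_alive(self) -> bool:
        try:
            return self._client is not None and self._client.ping()
        except Exception:
            return False

    def get(self, key):
        try:
            return self._client.get(key)
        except Exception as e:
            logging.warning("[EXCEPTION]get {}||{}".format(key, str(e)))
            return None

    def set(self, key, value, exp=3600):
        try:
            return self._client.set(key, value, ex=exp)
        except Exception as e:
            logging.warning("[EXCEPTION]set {}||{}".format(key, str(e)))
            return False


REDIS_CONN = RedisConn()
```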
```diff
@@ -19,13 +19,12 @@ import logging
 import os
 import hashlib
 import copy
-import random
 import re
 import sys
 import time
 import traceback
 from functools import partial
-
+from rag.utils import MINIO
 from api.db.db_models import close_connection
 from rag.settings import database_logger
 from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
```
```diff
@@ -35,7 +34,7 @@ from elasticsearch_dsl import Q
 from multiprocessing.context import TimeoutError
 from api.db.services.task_service import TaskService
 from rag.utils import ELASTICSEARCH
-from rag.utils import MINIO
+from timeit import default_timer as timer
 from rag.utils import rmSpace, findMaxTm

 from rag.nlp import search
```
```diff
@@ -48,6 +47,7 @@ from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
 from api.db.services.llm_service import LLMBundle
 from api.utils.file_utils import get_project_base_directory
+from rag.utils.redis_conn import REDIS_CONN

 BATCH_SIZE = 64

```
```diff
@@ -105,11 +105,16 @@ def collect(comm, mod, tm):

 def get_minio_binary(bucket, name):
     global MINIO
+    if REDIS_CONN.is_alive():
+        try:
+            r = REDIS_CONN.get("{}/{}".format(bucket, name))
+            if r: return r
+        except Exception as e:
+            cron_logger.warning("Get redis[EXCEPTION]:" + str(e))
     return MINIO.get(bucket, name)


 def build(row):
-    from timeit import default_timer as timer
     if row["size"] > DOC_MAXIMUM_SIZE:
         set_progress(row["id"], prog=-1, msg="File size exceeds( <= %dMb )" %
                      (int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
```
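On the executor side, `get_minio_binary` now tries Redis first and only falls back to MinIO on a cache miss or a Redis error, so a task picked up shortly after dispatch avoids a second trip to object storage; once the expiry passes, the lookup silently degrades to the original MinIO path. The same read-with-fallback shape can be written generically, which makes it easy to exercise without a live Redis or MinIO; the helper below is illustrative, not part of the PR:

```python
from typing import Callable, Optional


def read_through(cache_get: Callable[[str], Optional[bytes]],
                 store_get: Callable[[], bytes],
                 key: str) -> bytes:
    """Return cached bytes when available, otherwise fall back to the backing store."""
    try:
        cached = cache_get(key)
        if cached:
            return cached
    except Exception:
        pass  # cache trouble is never fatal; the store stays authoritative
    return store_get()


# Usage mirroring get_minio_binary(bucket, name):
# file_bin = read_through(REDIS_CONN.get,
#                         lambda: MINIO.get(bucket, name),
#                         "{}/{}".format(bucket, name))
```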
```diff
@@ -265,6 +270,7 @@ def main(comm, mod):
         callback(
             msg="Finished slicing files(%d). Start to embedding the content." %
             len(cks))
+        st = timer()
         try:
             tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
         except Exception as e:
```
```diff
@@ -272,9 +278,10 @@ def main(comm, mod):
             cron_logger.error(str(e))
             tk_count = 0

-        callback(msg="Finished embedding! Start to build index!")
+        callback(msg="Finished embedding({})! Start to build index!".format(timer()-st))
         init_kb(r)
         chunk_count = len(set([c["_id"] for c in cks]))
+        st = timer()
         es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
         if es_r:
             callback(-1, "Index failure!")
```
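These two hunks add simple stage timing: `st = timer()` is taken before the embedding call so the progress callback can report how long embedding took, and a fresh `st = timer()` is started before the Elasticsearch bulk insert so indexing time can be logged separately. The pattern is just a wall-clock delta from `timeit.default_timer` around a stage, for example:

```python
from timeit import default_timer as timer


def timed(stage_name, fn, *args, **kwargs):
    """Run one pipeline stage and report its elapsed wall-clock time in seconds."""
    st = timer()
    result = fn(*args, **kwargs)
    print("Finished {} ({:.2f}s)".format(stage_name, timer() - st))
    return result


# e.g. tk_count = timed("embedding", embedding, cks, embd_mdl, parser_config, callback)
```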
```diff
@@ -290,8 +297,8 @@ def main(comm, mod):
             DocumentService.increment_chunk_num(
                 r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
             cron_logger.info(
-                "Chunk doc({}), token({}), chunks({})".format(
-                    r["id"], tk_count, len(cks)))
+                "Chunk doc({}), token({}), chunks({}), elapsed:{}".format(
+                    r["id"], tk_count, len(cks), timer()-st))

         tmf.write(str(r["update_time"]) + "\n")
     tmf.close()
```
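Taken together, the dispatcher acts as the cache producer and the executor as the consumer: each file body is read from MinIO once at dispatch time, kept in Redis for a short window, and picked up by whichever executor handles the task, while MinIO remains the source of truth whenever Redis is down, the key has expired, or the lookup fails. A condensed sketch of that flow, using only names and calls that appear in the hunks above (the two function names are illustrative):

```python
from rag.settings import cron_logger
from rag.utils import MINIO
from rag.utils.redis_conn import REDIS_CONN


def dispatch_side(kb_id, location):
    # Producer (dispatcher): fetch once from MinIO, reuse locally, park in Redis.
    file_bin = MINIO.get(kb_id, location)
    if REDIS_CONN.is_alive():
        try:
            REDIS_CONN.set("{}/{}".format(kb_id, location), file_bin, 12 * 60)
        except Exception as e:
            cron_logger.warning("Put into redis[EXCEPTION]:" + str(e))
    return file_bin


def executor_side(bucket, name):
    # Consumer (executor): prefer the cached copy, fall back to MinIO on miss or error.
    if REDIS_CONN.is_alive():
        try:
            cached = REDIS_CONN.get("{}/{}".format(bucket, name))
            if cached:
                return cached
        except Exception as e:
            cron_logger.warning("Get redis[EXCEPTION]:" + str(e))
    return MINIO.get(bucket, name)
```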