build dialog server; add thumbnail to docinfo (#17)

This commit is contained in:
KevinHuSh
2023-12-26 19:32:06 +08:00
committed by GitHub
parent 3245107dc7
commit 3fc700a1d4
12 changed files with 94 additions and 42 deletions

View File

@@ -6,11 +6,10 @@ from tornado.ioloop import IOLoop
from tornado.httpserver import HTTPServer
from tornado.options import define,options
from util import es_conn, setup_logging
from svr import sec_search as search
from svr.rpc_proxy import RPCProxy
from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
from nlp import huqie
from nlp import query as Query
from nlp import search
from llm import HuEmbedding, GptTurbo
import numpy as np
from io import BytesIO
@@ -38,7 +37,7 @@ def get_QA_pairs(hists):
def get_instruction(sres, top_i, max_len=8096 fld="content_ltks"):
def get_instruction(sres, top_i, max_len=8096, fld="content_ltks"):
max_len //= len(top_i)
# add instruction to prompt
instructions = [re.sub(r"[\r\n]+", " ", sres.field[sres.ids[i]][fld]) for i in top_i]
@@ -96,10 +95,11 @@ class Handler(RequestHandler):
try:
question = param.get("history",[{"user": "Hi!"}])[-1]["user"]
res = SE.search({
"question": question,
"kb_ids": param.get("kb_ids", []),
"size": param.get("topn", 15)
})
"question": question,
"kb_ids": param.get("kb_ids", []),
"size": param.get("topn", 15)},
search.index_name(param["uid"])
)
sim = SE.rerank(res, question)
rk_idx = np.argsort(sim*-1)
@@ -112,12 +112,12 @@ class Handler(RequestHandler):
refer = OrderedDict()
docnms = {}
for i in rk_idx:
did = res.field[res.ids[i]]["doc_id"])
if did not in docnms: docnms[did] = res.field[res.ids[i]]["docnm_kwd"])
did = res.field[res.ids[i]]["doc_id"]
if did not in docnms: docnms[did] = res.field[res.ids[i]]["docnm_kwd"]
if did not in refer: refer[did] = []
refer[did].append({
"chunk_id": res.ids[i],
"content": res.field[res.ids[i]]["content_ltks"]),
"content": res.field[res.ids[i]]["content_ltks"],
"image": ""
})
@@ -128,7 +128,7 @@ class Handler(RequestHandler):
"data":{
"uid": param["uid"],
"dialog_id": param["dialog_id"],
"assistant": ans
"assistant": ans,
"refer": [{
"did": did,
"doc_name": docnms[did],
@@ -153,7 +153,7 @@ if __name__ == '__main__':
parser.add_argument("--port", default=4455, type=int, help="Port used for service")
ARGS = parser.parse_args()
SE = search.ResearchReportSearch(es_conn.HuEs("infiniflow"), EMBEDDING)
SE = search.Dealer(es_conn.HuEs("infiniflow"), EMBEDDING)
app = Application([(r'/v1/chat/completions', Handler)],debug=False)
http_server = HTTPServer(app)

View File

@@ -6,7 +6,7 @@ from util.db_conn import Postgres
from util.minio_conn import HuMinio
from util import rmSpace, findMaxDt
from FlagEmbedding import FlagModel
from nlp import huchunk, huqie
from nlp import huchunk, huqie, search
import base64, hashlib
from io import BytesIO
import pandas as pd
@@ -103,7 +103,7 @@ def build(row):
if(!ctx._source.kb_id.contains('%s'))
ctx._source.kb_id.add('%s');
"""%(str(row["kb_id"]), str(row["kb_id"])),
idxnm = index_name(row["uid"])
idxnm = search.index_name(row["uid"])
)
set_progress(row["kb2doc_id"], 1, "Done")
return []
@@ -171,10 +171,8 @@ def build(row):
return docs
def index_name(uid):return f"docgpt_{uid}"
def init_kb(row):
idxnm = index_name(row["uid"])
idxnm = search.index_name(row["uid"])
if ES.indexExist(idxnm): return
return ES.createIdx(idxnm, json.load(open("conf/mapping.json", "r")))
@@ -199,7 +197,7 @@ def rm_doc_from_kb(df):
ctx._source.kb_id.indexOf('%s')
);
"""%(str(r["kb_id"]),str(r["kb_id"])),
idxnm = index_name(r["uid"])
idxnm = search.index_name(r["uid"])
)
if len(df) == 0:return
sql = """
@@ -233,7 +231,7 @@ def main(comm, mod):
set_progress(r["kb2doc_id"], random.randint(70, 95)/100.,
"Finished embedding! Start to build index!")
init_kb(r)
es_r = ES.bulk(cks, index_name(r["uid"]))
es_r = ES.bulk(cks, search.index_name(r["uid"]))
if es_r:
set_progress(r["kb2doc_id"], -1, "Index failure!")
print(es_r)