add use layout or not option (#145)

* add use layout or not option

* trivial
This commit is contained in:
KevinHuSh
2024-03-22 19:21:09 +08:00
committed by GitHub
parent 2f4c71b4b4
commit f6aee7f230
18 changed files with 238 additions and 140 deletions

View File

@@ -76,6 +76,25 @@ def tokenize(d, t, eng):
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
def tokenize_chunks(chunks, doc, eng, pdf_parser):
    """Build one tokenized document per non-blank chunk.

    Args:
        chunks: Iterable of chunk strings. Chunks that are empty or contain
            only whitespace are skipped.
        doc: Template document. It is deep-copied for every chunk, so the
            object passed in is never modified by this function.
        eng: Forwarded unchanged to ``tokenize``.
        pdf_parser: Optional parser. When truthy, its
            ``crop(ck, need_position=True)`` result supplies the ``"image"``
            entry and the positions handed to ``add_positions``, and its
            ``remove_tag(ck)`` result replaces the chunk text before
            tokenizing.

    Returns:
        A list with one document per retained chunk, in input order.
    """
    res = []
    # wrap up as es documents
    for ck in chunks:
        if not ck.strip():
            continue
        d = copy.deepcopy(doc)
        if pdf_parser:
            try:
                d["image"], poss = pdf_parser.crop(ck, need_position=True)
                add_positions(d, poss)
                ck = pdf_parser.remove_tag(ck)
            except NotImplementedError:
                # Best effort: presumably raised by parsers that do not
                # support cropping (TODO confirm). The chunk is still
                # tokenized, just without image/position data.
                pass
        tokenize(d, ck, eng)
        res.append(d)
    return res
def tokenize_table(tbls, doc, eng, batch_size=10):
res = []
# add tables

View File

@@ -300,7 +300,11 @@ class Huqie:
def qieqie(self, tks):
tks = tks.split(" ")
zh_num = len([1 for c in tks if c and is_chinese(c[0])])
if zh_num < len(tks) * 0.2:return " ".join(tks)
if zh_num < len(tks) * 0.2:
res = []
for tk in tks:
res.extend(tk.split("/"))
return " ".join(res)
res = []
for tk in tks:

View File

@@ -68,6 +68,7 @@ class Dealer:
s = Search()
pg = int(req.get("page", 1)) - 1
ps = int(req.get("size", 1000))
topk = int(req.get("topk", 1024))
src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id",
"image_id", "doc_id", "q_512_vec", "q_768_vec", "position_int",
"q_1024_vec", "q_1536_vec", "available_int", "content_with_weight"])
@@ -103,7 +104,7 @@ class Dealer:
assert emb_mdl, "No embedding model selected"
s["knn"] = self._vector(
qst, emb_mdl, req.get(
"similarity", 0.1), ps)
"similarity", 0.1), topk)
s["knn"]["filter"] = bqry.to_dict()
if "highlight" in s:
del s["highlight"]
@@ -292,8 +293,8 @@ class Dealer:
ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
if not question:
return ranks
req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": top,
"question": question, "vector": True,
req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": page_size,
"question": question, "vector": True, "topk": top,
"similarity": similarity_threshold}
sres = self.search(req, index_name(tenant_id), embd_mdl)