solve task execution issues (#90)

This commit is contained in:
KevinHuSh
2024-03-01 19:48:01 +08:00
committed by GitHub
parent 7f174fb9d3
commit 8a726fb04b
16 changed files with 89 additions and 87 deletions

View File

@@ -1,4 +1,4 @@
import copy
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
@@ -80,6 +80,20 @@ def tokenize(d, t, eng):
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
def tokenize_table(tbls, doc, eng, batch_size=10):
    """Turn table rows into tokenized chunks, *batch_size* rows per chunk.

    Each chunk is a deep copy of *doc* with the joined row text tokenized
    into it and the source table image attached under the "image" key.
    Returns the list of chunk dicts.
    """
    chunks = []
    # English text separates rows with "; "; otherwise a plain space.
    joiner = "; " if eng else " "
    for image, rows in tbls:
        for start in range(0, len(rows), batch_size):
            chunk = copy.deepcopy(doc)
            text = joiner.join(rows[start:start + batch_size])
            tokenize(chunk, text, eng)
            chunk["image"] = image
            chunks.append(chunk)
    return chunks
def remove_contents_table(sections, eng=False):
i = 0
while i < len(sections):
@@ -201,10 +215,12 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。"):
tnum = num_tokens_from_string(t)
if tnum < 8: pos = ""
if tk_nums[-1] > chunk_token_num:
cks.append(t + pos)
if t.find(pos) < 0: t += pos
cks.append(t)
tk_nums.append(tnum)
else:
cks[-1] += t + pos
if cks[-1].find(pos) < 0: t += pos
cks[-1] += t
tk_nums[-1] += tnum
for sec, pos in sections:

View File

@@ -1,6 +1,8 @@
# -*- coding: utf-8 -*-
import json
import re
from copy import deepcopy
from elasticsearch_dsl import Q, Search
from typing import List, Optional, Dict, Union
from dataclasses import dataclass
@@ -98,7 +100,7 @@ class Dealer:
del s["highlight"]
q_vec = s["knn"]["query_vector"]
es_logger.info("【Q】: {}".format(json.dumps(s)))
res = self.es.search(s, idxnm=idxnm, timeout="600s", src=src)
res = self.es.search(deepcopy(s), idxnm=idxnm, timeout="600s", src=src)
es_logger.info("TOTAL: {}".format(self.es.getTotal(res)))
if self.es.getTotal(res) == 0 and "knn" in s:
bqry, _ = self.qryr.question(qst, min_match="10%")