refine manual parser (#131)

This commit is contained in:
KevinHuSh
2024-03-19 12:26:04 +08:00
committed by GitHub
parent d56c9e7630
commit 9da671b951
13 changed files with 145 additions and 52 deletions

View File

@@ -1,4 +1,6 @@
import random
from collections import Counter
from rag.utils import num_tokens_from_string
from . import huqie
from nltk import word_tokenize
@@ -175,6 +177,36 @@ def make_colon_as_title(sections):
i += 1
def title_frequency(bull, sections):
    """Estimate the dominant heading level across *sections*.

    Args:
        bull: index into the module-level BULLET_PATTERN table selecting the
            bullet/heading regex family to use; a negative value disables
            pattern matching.
        sections: list of (text, layout) pairs, where layout is a layout tag
            string (may contain "title"/"head").

    Returns:
        (most_level, levels): the most frequent recognized level and the
        per-section level list. Unrecognized sections get len(patterns) + 1;
        layout-detected titles get len(patterns).
    """
    patterns = BULLET_PATTERN[bull]
    n_pat = len(patterns)
    # Default every section to "no level recognized".
    levels = [n_pat + 1] * len(sections)
    if not sections or bull < 0:
        return n_pat + 1, levels

    for idx, (txt, layout) in enumerate(sections):
        stripped = txt.strip()
        matched = False
        for rank, pat in enumerate(patterns):
            if re.match(pat, stripped):
                levels[idx] = rank
                matched = True
                break
        # No bullet pattern matched: fall back to the layout tag, ignoring
        # any "@"-suffixed metadata on the text.
        if not matched and re.search(r"(title|head)", layout) \
                and not not_title(txt.split("@")[0]):
            levels[idx] = n_pat

    # Pick the most frequent level that is an actual pattern level
    # (most_common sorts by count descending, stable on ties).
    most_level = n_pat + 1
    for lvl, _cnt in Counter(levels).most_common():
        if lvl <= n_pat:
            most_level = lvl
            break
    return most_level, levels
def not_title(txt):
    """Return a truthy value when *txt* does not look like a title line.

    Returns False for statute-style article headers ("第...条"), True for
    overly long lines, otherwise the re.search result (a match object when
    sentence punctuation is present, else None).
    """
    # Chinese statute article numbering ("第三十二条") is always a title.
    if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
        return False
    word_count = len(txt.split(" "))
    # Too many words, or one unbroken run of >= 32 chars: not a title.
    if word_count > 12 or (" " not in txt and len(txt) >= 32):
        return True
    # Titles rarely carry sentence punctuation (CJK or ASCII).
    return re.search(r"[,;,。;!!]", txt)
def hierarchical_merge(bull, sections, depth):
if not sections or bull < 0:
return []
@@ -185,12 +217,6 @@ def hierarchical_merge(bull, sections, depth):
bullets_size = len(BULLET_PATTERN[bull])
levels = [[] for _ in range(bullets_size + 2)]
def not_title(txt):
if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
return False
if len(txt.split(" ")) > 12 or (txt.find(" ") < 0 and len(txt) >= 32):
return True
return re.search(r"[,;,。;!!]", txt)
for i, (txt, layout) in enumerate(sections):
for j, p in enumerate(BULLET_PATTERN[bull]):

View File

@@ -38,7 +38,7 @@ class EsQueryer:
"",
txt)
return re.sub(
r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was)*", "", txt, re.IGNORECASE)
r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was|to)*", "", txt, re.IGNORECASE)
def question(self, txt, tbl="qa", min_match="60%"):
txt = re.sub(
@@ -50,16 +50,16 @@ class EsQueryer:
txt = EsQueryer.rmWWW(txt)
if not self.isChinese(txt):
tks = txt.split(" ")
q = []
tks = [t for t in txt.split(" ") if t.strip()]
q = tks
for i in range(1, len(tks)):
q.append("\"%s %s\"~2" % (tks[i - 1], tks[i]))
q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
if not q:
q.append(txt)
return Q("bool",
must=Q("query_string", fields=self.flds,
type="best_fields", query=" OR ".join(q),
boost=1, minimum_should_match="60%")
boost=1, minimum_should_match=min_match)
), txt.split(" ")
def needQieqie(tk):
@@ -147,7 +147,7 @@ class EsQueryer:
atks = toDict(atks)
btkss = [toDict(tks) for tks in btkss]
tksim = [self.similarity(atks, btks) for btks in btkss]
return np.array(sims[0]) * vtweight + np.array(tksim) * tkweight, sims[0], tksim
return np.array(sims[0]) * vtweight + np.array(tksim) * tkweight, tksim, sims[0]
def similarity(self, qtwt, dtwt):
if isinstance(dtwt, type("")):

View File

@@ -119,6 +119,7 @@ class Dealer:
s["knn"]["filter"] = bqry.to_dict()
s["knn"]["similarity"] = 0.17
res = self.es.search(s, idxnm=idxnm, timeout="600s", src=src)
es_logger.info("【Q】: {}".format(json.dumps(s)))
kwds = set([])
for k in keywords: