refine manual parser (#131)
@@ -1,4 +1,6 @@
 import random
+from collections import Counter
+
 from rag.utils import num_tokens_from_string
 from . import huqie
 from nltk import word_tokenize
@@ -175,6 +177,36 @@ def make_colon_as_title(sections):
         i += 1


+def title_frequency(bull, sections):
+    bullets_size = len(BULLET_PATTERN[bull])
+    levels = [bullets_size+1 for _ in range(len(sections))]
+    if not sections or bull < 0:
+        return bullets_size+1, levels
+
+    for i, (txt, layout) in enumerate(sections):
+        for j, p in enumerate(BULLET_PATTERN[bull]):
+            if re.match(p, txt.strip()):
+                levels[i] = j
+                break
+        else:
+            if re.search(r"(title|head)", layout) and not not_title(txt.split("@")[0]):
+                levels[i] = bullets_size
+    most_level = bullets_size+1
+    for l, c in sorted(Counter(levels).items(), key=lambda x: x[1]*-1):
+        if l <= bullets_size:
+            most_level = l
+            break
+    return most_level, levels
+
+
+def not_title(txt):
+    if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
+        return False
+    if len(txt.split(" ")) > 12 or (txt.find(" ") < 0 and len(txt) >= 32):
+        return True
+    return re.search(r"[,;,。;!!]", txt)
+
+
 def hierarchical_merge(bull, sections, depth):
     if not sections or bull < 0:
         return []
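Note on the new helper above: `title_frequency` assigns each section the index of the first bullet pattern it matches (sections whose layout tag contains `title`/`head` and that pass the `not_title` filter fall back to level `bullets_size`), then returns the most frequent real title level together with the per-section levels. A minimal usage sketch — the pattern table and section texts here are invented, assuming `BULLET_PATTERN[0]` matches `第…条`-style headings:

```python
# Sketch only: BULLET_PATTERN contents and the sections are made up.
sections = [
    ("第一条 总则", "title"),       # matches bullet pattern index 0
    ("本办法适用于……", "text"),     # matches nothing, stays bullets_size + 1
    ("第二条 适用范围", "title"),    # matches bullet pattern index 0
]
most_level, levels = title_frequency(0, sections)
# most_level == 0: the most common level at or below bullets_size
# levels == [0, bullets_size + 1, 0]
```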
@@ -185,12 +217,6 @@ def hierarchical_merge(bull, sections, depth):
     bullets_size = len(BULLET_PATTERN[bull])
     levels = [[] for _ in range(bullets_size + 2)]

-    def not_title(txt):
-        if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
-            return False
-        if len(txt.split(" ")) > 12 or (txt.find(" ") < 0 and len(txt) >= 32):
-            return True
-        return re.search(r"[,;,。;!!]", txt)
-
     for i, (txt, layout) in enumerate(sections):
         for j, p in enumerate(BULLET_PATTERN[bull]):
@@ -38,7 +38,7 @@ class EsQueryer:
             "",
             txt)
         return re.sub(
-            r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was)*", "", txt, re.IGNORECASE)
+            r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was|to)*", "", txt, re.IGNORECASE)

     def question(self, txt, tbl="qa", min_match="60%"):
         txt = re.sub(
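Note on the `rmWWW` hunk above: adding `to` to the trailing group lets the pattern also swallow phrasings like "where to". In both the old and new line, though, `re.IGNORECASE` is passed as `re.sub`'s fourth positional argument, which is `count`, not `flags`, so matching stays case-sensitive and is silently capped at 2 replacements (`int(re.IGNORECASE)`). A sketch of the presumably intended call:

```python
import re

# flags must be passed by keyword; positionally, re.sub's fourth
# argument is count, not flags
txt = re.sub(
    r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was|to)*",
    "", txt, flags=re.IGNORECASE)
```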
@@ -50,16 +50,16 @@ class EsQueryer:
         txt = EsQueryer.rmWWW(txt)

         if not self.isChinese(txt):
-            tks = txt.split(" ")
-            q = []
+            tks = [t for t in txt.split(" ") if t.strip()]
+            q = tks
             for i in range(1, len(tks)):
-                q.append("\"%s %s\"~2" % (tks[i - 1], tks[i]))
+                q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
             if not q:
                 q.append(txt)
             return Q("bool",
                      must=Q("query_string", fields=self.flds,
                             type="best_fields", query=" OR ".join(q),
-                            boost=1, minimum_should_match="60%")
+                            boost=1, minimum_should_match=min_match)
                      ), txt.split(" ")

     def needQieqie(tk):
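Note on the `question` hunk above: blank tokens are now filtered out, the single tokens themselves seed `q` (so a one-token query no longer yields an empty clause list), adjacent pairs become boosted exact phrases (`^2`) instead of slop-2 proximity phrases (`~2`), and the caller-supplied `min_match` replaces the hard-coded `"60%"`. A sketch of the query string this builds, for an invented English input:

```python
tks = [t for t in "deep learning parser".split(" ") if t.strip()]
q = list(tks)  # the diff writes q = tks; the aliasing is harmless because
               # range(1, len(tks)) fixes its bound before the appends
for i in range(1, len(tks)):
    q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
print(" OR ".join(q))
# deep OR learning OR parser OR "deep learning"^2 OR "learning parser"^2
```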
@@ -147,7 +147,7 @@ class EsQueryer:
         atks = toDict(atks)
         btkss = [toDict(tks) for tks in btkss]
         tksim = [self.similarity(atks, btks) for btks in btkss]
-        return np.array(sims[0]) * vtweight + np.array(tksim) * tkweight, sims[0], tksim
+        return np.array(sims[0]) * vtweight + np.array(tksim) * tkweight, tksim, sims[0]

     def similarity(self, qtwt, dtwt):
         if isinstance(dtwt, type("")):
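Note on the hybrid-similarity hunk above: only the tuple order changes — callers now receive `(fused, token_sims, vector_sims)` instead of `(fused, vector_sims, token_sims)`; the fused score itself is still the weighted sum of vector and token similarities. A toy illustration with invented weights and scores:

```python
import numpy as np

tkweight, vtweight = 0.3, 0.7   # invented weights for this sketch
vec_sims = [0.80, 0.40]         # per-candidate vector (cosine) similarity
tok_sims = [0.50, 0.90]         # per-candidate token-overlap similarity
fused = np.array(vec_sims) * vtweight + np.array(tok_sims) * tkweight
# fused == array([0.71, 0.55]); returned as (fused, tok_sims, vec_sims)
```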
@@ -119,6 +119,7 @@ class Dealer:
         s["knn"]["filter"] = bqry.to_dict()
         s["knn"]["similarity"] = 0.17
         res = self.es.search(s, idxnm=idxnm, timeout="600s", src=src)
+        es_logger.info("【Q】: {}".format(json.dumps(s)))

         kwds = set([])
         for k in keywords:
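Note on the `Dealer` hunk above: the added line logs the full Elasticsearch request body (including the `knn` filter and `similarity` cutoff set just above) right after the search call, which makes slow or empty queries easy to replay. A sketch of the shape that gets logged; every field except `filter` and `similarity` (which appear in the diff) is an assumption:

```python
import json

s = {"knn": {"field": "q_vec",            # assumed vector field name
             "query_vector": [0.1, 0.2],  # placeholder embedding
             "k": 10, "num_candidates": 100,
             "filter": {"bool": {"must": []}},
             "similarity": 0.17}}
print("【Q】: {}".format(json.dumps(s, ensure_ascii=False)))
```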