refine manual parser (#131)

This commit is contained in:
KevinHuSh
2024-03-19 12:26:04 +08:00
committed by GitHub
parent d56c9e7630
commit 9da671b951
13 changed files with 145 additions and 52 deletions

View File

@@ -1,4 +1,6 @@
import random
from collections import Counter
from rag.utils import num_tokens_from_string
from . import huqie
from nltk import word_tokenize
@@ -175,6 +177,36 @@ def make_colon_as_title(sections):
i += 1
def title_frequency(bull, sections):
    """Estimate the dominant heading level across *sections*.

    Args:
        bull: index into the module-level BULLET_PATTERN table selecting the
            bullet/heading regex family to use; a negative value disables
            pattern matching.
        sections: list of (text, layout) pairs, where layout is a layout tag
            string (may contain "title"/"head").

    Returns:
        (most_level, levels): the most frequent recognized level and the
        per-section level list. Unrecognized sections get len(patterns) + 1;
        layout-detected titles get len(patterns).
    """
    patterns = BULLET_PATTERN[bull]
    n_pat = len(patterns)
    # Default every section to "no level recognized".
    levels = [n_pat + 1] * len(sections)
    if not sections or bull < 0:
        return n_pat + 1, levels

    for idx, (txt, layout) in enumerate(sections):
        stripped = txt.strip()
        matched = False
        for rank, pat in enumerate(patterns):
            if re.match(pat, stripped):
                levels[idx] = rank
                matched = True
                break
        # No bullet pattern matched: fall back to the layout tag, ignoring
        # any "@"-suffixed metadata on the text.
        if not matched and re.search(r"(title|head)", layout) \
                and not not_title(txt.split("@")[0]):
            levels[idx] = n_pat

    # Pick the most frequent level that is an actual pattern level
    # (most_common sorts by count descending, stable on ties).
    most_level = n_pat + 1
    for lvl, _cnt in Counter(levels).most_common():
        if lvl <= n_pat:
            most_level = lvl
            break
    return most_level, levels
def not_title(txt):
    """Return a truthy value when *txt* does not look like a title line.

    Returns False for statute-style article headers ("第...条"), True for
    overly long lines, otherwise the re.search result (a match object when
    sentence punctuation is present, else None).
    """
    # Chinese statute article numbering ("第三十二条") is always a title.
    if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
        return False
    word_count = len(txt.split(" "))
    # Too many words, or one unbroken run of >= 32 chars: not a title.
    if word_count > 12 or (" " not in txt and len(txt) >= 32):
        return True
    # Titles rarely carry sentence punctuation (CJK or ASCII).
    return re.search(r"[,;,。;!!]", txt)
def hierarchical_merge(bull, sections, depth):
if not sections or bull < 0:
return []
@@ -185,12 +217,6 @@ def hierarchical_merge(bull, sections, depth):
bullets_size = len(BULLET_PATTERN[bull])
levels = [[] for _ in range(bullets_size + 2)]
def not_title(txt):
if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
return False
if len(txt.split(" ")) > 12 or (txt.find(" ") < 0 and len(txt) >= 32):
return True
return re.search(r"[,;,。;!!]", txt)
for i, (txt, layout) in enumerate(sections):
for j, p in enumerate(BULLET_PATTERN[bull]):

View File

@@ -38,7 +38,7 @@ class EsQueryer:
"",
txt)
return re.sub(
r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was)*", "", txt, re.IGNORECASE)
r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was|to)*", "", txt, re.IGNORECASE)
def question(self, txt, tbl="qa", min_match="60%"):
txt = re.sub(
@@ -50,16 +50,16 @@ class EsQueryer:
txt = EsQueryer.rmWWW(txt)
if not self.isChinese(txt):
tks = txt.split(" ")
q = []
tks = [t for t in txt.split(" ") if t.strip()]
q = tks
for i in range(1, len(tks)):
q.append("\"%s %s\"~2" % (tks[i - 1], tks[i]))
q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
if not q:
q.append(txt)
return Q("bool",
must=Q("query_string", fields=self.flds,
type="best_fields", query=" OR ".join(q),
boost=1, minimum_should_match="60%")
boost=1, minimum_should_match=min_match)
), txt.split(" ")
def needQieqie(tk):
@@ -147,7 +147,7 @@ class EsQueryer:
atks = toDict(atks)
btkss = [toDict(tks) for tks in btkss]
tksim = [self.similarity(atks, btks) for btks in btkss]
return np.array(sims[0]) * vtweight + np.array(tksim) * tkweight, sims[0], tksim
return np.array(sims[0]) * vtweight + np.array(tksim) * tkweight, tksim, sims[0]
def similarity(self, qtwt, dtwt):
if isinstance(dtwt, type("")):

View File

@@ -119,6 +119,7 @@ class Dealer:
s["knn"]["filter"] = bqry.to_dict()
s["knn"]["similarity"] = 0.17
res = self.es.search(s, idxnm=idxnm, timeout="600s", src=src)
es_logger.info("【Q】: {}".format(json.dumps(s)))
kwds = set([])
for k in keywords: