refine for English corpus (#135)

Author: KevinHuSh
Committed by: GitHub
Date: 2024-03-20 16:56:16 +08:00
Parent: 78727c8809
Commit: 6999598101
12 changed files with 216 additions and 125 deletions

View File

@@ -30,19 +30,6 @@ class Pdf(PdfParser):
# print(b)
print("OCR:", timer()-start)
-        def get_position(bx):
-            poss = []
-            pn = bx["page_number"]
-            top = bx["top"] - self.page_cum_height[pn - 1]
-            bott = bx["bottom"] - self.page_cum_height[pn - 1]
-            poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn-1].size[1]/zoomin)))
-            while bott * zoomin > self.page_images[pn - 1].size[1]:
-                bott -= self.page_images[pn- 1].size[1] / zoomin
-                top = 0
-                pn += 1
-                poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / zoomin)))
-            return poss
def tag(pn, left, right, top, bottom):
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
.format(pn, left, right, top, bottom)
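
Note: the removed get_position helper was evidently promoted to a method on PdfParser itself; the later call site self.get_position(b, zoomin) supports that reading. For reference, a self-contained sketch of the logic being relocated, with page_cum_height, page_images and zoomin passed explicitly instead of read off self:

def get_position(bx, page_cum_height, page_images, zoomin):
    # A box's top/bottom arrive in coordinates cumulative over all pages,
    # so first rebase them onto the box's own page.
    poss = []
    pn = bx["page_number"]
    top = bx["top"] - page_cum_height[pn - 1]
    bott = bx["bottom"] - page_cum_height[pn - 1]
    page_h = page_images[pn - 1].size[1] / zoomin  # rendered pixels -> PDF units
    poss.append((pn, bx["x0"], bx["x1"], top, min(bott, page_h)))
    # A box that runs past the page bottom is split: the remainder
    # continues from the top of the next page.
    while bott * zoomin > page_images[pn - 1].size[1]:
        bott -= page_images[pn - 1].size[1] / zoomin
        top = 0
        pn += 1
        poss.append((pn, bx["x0"], bx["x1"], top,
                     min(bott, page_images[pn - 1].size[1] / zoomin)))
    return poss
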
@@ -54,7 +41,7 @@ class Pdf(PdfParser):
callback(0.67, "Table analysis finished.")
self._text_merge()
tbls = self._extract_table_figure(True, zoomin, True, True)
-        self._naive_vertical_merge()
+        self._concat_downward()
self._filter_forpages()
callback(0.68, "Text merging finished")
@@ -74,7 +61,7 @@ class Pdf(PdfParser):
sec_ids.append(sid)
#print(lvl, self.boxes[i]["text"], most_level)
sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)]
sections = [(b["text"], sec_ids[i], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
for (img, rows), poss in tbls:
sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0]+1-from_page, p[1], p[2], p[3], p[4]) for p in poss]))
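
Note: each section carries its coordinates twice, as (page, x0, x1, top, bottom) tuples and, via tag() above, as an inline "@@...##" marker appended to the text, which later stages strip (pdf_parser.remove_tag in the paper parser below) or parse back to crop thumbnails. A tiny illustration; the stripping regex is an assumption, only the marker format comes from tag():

import re

marker = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(3, 36.0, 540.0, 72.5, 90.3)
txt = "1.2 Results" + marker
# Strip the positional marker to recover plain text (illustrative regex).
print(re.sub(r"@@[0-9.\t-]+##", "", txt))  # -> 1.2 Results
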

View File

@@ -14,7 +14,7 @@ import copy
import re
from rag.app import laws
from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions
-from deepdoc.parser import PdfParser
+from deepdoc.parser import PdfParser, ExcelParser
from rag.settings import cron_logger
@@ -74,6 +74,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
res = tokenize_table(tbls, doc, eng)
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = ExcelParser()
sections = [(excel_parser.html(binary), "")]
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = ""

View File

@@ -15,7 +15,7 @@ import re
from collections import Counter
from api.db import ParserType
-from rag.nlp import huqie, tokenize, tokenize_table, add_positions
+from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency
from deepdoc.parser import PdfParser
import numpy as np
from rag.utils import num_tokens_from_string
@@ -46,11 +46,11 @@ class Pdf(PdfParser):
self._table_transformer_job(zoomin)
callback(0.68, "Table analysis finished")
self._text_merge()
-        tbls = self._extract_table_figure(True, zoomin, True, True)
column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
-        self._concat_downward(concat_between_pages=False)
+        self._concat_downward()
self._filter_forpages()
callback(0.75, "Text merging finished.")
+        tbls = self._extract_table_figure(True, zoomin, True, True)
# clean mess
if column_width < self.page_images[0].size[0] / zoomin / 2:
@@ -59,24 +59,24 @@ class Pdf(PdfParser):
self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
for b in self.boxes:
b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
-        freq = Counter([b["text"] for b in self.boxes])
-        garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
-        i = 0
-        while i < len(self.boxes):
-            if self.boxes[i]["text"] in garbage \
-                    or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
-                    or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
-                self.boxes.pop(i)
-            elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", '1'):
-                # merge within same layouts
-                self.boxes[i + 1]["top"] = self.boxes[i]["top"]
-                self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
-                self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
-                self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
-                self.boxes.pop(i)
-            else:
-                i += 1
+        # freq = Counter([b["text"] for b in self.boxes])
+        # garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
+        # i = 0
+        # while i < len(self.boxes):
+        #     if self.boxes[i]["text"] in garbage \
+        #             or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
+        #             or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
+        #         self.boxes.pop(i)
+        #     elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", '1'):
+        #         # merge within same layouts
+        #         self.boxes[i + 1]["top"] = self.boxes[i]["top"]
+        #         self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
+        #         self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
+        #         self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
+        #         self.boxes.pop(i)
+        #     else:
+        #         i += 1
def _begin(txt):
return re.match(
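
Note: the pass commented out above is a running-header/footer filter plus a same-layout merger: any line whose exact text recurs on more than 60% of pages is dropped, as are unlabeled bare alphanumerics and immediate duplicates. It is presumably too aggressive for English papers, where short repeated labels are legitimate. The frequency heuristic in isolation, on toy data:

from collections import Counter

def drop_repeating_lines(texts, total_pages, ratio=0.6):
    # Text that recurs on more than `ratio` of all pages is almost
    # certainly a running header or footer.
    freq = Counter(texts)
    garbage = {t for t, n in freq.items() if n > total_pages * ratio}
    return [t for t in texts if t not in garbage]

texts = ["ACME Corp", "1 Introduction", "ACME Corp", "Methods", "ACME Corp"]
print(drop_repeating_lines(texts, total_pages=3))
# ['1 Introduction', 'Methods'] -- "ACME Corp" appears on every page
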
@@ -88,7 +88,7 @@ class Pdf(PdfParser):
"title":"",
"authors": "",
"abstract": "",
"lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if
re.match(r"(text|title)", b.get("layoutno", "text"))],
"tables": tbls
}
@@ -119,11 +119,10 @@ class Pdf(PdfParser):
if re.match("(abstract|摘要)", txt):
if len(txt.split(" ")) > 32 or len(txt) > 64:
abstr = txt + self._line_tag(b, zoomin)
-                i += 1
break
-            txt = self.boxes[i + 1]["text"].lower().strip()
+            txt = self.boxes[i]["text"].lower().strip()
if len(txt.split(" ")) > 32 or len(txt) > 64:
-                abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
+                abstr = txt + self._line_tag(self.boxes[i], zoomin)
i += 1
break
if not abstr: i = 0
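
Note: the reindexing in this hunk (boxes[i + 1] -> boxes[i], dropping the extra i += 1) suggests the surrounding loop already advances i past the matched heading before this point. The intended behavior, as a standalone sketch:

import re

def find_abstract(boxes):
    # Locate the "abstract"/摘要 heading; if the heading box is not itself
    # long enough to be the body, take the next sufficiently long box.
    for i, b in enumerate(boxes):
        txt = b["text"].lower().strip()
        if not re.match(r"(abstract|摘要)", txt):
            continue
        if len(txt.split(" ")) > 32 or len(txt) > 64:
            return txt                       # heading and body in one box
        if i + 1 < len(boxes):
            nxt = boxes[i + 1]["text"].lower().strip()
            if len(nxt.split(" ")) > 32 or len(nxt) > 64:
                return nxt                   # body follows the heading
    return ""
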
@@ -136,7 +135,7 @@ class Pdf(PdfParser):
"title": title if title else filename,
"authors": " ".join(authors),
"abstract": abstr,
"lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
re.match(r"(text|title)", b.get("layoutno", "text"))],
"tables": tbls
}
@@ -153,7 +152,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
else: raise NotImplementedError("file type not supported yet(pdf supported)")
doc = {"docnm_kwd": filename, "authors_tks": paper["authors"],
doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
"title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
@@ -173,6 +173,38 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
tokenize(d, txt, eng)
res.append(d)
+    sorted_sections = paper["sections"]
+    # set pivot using the most frequent type of title,
+    # then merge between 2 pivot
+    bull = bullets_category([txt for txt, _ in sorted_sections])
+    most_level, levels = title_frequency(bull, sorted_sections)
+    assert len(sorted_sections) == len(levels)
+    sec_ids = []
+    sid = 0
+    for i, lvl in enumerate(levels):
+        if lvl <= most_level and i > 0 and lvl != levels[i - 1]: sid += 1
+        sec_ids.append(sid)
+        print(lvl, sorted_sections[i][0], most_level, sid)
+    chunks = []
+    last_sid = -2
+    for (txt, _), sec_id in zip(sorted_sections, sec_ids):
+        if sec_id == last_sid:
+            if chunks:
+                chunks[-1] += "\n" + txt
+            continue
+        chunks.append(txt)
+        last_sid = sec_id
+    for txt in chunks:
+        d = copy.deepcopy(doc)
+        d["image"], poss = pdf_parser.crop(txt, need_position=True)
+        add_positions(d, poss)
+        tokenize(d, pdf_parser.remove_tag(txt), eng)
+        res.append(d)
+        print("----------------------\n", pdf_parser.remove_tag(txt))
+    return res
readed = [0] * len(paper["lines"])
# find colon firstly
i = 0
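
Note: the added block supersedes the colon-driven line scanner that follows (the early return res above short-circuits it): bullets_category guesses the numbering scheme of the section titles, title_frequency picks the most frequent title level as the pivot, and consecutive sections between two pivot titles collapse into one chunk. The merge step in isolation, on toy data:

def merge_by_pivot(sections, levels, most_level):
    # A new chunk starts whenever a title at or above the pivot level
    # begins a new run (mirrors the sec_ids/chunks logic above).
    sec_ids, sid = [], 0
    for i, lvl in enumerate(levels):
        if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
            sid += 1
        sec_ids.append(sid)
    chunks, last = [], None
    for txt, sec_id in zip(sections, sec_ids):
        if sec_id == last:
            chunks[-1] += "\n" + txt
        else:
            chunks.append(txt)
            last = sec_id
    return chunks

sections = ["1 Intro", "body a", "body b", "2 Method", "body c"]
levels   = [0, 9, 9, 0, 9]   # 0 = numbered title, 9 = plain text
print(merge_by_pivot(sections, levels, most_level=0))
# ['1 Intro\nbody a\nbody b', '2 Method\nbody c']
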
@@ -252,6 +284,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
if __name__ == "__main__":
import sys
-    def dummy(a, b):
+    def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)
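
Note: the CLI smoke test now matches the signature the parsers actually invoke, a progress fraction plus a message (e.g. callback(0.67, "Table analysis finished.")); some call sites pass only one of the two, hence the defaults. A slightly more useful stand-in with the same shape:

def progress(prog=None, msg=""):
    # Both arguments are optional at various call sites.
    if prog is not None:
        print("{:>5.0%}".format(prog), end=" ")
    print(msg)

chunk(sys.argv[1], callback=progress)
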

View File

@@ -16,7 +16,7 @@ from io import BytesIO
from nltk import word_tokenize
from openpyxl import load_workbook
from rag.nlp import is_english, random_choices
-from rag.nlp import huqie, stemmer
+from rag.nlp import huqie
from deepdoc.parser import ExcelParser
@@ -73,12 +73,8 @@ def beAdoc(d, q, a, eng):
aprefix = "Answer: " if eng else "回答:"
d["content_with_weight"] = "\t".join(
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
-    if eng:
-        d["content_ltks"] = " ".join([stemmer.stem(w)
-                                      for w in word_tokenize(q)])
-    else:
-        d["content_ltks"] = huqie.qie(q)
-    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    d["content_ltks"] = huqie.qie(q)
+    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
return d
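
Note: with the English-only stemming branch gone, huqie.qie is the single tokenizer for both languages (in line with the commit's goal, it presumably now handles English itself), so Q/A records are built uniformly. Shape of the result (rmPrefix omitted here; token strings depend on huqie):

q, a = "What is RAG?", "Retrieval-augmented generation."
d = {"content_with_weight": "\t".join(["Question: " + q, "Answer: " + a])}
d["content_ltks"] = huqie.qie(q)                     # same path for eng and zh now
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
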

View File

@@ -74,9 +74,9 @@ def trans_datatime(s):
def trans_bool(s):
if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
return ["yes", ""]
return "yes"
if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
return ["no", ""]
return "no"
def column_data_type(arr):
@@ -92,7 +92,7 @@ def column_data_type(arr):
counts["int"] += 1
elif re.match(r"[+-]?[0-9.]+$", str(a).replace("%%", "")):
counts["float"] += 1
elif re.match(r"(true|false|yes|no|是|否)$", str(a), flags=re.IGNORECASE):
elif re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√|false|no|否|⍻|×)$", str(a), flags=re.IGNORECASE):
counts["bool"] += 1
elif trans_datatime(str(a)):
counts["datetime"] += 1