refine for English corpus (#135)

Author: KevinHuSh
Committed by: GitHub
Date: 2024-03-20 16:56:16 +08:00
Parent: 78727c8809
Commit: 6999598101
12 changed files with 216 additions and 125 deletions

View File

@@ -30,19 +30,6 @@ class Pdf(PdfParser):
# print(b)
print("OCR:", timer()-start)
-        def get_position(bx):
-            poss = []
-            pn = bx["page_number"]
-            top = bx["top"] - self.page_cum_height[pn - 1]
-            bott = bx["bottom"] - self.page_cum_height[pn - 1]
-            poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn-1].size[1]/zoomin)))
-            while bott * zoomin > self.page_images[pn - 1].size[1]:
-                bott -= self.page_images[pn- 1].size[1] / zoomin
-                top = 0
-                pn += 1
-                poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / zoomin)))
-            return poss
def tag(pn, left, right, top, bottom):
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
.format(pn, left, right, top, bottom)
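
Note: the removed get_position helper was evidently promoted to a method on PdfParser itself; the later call site self.get_position(b, zoomin) supports that reading. For reference, a self-contained sketch of the logic being relocated, with page_cum_height, page_images and zoomin passed explicitly instead of read off self:

def get_position(bx, page_cum_height, page_images, zoomin):
    # A box's top/bottom arrive in coordinates cumulative over all pages,
    # so first rebase them onto the box's own page.
    poss = []
    pn = bx["page_number"]
    top = bx["top"] - page_cum_height[pn - 1]
    bott = bx["bottom"] - page_cum_height[pn - 1]
    page_h = page_images[pn - 1].size[1] / zoomin  # rendered pixels -> PDF units
    poss.append((pn, bx["x0"], bx["x1"], top, min(bott, page_h)))
    # A box that runs past the page bottom is split: the remainder
    # continues from the top of the next page.
    while bott * zoomin > page_images[pn - 1].size[1]:
        bott -= page_images[pn - 1].size[1] / zoomin
        top = 0
        pn += 1
        poss.append((pn, bx["x0"], bx["x1"], top,
                     min(bott, page_images[pn - 1].size[1] / zoomin)))
    return poss
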
@@ -54,7 +41,7 @@ class Pdf(PdfParser):
callback(0.67, "Table analysis finished.")
self._text_merge()
tbls = self._extract_table_figure(True, zoomin, True, True)
-        self._naive_vertical_merge()
+        self._concat_downward()
self._filter_forpages()
callback(0.68, "Text merging finished")
@@ -74,7 +61,7 @@ class Pdf(PdfParser):
sec_ids.append(sid)
#print(lvl, self.boxes[i]["text"], most_level)
sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)]
sections = [(b["text"], sec_ids[i], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
for (img, rows), poss in tbls:
sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0]+1-from_page, p[1], p[2], p[3], p[4]) for p in poss]))
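
Note: each section carries its coordinates twice, as (page, x0, x1, top, bottom) tuples and, via tag() above, as an inline "@@...##" marker appended to the text, which later stages strip (pdf_parser.remove_tag in the paper parser below) or parse back to crop thumbnails. A tiny illustration; the stripping regex is an assumption, only the marker format comes from tag():

import re

marker = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(3, 36.0, 540.0, 72.5, 90.3)
txt = "1.2 Results" + marker
# Strip the positional marker to recover plain text (illustrative regex).
print(re.sub(r"@@[0-9.\t-]+##", "", txt))  # -> 1.2 Results
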

View File

@@ -14,7 +14,7 @@ import copy
import re
from rag.app import laws
from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions
-from deepdoc.parser import PdfParser
+from deepdoc.parser import PdfParser, ExcelParser
from rag.settings import cron_logger
@@ -74,6 +74,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
res = tokenize_table(tbls, doc, eng)
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = ExcelParser()
sections = [(excel_parser.html(binary), "")]
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = ""

View File

@@ -15,7 +15,7 @@ import re
from collections import Counter
from api.db import ParserType
-from rag.nlp import huqie, tokenize, tokenize_table, add_positions
+from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency
from deepdoc.parser import PdfParser
import numpy as np
from rag.utils import num_tokens_from_string
@@ -46,11 +46,11 @@ class Pdf(PdfParser):
self._table_transformer_job(zoomin)
callback(0.68, "Table analysis finished")
self._text_merge()
-        tbls = self._extract_table_figure(True, zoomin, True, True)
column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
-        self._concat_downward(concat_between_pages=False)
+        self._concat_downward()
self._filter_forpages()
callback(0.75, "Text merging finished.")
+        tbls = self._extract_table_figure(True, zoomin, True, True)
# clean mess
if column_width < self.page_images[0].size[0] / zoomin / 2:
@@ -59,24 +59,24 @@ class Pdf(PdfParser):
self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
for b in self.boxes:
b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
-        freq = Counter([b["text"] for b in self.boxes])
-        garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
-        i = 0
-        while i < len(self.boxes):
-            if self.boxes[i]["text"] in garbage \
-                    or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
-                    or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
-                self.boxes.pop(i)
-            elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", '1'):
-                # merge within same layouts
-                self.boxes[i + 1]["top"] = self.boxes[i]["top"]
-                self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
-                self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
-                self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
-                self.boxes.pop(i)
-            else:
-                i += 1
+        # freq = Counter([b["text"] for b in self.boxes])
+        # garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
+        # i = 0
+        # while i < len(self.boxes):
+        #     if self.boxes[i]["text"] in garbage \
+        #             or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
+        #             or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
+        #         self.boxes.pop(i)
+        #     elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", '1'):
+        #         # merge within same layouts
+        #         self.boxes[i + 1]["top"] = self.boxes[i]["top"]
+        #         self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
+        #         self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
+        #         self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
+        #         self.boxes.pop(i)
+        #     else:
+        #         i += 1
def _begin(txt):
return re.match(
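
Note: the pass commented out above is a running-header/footer filter plus a same-layout merger: any line whose exact text recurs on more than 60% of pages is dropped, as are unlabeled bare alphanumerics and immediate duplicates. It is presumably too aggressive for English papers, where short repeated labels are legitimate. The frequency heuristic in isolation, on toy data:

from collections import Counter

def drop_repeating_lines(texts, total_pages, ratio=0.6):
    # Text that recurs on more than `ratio` of all pages is almost
    # certainly a running header or footer.
    freq = Counter(texts)
    garbage = {t for t, n in freq.items() if n > total_pages * ratio}
    return [t for t in texts if t not in garbage]

texts = ["ACME Corp", "1 Introduction", "ACME Corp", "Methods", "ACME Corp"]
print(drop_repeating_lines(texts, total_pages=3))
# ['1 Introduction', 'Methods'] -- "ACME Corp" appears on every page
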
@@ -88,7 +88,7 @@ class Pdf(PdfParser):
"title":"",
"authors": "",
"abstract": "",
"lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if
re.match(r"(text|title)", b.get("layoutno", "text"))],
"tables": tbls
}
@@ -119,11 +119,10 @@ class Pdf(PdfParser):
if re.match("(abstract|摘要)", txt):
if len(txt.split(" ")) > 32 or len(txt) > 64:
abstr = txt + self._line_tag(b, zoomin)
-                i += 1
break
-            txt = self.boxes[i + 1]["text"].lower().strip()
+            txt = self.boxes[i]["text"].lower().strip()
if len(txt.split(" ")) > 32 or len(txt) > 64:
-                abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
+                abstr = txt + self._line_tag(self.boxes[i], zoomin)
i += 1
break
if not abstr: i = 0
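
Note: the reindexing in this hunk (boxes[i + 1] -> boxes[i], dropping the extra i += 1) suggests the surrounding loop already advances i past the matched heading before this point. The intended behavior, as a standalone sketch:

import re

def find_abstract(boxes):
    # Locate the "abstract"/摘要 heading; if the heading box is not itself
    # long enough to be the body, take the next sufficiently long box.
    for i, b in enumerate(boxes):
        txt = b["text"].lower().strip()
        if not re.match(r"(abstract|摘要)", txt):
            continue
        if len(txt.split(" ")) > 32 or len(txt) > 64:
            return txt                       # heading and body in one box
        if i + 1 < len(boxes):
            nxt = boxes[i + 1]["text"].lower().strip()
            if len(nxt.split(" ")) > 32 or len(nxt) > 64:
                return nxt                   # body follows the heading
    return ""
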
@@ -136,7 +135,7 @@ class Pdf(PdfParser):
"title": title if title else filename,
"authors": " ".join(authors),
"abstract": abstr,
"lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
re.match(r"(text|title)", b.get("layoutno", "text"))],
"tables": tbls
}
@@ -153,7 +152,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
else: raise NotImplementedError("file type not supported yet(pdf supported)")
doc = {"docnm_kwd": filename, "authors_tks": paper["authors"],
doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
"title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
@@ -173,6 +173,38 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
tokenize(d, txt, eng)
res.append(d)
+    sorted_sections = paper["sections"]
+    # set pivot using the most frequent type of title,
+    # then merge between 2 pivot
+    bull = bullets_category([txt for txt, _ in sorted_sections])
+    most_level, levels = title_frequency(bull, sorted_sections)
+    assert len(sorted_sections) == len(levels)
+    sec_ids = []
+    sid = 0
+    for i, lvl in enumerate(levels):
+        if lvl <= most_level and i > 0 and lvl != levels[i - 1]: sid += 1
+        sec_ids.append(sid)
+        print(lvl, sorted_sections[i][0], most_level, sid)
+    chunks = []
+    last_sid = -2
+    for (txt, _), sec_id in zip(sorted_sections, sec_ids):
+        if sec_id == last_sid:
+            if chunks:
+                chunks[-1] += "\n" + txt
+            continue
+        chunks.append(txt)
+        last_sid = sec_id
+    for txt in chunks:
+        d = copy.deepcopy(doc)
+        d["image"], poss = pdf_parser.crop(txt, need_position=True)
+        add_positions(d, poss)
+        tokenize(d, pdf_parser.remove_tag(txt), eng)
+        res.append(d)
+        print("----------------------\n", pdf_parser.remove_tag(txt))
+    return res
readed = [0] * len(paper["lines"])
# find colon firstly
i = 0
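
Note: the added block supersedes the colon-driven line scanner that follows (the early return res above short-circuits it): bullets_category guesses the numbering scheme of the section titles, title_frequency picks the most frequent title level as the pivot, and consecutive sections between two pivot titles collapse into one chunk. The merge step in isolation, on toy data:

def merge_by_pivot(sections, levels, most_level):
    # A new chunk starts whenever a title at or above the pivot level
    # begins a new run (mirrors the sec_ids/chunks logic above).
    sec_ids, sid = [], 0
    for i, lvl in enumerate(levels):
        if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
            sid += 1
        sec_ids.append(sid)
    chunks, last = [], None
    for txt, sec_id in zip(sections, sec_ids):
        if sec_id == last:
            chunks[-1] += "\n" + txt
        else:
            chunks.append(txt)
            last = sec_id
    return chunks

sections = ["1 Intro", "body a", "body b", "2 Method", "body c"]
levels   = [0, 9, 9, 0, 9]   # 0 = numbered title, 9 = plain text
print(merge_by_pivot(sections, levels, most_level=0))
# ['1 Intro\nbody a\nbody b', '2 Method\nbody c']
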
@@ -252,6 +284,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
if __name__ == "__main__":
import sys
-    def dummy(a, b):
+    def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)
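
Note: the CLI smoke test now matches the signature the parsers actually invoke, a progress fraction plus a message (e.g. callback(0.67, "Table analysis finished.")); some call sites pass only one of the two, hence the defaults. A slightly more useful stand-in with the same shape:

def progress(prog=None, msg=""):
    # Both arguments are optional at various call sites.
    if prog is not None:
        print("{:>5.0%}".format(prog), end=" ")
    print(msg)

chunk(sys.argv[1], callback=progress)
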

View File

@@ -16,7 +16,7 @@ from io import BytesIO
from nltk import word_tokenize
from openpyxl import load_workbook
from rag.nlp import is_english, random_choices
-from rag.nlp import huqie, stemmer
+from rag.nlp import huqie
from deepdoc.parser import ExcelParser
@@ -73,12 +73,8 @@ def beAdoc(d, q, a, eng):
aprefix = "Answer: " if eng else "回答:"
d["content_with_weight"] = "\t".join(
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
-    if eng:
-        d["content_ltks"] = " ".join([stemmer.stem(w)
-                                      for w in word_tokenize(q)])
-    else:
-        d["content_ltks"] = huqie.qie(q)
-    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    d["content_ltks"] = huqie.qie(q)
+    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
return d
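
Note: with the English-only stemming branch gone, huqie.qie is the single tokenizer for both languages (in line with the commit's goal, it presumably now handles English itself), so Q/A records are built uniformly. Shape of the result (rmPrefix omitted here; token strings depend on huqie):

q, a = "What is RAG?", "Retrieval-augmented generation."
d = {"content_with_weight": "\t".join(["Question: " + q, "Answer: " + a])}
d["content_ltks"] = huqie.qie(q)                     # same path for eng and zh now
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
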

View File

@@ -74,9 +74,9 @@ def trans_datatime(s):
def trans_bool(s):
if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
return ["yes", ""]
return "yes"
if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
return ["no", ""]
return "no"
def column_data_type(arr):
@@ -92,7 +92,7 @@ def column_data_type(arr):
counts["int"] += 1
elif re.match(r"[+-]?[0-9.]+$", str(a).replace("%%", "")):
counts["float"] += 1
elif re.match(r"(true|false|yes|no|是|否)$", str(a), flags=re.IGNORECASE):
elif re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√|false|no|否|⍻|×)$", str(a), flags=re.IGNORECASE):
counts["bool"] += 1
elif trans_datatime(str(a)):
counts["datetime"] += 1