use onnx models, new deepdoc (#68)

Author: KevinHuSh (committed by GitHub)
Date: 2024-02-21 16:32:38 +08:00
Parent: 8c4ec9955e
Commit: cacd36c5e1
26 changed files with 8730 additions and 136 deletions
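The same mechanical migration repeats across the chunker modules below: parser imports move from the old rag.parser package to the new deepdoc.parser package, the Hu-prefixed parser classes are renamed (HuParser becomes PdfParser, HuDocxParser becomes DocxParser), and the PaddleOCR-based layout step self._layouts_paddle() is replaced by the ONNX-backed self._layouts_rec(). A minimal before/after sketch of that pattern, using only names that appear in the hunks below:

```python
# Before this commit: Paddle-era imports and class names
# from rag.parser.pdf_parser import HuParser
# from rag.parser.docx_parser import HuDocxParser

# After this commit: names exported by the new deepdoc package (ONNX models)
from deepdoc.parser import PdfParser, DocxParser


class Pdf(PdfParser):      # was: class Pdf(HuParser)
    pass                   # chunker-specific __call__ overrides go here


class Docx(DocxParser):    # was: class Docx(HuDocxParser)
    pass
```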

View File

@@ -1,15 +1,24 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
import copy
import random
import re
import numpy as np
-from rag.parser import bullets_category, BULLET_PATTERN, is_english, tokenize, remove_contents_table, \
+from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices
from rag.nlp import huqie
-from rag.parser.docx_parser import HuDocxParser
-from rag.parser.pdf_parser import HuParser
+from deepdoc.parser import PdfParser, DocxParser
-class Pdf(HuParser):
+class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
self.__images__(
@@ -21,7 +30,7 @@ class Pdf(HuParser):
from timeit import default_timer as timer
start = timer()
-self._layouts_paddle(zoomin)
+self._layouts_rec(zoomin)
callback(0.47, "Layout analysis finished")
print("paddle layouts:", timer() - start)
self._table_transformer_job(zoomin)
@@ -53,7 +62,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
sections,tbls = [], []
if re.search(r"\.docx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
-doc_parser = HuDocxParser()
+doc_parser = DocxParser()
# TODO: table of contents need to be removed
sections, tbls = doc_parser(binary if binary else filename, from_page=from_page, to_page=to_page)
remove_contents_table(sections, eng=is_english(random_choices([t for t,_ in sections], k=200)))
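For orientation, a chunker with the chunk() signature shown in this hunk might be driven as below; the module name chunker_module and the print-based callback are stand-ins for illustration, not part of this diff:

```python
# Hypothetical driver for an entry point of the form
#   chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs)
# `chunker_module` stands in for the (unnamed) module in this hunk.
import chunker_module


def progress(prog=None, msg=""):
    # Chunkers report progress via callback(fraction, message); -1 signals failure.
    print(f"[{prog}] {msg}")


with open("sample.pdf", "rb") as f:
    chunks = chunker_module.chunk("sample.pdf", binary=f.read(),
                                  from_page=0, to_page=8, callback=progress)
```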

View File

@@ -1,16 +1,27 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
import copy
import re
from io import BytesIO
from docx import Document
-from rag.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
+from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
make_colon_as_title
from rag.nlp import huqie
-from rag.parser.docx_parser import HuDocxParser
-from rag.parser.pdf_parser import HuParser
+from deepdoc.parser import PdfParser, DocxParser
from rag.settings import cron_logger
-class Docx(HuDocxParser):
+class Docx(DocxParser):
def __init__(self):
pass
@@ -35,7 +46,7 @@ class Docx(HuDocxParser):
return [l for l in lines if l]
-class Pdf(HuParser):
+class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
self.__images__(
@@ -47,7 +58,7 @@ class Pdf(HuParser):
from timeit import default_timer as timer
start = timer()
-self._layouts_paddle(zoomin)
+self._layouts_rec(zoomin)
callback(0.77, "Layout analysis finished")
cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
self._naive_vertical_merge()

View File

@@ -1,12 +1,12 @@
import copy
import re
-from rag.parser import tokenize
+from deepdoc.parser import tokenize
from rag.nlp import huqie
-from rag.parser.pdf_parser import HuParser
+from deepdoc.parser import PdfParser
from rag.utils import num_tokens_from_string
-class Pdf(HuParser):
+class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
self.__images__(
@@ -18,7 +18,7 @@ class Pdf(HuParser):
from timeit import default_timer as timer
start = timer()
-self._layouts_paddle(zoomin)
+self._layouts_rec(zoomin)
callback(0.5, "Layout analysis finished.")
print("paddle layouts:", timer() - start)
self._table_transformer_job(zoomin)

View File

@@ -1,13 +1,25 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
import copy
import re
from rag.app import laws
-from rag.parser import is_english, tokenize, naive_merge
+from deepdoc.parser import is_english, tokenize, naive_merge
from rag.nlp import huqie
-from rag.parser.pdf_parser import HuParser
+from deepdoc.parser import PdfParser
from rag.settings import cron_logger
-class Pdf(HuParser):
+class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
self.__images__(
@@ -19,7 +31,7 @@ class Pdf(HuParser):
from timeit import default_timer as timer
start = timer()
-self._layouts_paddle(zoomin)
+self._layouts_rec(zoomin)
callback(0.77, "Layout analysis finished")
cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
self._naive_vertical_merge()

View File

@@ -1,16 +1,28 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
import copy
import re
from collections import Counter
from api.db import ParserType
-from rag.parser import tokenize
+from deepdoc.parser import tokenize
from rag.nlp import huqie
-from rag.parser.pdf_parser import HuParser
+from deepdoc.parser import PdfParser
import numpy as np
from rag.utils import num_tokens_from_string
-class Pdf(HuParser):
+class Pdf(PdfParser):
def __init__(self):
self.model_speciess = ParserType.PAPER.value
super().__init__()
@@ -26,7 +38,7 @@ class Pdf(HuParser):
from timeit import default_timer as timer
start = timer()
-self._layouts_paddle(zoomin)
+self._layouts_rec(zoomin)
callback(0.47, "Layout analysis finished")
print("paddle layouts:", timer() - start)
self._table_transformer_job(zoomin)

View File

@@ -1,11 +1,22 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
import copy
import re
from io import BytesIO
from pptx import Presentation
-from rag.parser import tokenize, is_english
+from deepdoc.parser import tokenize, is_english
from rag.nlp import huqie
-from rag.parser.pdf_parser import HuParser
+from deepdoc.parser import PdfParser
class Ppt(object):
@@ -58,7 +69,7 @@ class Ppt(object):
return [(txts[i], imgs[i]) for i in range(len(txts))]
-class Pdf(HuParser):
+class Pdf(PdfParser):
def __init__(self):
super().__init__()
@@ -74,7 +85,7 @@ class Pdf(HuParser):
assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
res = []
#################### More precisely ###################
-# self._layouts_paddle(zoomin)
+# self._layouts_rec(zoomin)
# self._text_merge()
# pages = {}
# for b in self.boxes:

View File

@@ -1,13 +1,25 @@
-import random
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
import re
from io import BytesIO
from nltk import word_tokenize
from openpyxl import load_workbook
-from rag.parser import is_english, random_choices
+from deepdoc.parser import is_english, random_choices
from rag.nlp import huqie, stemmer
+from deepdoc.parser import ExcelParser
-class Excel(object):
+class Excel(ExcelParser):
def __call__(self, fnm, binary=None, callback=None):
if not binary:
wb = load_workbook(fnm)
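A hedged sketch of invoking the new ExcelParser-based class above; it relies only on the __call__(self, fnm, binary=None, callback=None) signature visible in this hunk, and the file name and callback are made up for illustration:

```python
# Hypothetical use of the Excel helper shown above. With binary omitted,
# openpyxl's load_workbook(fnm) opens the workbook by file name.
def progress(prog=None, msg=""):
    print(f"[{prog}] {msg}")


excel = Excel()
rows = excel("qa_pairs.xlsx", callback=progress)
```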

View File

@@ -1,59 +1,82 @@
-import copy
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import base64
+import datetime
+import json
import os
import re
+import pandas as pd
import requests
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.settings import stat_logger
from rag.nlp import huqie
+from deepdoc.parser.resume import refactor
+from deepdoc.parser.resume import step_one, step_two
from rag.settings import cron_logger
from rag.utils import rmSpace
forbidden_select_fields4resume = [
"name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
]
+def remote_call(filename, binary):
+q = {
+"header": {
+"uid": 1,
+"user": "kevinhu",
+"log_id": filename
+},
+"request": {
+"p": {
+"request_id": "1",
+"encrypt_type": "base64",
+"filename": filename,
+"langtype": '',
+"fileori": base64.b64encode(binary.stream.read()).decode('utf-8')
+},
+"c": "resume_parse_module",
+"m": "resume_parse"
+}
+}
+for _ in range(3):
+try:
+resume = requests.post("http://127.0.0.1:61670/tog", data=json.dumps(q))
+resume = resume.json()["response"]["results"]
+resume = refactor(resume)
+for k in ["education", "work", "project", "training", "skill", "certificate", "language"]:
+if not resume.get(k) and k in resume: del resume[k]
+resume = step_one.refactor(pd.DataFrame([{"resume_content": json.dumps(resume), "tob_resume_id": "x",
+"updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]))
+resume = step_two.parse(resume)
+return resume
+except Exception as e:
+cron_logger.error("Resume parser error: "+str(e))
+return {}
def chunk(filename, binary=None, callback=None, **kwargs):
"""
The supported file formats are pdf, docx and txt.
-To maximize the effectiveness, parse the resume correctly,
-please visit https://github.com/infiniflow/ragflow, and sign in the our demo web-site
-to get token. It's FREE!
-Set INFINIFLOW_SERVER and INFINIFLOW_TOKEN in '.env' file or
-using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN in docker container.
+To maximize the effectiveness, parse the resume correctly, please contact us: https://github.com/infiniflow/ragflow
"""
if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE):
raise NotImplementedError("file type not supported yet(pdf supported)")
-url = os.environ.get("INFINIFLOW_SERVER")
-token = os.environ.get("INFINIFLOW_TOKEN")
-if not url or not token:
-stat_logger.warning(
-"INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow, and sign in the our demo web site to get token. It's FREE! Using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.")
-return []
if not binary:
with open(filename, "rb") as f:
binary = f.read()
-def remote_call():
-nonlocal filename, binary
-for _ in range(3):
-try:
-res = requests.post(url + "/v1/layout/resume/", files=[(filename, binary)],
-headers={"Authorization": token}, timeout=180)
-res = res.json()
-if res["retcode"] != 0:
-raise RuntimeError(res["retmsg"])
-return res["data"]
-except RuntimeError as e:
-raise e
-except Exception as e:
-cron_logger.error("resume parsing:" + str(e))
callback(0.2, "Resume parsing is going on...")
-resume = remote_call()
+resume = remote_call(filename, binary)
if len(resume.keys()) < 7:
callback(-1, "Resume is not successfully parsed.")
return []
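To make the new data flow concrete: the module-level remote_call() base64-encodes binary.stream.read(), posts it to the local resume-parsing service, runs the response through refactor, step_one.refactor and step_two.parse, and returns {} after three failed attempts. A hedged sketch of calling it directly, assuming the service on 127.0.0.1:61670 is running; because the function reads binary.stream, the caller must pass a file-storage-like object rather than raw bytes, and the Werkzeug wrapper here is only an illustration:

```python
# Sketch only: remote_call() expects `binary` to expose .stream (e.g. an uploaded
# file object); FileStorage is used here as one such wrapper, not prescribed by this diff.
from werkzeug.datastructures import FileStorage

with open("cv.pdf", "rb") as f:
    fs = FileStorage(stream=f, filename="cv.pdf")
    parsed = remote_call("cv.pdf", fs)   # dict of resume fields, or {} if parsing failed

if len(parsed.keys()) < 7:
    print("Resume is not successfully parsed.")   # mirrors the guard in chunk() above
```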

View File

@@ -1,3 +1,15 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
import copy
import re
from io import BytesIO
@@ -8,11 +20,12 @@ from openpyxl import load_workbook
from dateutil.parser import parse as datetime_parse
from api.db.services.knowledgebase_service import KnowledgebaseService
-from rag.parser import is_english, tokenize
-from rag.nlp import huqie, stemmer
+from deepdoc.parser import is_english, tokenize
+from rag.nlp import huqie
+from deepdoc.parser import ExcelParser
-class Excel(object):
+class Excel(ExcelParser):
def __call__(self, fnm, binary=None, callback=None):
if not binary:
wb = load_workbook(fnm)