Refine resume parts and fix bugs in retrieval using sql (#66)

2024-02-19 19:22:17 +08:00
parent 452020d33a
commit a8294f2168
29 changed files with 302 additions and 158 deletions
--- a/rag/app/book.py
+++ b/rag/app/book.py
@@ -39,6 +39,11 @@ class Pdf(HuParser):


 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+        Supported file formats are docx, pdf, txt.
+        Since a book is long and not all the parts are useful, if it's a PDF,
+        please set up the page ranges for every book in order to eliminate negative effects and save computing time.
+    """
    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@@ -2,7 +2,6 @@ import copy
 import re
 from io import BytesIO
 from docx import Document
-import numpy as np
 from rag.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
    make_colon_as_title
 from rag.nlp import huqie
@@ -59,6 +58,9 @@ class Pdf(HuParser):


 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+        Supported file formats are docx, pdf, txt.
+    """
    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -58,8 +58,10 @@ class Pdf(HuParser):


 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+        Only pdf is supported.
+    """
    pdf_parser = None
-    paper = {}

    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -6,6 +6,7 @@ from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
 from rag.settings import cron_logger

+
 class Pdf(HuParser):
    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
@@ -20,12 +21,18 @@ class Pdf(HuParser):
        start = timer()
        self._layouts_paddle(zoomin)
        callback(0.77, "Layout analysis finished")
-        cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
+        cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
        self._naive_vertical_merge()
        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]


 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+        Supported file formats are docx, pdf, txt.
+        This method applies a naive approach to chunking files.
+        Successive text will be sliced into pieces using 'delimiter'.
+        Next, these successive pieces are merged into chunks whose token number is no more than 'Max token number'.
+    """
    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -41,24 +48,26 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        sections = pdf_parser(filename if not binary else binary,
-                         from_page=from_page, to_page=to_page, callback=callback)
+                              from_page=from_page, to_page=to_page, callback=callback)
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
-        if binary:txt = binary.decode("utf-8")
+        if binary:
+            txt = binary.decode("utf-8")
        else:
            with open(filename, "r") as f:
                while True:
                    l = f.readline()
-                    if not l:break
+                    if not l: break
                    txt += l
        sections = txt.split("\n")
-        sections = [(l,"") for l in sections if l]
+        sections = [(l, "") for l in sections if l]
        callback(0.8, "Finish parsing.")
-    else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
+    else:
+        raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

-    parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimer": "\n。；！？"})
-    cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimer"])
+    parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。；！？"})
+    cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"])
    eng = is_english(cks)
    res = []
    # wrap up to es documents
@@ -75,6 +84,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k

 if __name__ == "__main__":
    import sys
+
+
    def dummy(a, b):
        pass
+
+
    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@@ -129,6 +129,10 @@ class Pdf(HuParser):


 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+        Only pdf is supported.
+        The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
+    """
    pdf_parser = None
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
--- a/rag/app/presentation.py
+++ b/rag/app/presentation.py
@@ -94,6 +94,11 @@ class Pdf(HuParser):


 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+    The supported file formats are pdf, pptx.
+    Every page will be treated as a chunk, and the thumbnail of every page will be stored.
+    PPT files are parsed with this method automatically; no per-file setup is necessary.
+    """
    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@@ -70,7 +70,17 @@ def beAdoc(d, q, a, eng):


 def chunk(filename, binary=None, callback=None, **kwargs):
+    """
+        Excel and csv(txt) format files are supported.
+        If the file is in excel format, there should be 2 columns, question and answer, without a header.
+        The question column comes before the answer column.
+        It's OK if it has multiple sheets, as long as the columns are correctly composed.

+        If it's in csv format, it should be UTF-8 encoded. Use TAB as delimiter to separate question and answer.
+
+        All the malformed lines will be ignored.
+        Every pair of Q&A will be treated as a chunk.
+    """
    res = []
    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
--- a/rag/app/resume.py
+++ b/rag/app/resume.py
@@ -4,24 +4,34 @@ import os
 import re
 import requests
 from api.db.services.knowledgebase_service import KnowledgebaseService
+from api.settings import stat_logger
 from rag.nlp import huqie

 from rag.settings import cron_logger
 from rag.utils import rmSpace

+forbidden_select_fields4resume = [
+    "name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
+]

 def chunk(filename, binary=None, callback=None, **kwargs):
+    """
+    The supported file formats are pdf, docx and txt.
+    To maximize the effectiveness and parse the resume correctly,
+    please visit https://github.com/infiniflow/ragflow, and sign in to our demo website
+    to get a token. It's FREE!
+    Set INFINIFLOW_SERVER and INFINIFLOW_TOKEN in the '.env' file, or
+    use 'export' to set both environment variables, INFINIFLOW_SERVER and INFINIFLOW_TOKEN, in the docker container.
+    """
    if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE):
        raise NotImplementedError("file type not supported yet(pdf supported)")

    url = os.environ.get("INFINIFLOW_SERVER")
-    if not url:
-        raise EnvironmentError(
-            "Please set environment variable: 'INFINIFLOW_SERVER'")
    token = os.environ.get("INFINIFLOW_TOKEN")
-    if not token:
-        raise EnvironmentError(
-            "Please set environment variable: 'INFINIFLOW_TOKEN'")
+    if not url or not token:
+        stat_logger.warning(
+            "INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow, and sign in the our demo web site to get token. It's FREE! Using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.")
+        return []

    if not binary:
        with open(filename, "rb") as f:
@@ -44,22 +54,28 @@ def chunk(filename, binary=None, callback=None, **kwargs):

    callback(0.2, "Resume parsing is going on...")
    resume = remote_call()
+    if len(resume.keys()) < 7:
+        callback(-1, "Resume is not successfully parsed.")
+        return []
    callback(0.6, "Done parsing. Chunking...")
    print(json.dumps(resume, ensure_ascii=False, indent=2))

    field_map = {
        "name_kwd": "姓名/名字",
+        "name_pinyin_kwd": "姓名拼音/名字拼音",
        "gender_kwd": "性别（男，女）",
        "age_int": "年龄/岁/年纪",
        "phone_kwd": "电话/手机/微信",
        "email_tks": "email/e-mail/邮箱",
        "position_name_tks": "职位/职能/岗位/职责",
-        "expect_position_name_tks": "期望职位/期望职能/期望岗位",
+        "expect_city_names_tks": "期望城市",
+        "work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
+        "corporation_name_tks": "最近就职(上班)的公司/上一家公司",

-        "hightest_degree_kwd": "最高学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
-        "first_degree_kwd": "第一学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
-        "first_major_tks": "第一学历专业",
        "first_school_name_tks": "第一学历毕业学校",
+        "first_degree_kwd": "第一学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
+        "highest_degree_kwd": "最高学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
+        "first_major_tks": "第一学历专业",
        "edu_first_fea_kwd": "第一学历标签（211，留学，双一流，985，海外知名，重点大学，中专，专升本，专科，本科，大专）",

        "degree_kwd": "过往学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
@@ -68,14 +84,14 @@ def chunk(filename, binary=None, callback=None, **kwargs):
        "sch_rank_kwd": "学校标签（顶尖学校，精英学校，优质学校，一般学校）",
        "edu_fea_kwd": "教育标签（211，留学，双一流，985，海外知名，重点大学，中专，专升本，专科，本科，大专）",

-        "work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
-        "birth_dt": "生日/出生年份",
        "corp_nm_tks": "就职过的公司/之前的公司/上过班的公司",
-        "corporation_name_tks": "最近就职(上班)的公司/上一家公司",
        "edu_end_int": "毕业年份",
-        "expect_city_names_tks": "期望城市",
-        "industry_name_tks": "所在行业"
+        "industry_name_tks": "所在行业",
+
+        "birth_dt": "生日/出生年份",
+        "expect_position_name_tks": "期望职位/期望职能/期望岗位",
    }
+
    titles = []
    for n in ["name_kwd", "gender_kwd", "position_name_tks", "age_int"]:
        v = resume.get(n, "")
@@ -105,6 +121,10 @@ def chunk(filename, binary=None, callback=None, **kwargs):
    doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
    doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
    for n, _ in field_map.items():
+        if n not in resume:continue
+        if isinstance(resume[n], list) and (len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
+            resume[n] = resume[n][0]
+        if n.find("_tks")>0: resume[n] = huqie.qieqie(resume[n])
        doc[n] = resume[n]

    print(doc)
--- a/rag/app/table.py
+++ b/rag/app/table.py
@@ -100,7 +100,20 @@ def column_data_type(arr):


 def chunk(filename, binary=None, callback=None, **kwargs):
-    dfs = []
+    """
+        Excel and csv(txt) format files are supported.
+        For csv or txt file, the delimiter between columns is TAB.
+        The first line must be column headers.
+        Column headers must be meaningful terms in order for our NLP model to understand them.
+        It's good to enumerate some synonyms using slash '/' to separate, and even better to
+        enumerate values using brackets like 'gender/sex(male, female)'.
+        Here are some examples for headers:
+            1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
+            2. 姓名/名字\t电话/手机/微信\t最高学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）
+
+        Every row in table will be treated as a chunk.
+    """
+
    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = Excel()
@@ -155,7 +168,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
                del df[n]
        clmns = df.columns.values
        txts = list(copy.deepcopy(clmns))
-        py_clmns = [PY.get_pinyins(n)[0].replace("-", "_") for n in clmns]
+        py_clmns = [PY.get_pinyins(re.sub(r"(/.*|（[^（）]+?）|\([^()]+?\))", "", n), '_')[0] for n in clmns]
        clmn_tys = []
        for j in range(len(clmns)):
            cln, ty = column_data_type(df[clmns[j]])