use onnx models, new deepdoc (#68)
This commit is contained in:
@@ -1,15 +1,24 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import copy
|
||||
import random
|
||||
import re
|
||||
import numpy as np
|
||||
from rag.parser import bullets_category, BULLET_PATTERN, is_english, tokenize, remove_contents_table, \
|
||||
from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, \
|
||||
hierarchical_merge, make_colon_as_title, naive_merge, random_choices
|
||||
from rag.nlp import huqie
|
||||
from rag.parser.docx_parser import HuDocxParser
|
||||
from rag.parser.pdf_parser import HuParser
|
||||
from deepdoc.parser import PdfParser, DocxParser
|
||||
|
||||
|
||||
class Pdf(HuParser):
|
||||
class Pdf(PdfParser):
|
||||
def __call__(self, filename, binary=None, from_page=0,
|
||||
to_page=100000, zoomin=3, callback=None):
|
||||
self.__images__(
|
||||
@@ -21,7 +30,7 @@ class Pdf(HuParser):
|
||||
|
||||
from timeit import default_timer as timer
|
||||
start = timer()
|
||||
self._layouts_paddle(zoomin)
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.47, "Layout analysis finished")
|
||||
print("paddle layouts:", timer() - start)
|
||||
self._table_transformer_job(zoomin)
|
||||
@@ -53,7 +62,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
|
||||
sections,tbls = [], []
|
||||
if re.search(r"\.docx?$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
doc_parser = HuDocxParser()
|
||||
doc_parser = DocxParser()
|
||||
# TODO: table of contents need to be removed
|
||||
sections, tbls = doc_parser(binary if binary else filename, from_page=from_page, to_page=to_page)
|
||||
remove_contents_table(sections, eng=is_english(random_choices([t for t,_ in sections], k=200)))
|
||||
|
||||
@@ -1,16 +1,27 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import copy
|
||||
import re
|
||||
from io import BytesIO
|
||||
from docx import Document
|
||||
from rag.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
|
||||
from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
|
||||
make_colon_as_title
|
||||
from rag.nlp import huqie
|
||||
from rag.parser.docx_parser import HuDocxParser
|
||||
from rag.parser.pdf_parser import HuParser
|
||||
from deepdoc.parser import PdfParser, DocxParser
|
||||
from rag.settings import cron_logger
|
||||
|
||||
|
||||
class Docx(HuDocxParser):
|
||||
class Docx(DocxParser):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@@ -35,7 +46,7 @@ class Docx(HuDocxParser):
|
||||
return [l for l in lines if l]
|
||||
|
||||
|
||||
class Pdf(HuParser):
|
||||
class Pdf(PdfParser):
|
||||
def __call__(self, filename, binary=None, from_page=0,
|
||||
to_page=100000, zoomin=3, callback=None):
|
||||
self.__images__(
|
||||
@@ -47,7 +58,7 @@ class Pdf(HuParser):
|
||||
|
||||
from timeit import default_timer as timer
|
||||
start = timer()
|
||||
self._layouts_paddle(zoomin)
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.77, "Layout analysis finished")
|
||||
cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
|
||||
self._naive_vertical_merge()
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
import copy
|
||||
import re
|
||||
from rag.parser import tokenize
|
||||
from deepdoc.parser import tokenize
|
||||
from rag.nlp import huqie
|
||||
from rag.parser.pdf_parser import HuParser
|
||||
from deepdoc.parser import PdfParser
|
||||
from rag.utils import num_tokens_from_string
|
||||
|
||||
|
||||
class Pdf(HuParser):
|
||||
class Pdf(PdfParser):
|
||||
def __call__(self, filename, binary=None, from_page=0,
|
||||
to_page=100000, zoomin=3, callback=None):
|
||||
self.__images__(
|
||||
@@ -18,7 +18,7 @@ class Pdf(HuParser):
|
||||
|
||||
from timeit import default_timer as timer
|
||||
start = timer()
|
||||
self._layouts_paddle(zoomin)
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.5, "Layout analysis finished.")
|
||||
print("paddle layouts:", timer() - start)
|
||||
self._table_transformer_job(zoomin)
|
||||
|
||||
@@ -1,13 +1,25 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import copy
|
||||
import re
|
||||
from rag.app import laws
|
||||
from rag.parser import is_english, tokenize, naive_merge
|
||||
from deepdoc.parser import is_english, tokenize, naive_merge
|
||||
from rag.nlp import huqie
|
||||
from rag.parser.pdf_parser import HuParser
|
||||
from deepdoc.parser import PdfParser
|
||||
from rag.settings import cron_logger
|
||||
|
||||
|
||||
class Pdf(HuParser):
|
||||
class Pdf(PdfParser):
|
||||
def __call__(self, filename, binary=None, from_page=0,
|
||||
to_page=100000, zoomin=3, callback=None):
|
||||
self.__images__(
|
||||
@@ -19,7 +31,7 @@ class Pdf(HuParser):
|
||||
|
||||
from timeit import default_timer as timer
|
||||
start = timer()
|
||||
self._layouts_paddle(zoomin)
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.77, "Layout analysis finished")
|
||||
cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
|
||||
self._naive_vertical_merge()
|
||||
|
||||
@@ -1,16 +1,28 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import copy
|
||||
import re
|
||||
from collections import Counter
|
||||
|
||||
from api.db import ParserType
|
||||
from rag.parser import tokenize
|
||||
from deepdoc.parser import tokenize
|
||||
from rag.nlp import huqie
|
||||
from rag.parser.pdf_parser import HuParser
|
||||
from deepdoc.parser import PdfParser
|
||||
import numpy as np
|
||||
from rag.utils import num_tokens_from_string
|
||||
|
||||
|
||||
class Pdf(HuParser):
|
||||
class Pdf(PdfParser):
|
||||
def __init__(self):
|
||||
self.model_speciess = ParserType.PAPER.value
|
||||
super().__init__()
|
||||
@@ -26,7 +38,7 @@ class Pdf(HuParser):
|
||||
|
||||
from timeit import default_timer as timer
|
||||
start = timer()
|
||||
self._layouts_paddle(zoomin)
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.47, "Layout analysis finished")
|
||||
print("paddle layouts:", timer() - start)
|
||||
self._table_transformer_job(zoomin)
|
||||
|
||||
@@ -1,11 +1,22 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import copy
|
||||
import re
|
||||
from io import BytesIO
|
||||
from pptx import Presentation
|
||||
|
||||
from rag.parser import tokenize, is_english
|
||||
from deepdoc.parser import tokenize, is_english
|
||||
from rag.nlp import huqie
|
||||
from rag.parser.pdf_parser import HuParser
|
||||
from deepdoc.parser import PdfParser
|
||||
|
||||
|
||||
class Ppt(object):
|
||||
@@ -58,7 +69,7 @@ class Ppt(object):
|
||||
return [(txts[i], imgs[i]) for i in range(len(txts))]
|
||||
|
||||
|
||||
class Pdf(HuParser):
|
||||
class Pdf(PdfParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@@ -74,7 +85,7 @@ class Pdf(HuParser):
|
||||
assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
|
||||
res = []
|
||||
#################### More precisely ###################
|
||||
# self._layouts_paddle(zoomin)
|
||||
# self._layouts_rec(zoomin)
|
||||
# self._text_merge()
|
||||
# pages = {}
|
||||
# for b in self.boxes:
|
||||
|
||||
@@ -1,13 +1,25 @@
|
||||
import random
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import re
|
||||
from io import BytesIO
|
||||
from nltk import word_tokenize
|
||||
from openpyxl import load_workbook
|
||||
from rag.parser import is_english, random_choices
|
||||
from deepdoc.parser import is_english, random_choices
|
||||
from rag.nlp import huqie, stemmer
|
||||
from deepdoc.parser import ExcelParser
|
||||
|
||||
|
||||
class Excel(object):
|
||||
class Excel(ExcelParser):
|
||||
def __call__(self, fnm, binary=None, callback=None):
|
||||
if not binary:
|
||||
wb = load_workbook(fnm)
|
||||
|
||||
@@ -1,59 +1,82 @@
|
||||
import copy
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import base64
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.settings import stat_logger
|
||||
from rag.nlp import huqie
|
||||
|
||||
from deepdoc.parser.resume import refactor
|
||||
from deepdoc.parser.resume import step_one, step_two
|
||||
from rag.settings import cron_logger
|
||||
from rag.utils import rmSpace
|
||||
|
||||
forbidden_select_fields4resume = [
|
||||
"name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
|
||||
]
|
||||
def remote_call(filename, binary):
    """Send a resume to the local parsing service and post-process the reply.

    Args:
        filename: Name of the resume file; also sent as the request's
            ``log_id``.
        binary: Resume content. May be raw ``bytes``/``bytearray``, an
            upload-style object exposing ``.stream.read()``, or any object
            with a ``.read()`` method returning bytes.

    Returns:
        The value produced by ``step_two.parse`` on success, or an empty
        dict when all three attempts fail.
    """
    # The original code only handled objects with a ``.stream`` attribute and
    # raised AttributeError (outside the retry loop) when handed plain bytes.
    # Accept every shape so the payload is always read exactly once up front.
    if isinstance(binary, (bytes, bytearray)):
        content = bytes(binary)
    elif hasattr(binary, "stream"):
        content = binary.stream.read()
    else:
        content = binary.read()

    q = {
        "header": {
            "uid": 1,
            "user": "kevinhu",
            "log_id": filename
        },
        "request": {
            "p": {
                "request_id": "1",
                "encrypt_type": "base64",
                "filename": filename,
                "langtype": '',
                "fileori": base64.b64encode(content).decode('utf-8')
            },
            "c": "resume_parse_module",
            "m": "resume_parse"
        }
    }
    # NOTE(review): the request has no timeout, so a stalled service blocks
    # the worker indefinitely — confirm whether a limit should be applied.
    for _ in range(3):
        try:
            resume = requests.post("http://127.0.0.1:61670/tog", data=json.dumps(q))
            resume = resume.json()["response"]["results"]
            resume = refactor(resume)
            # Drop sections that are present but empty before the next stage.
            for k in ["education", "work", "project", "training", "skill", "certificate", "language"]:
                if k in resume and not resume.get(k):
                    del resume[k]

            resume = step_one.refactor(pd.DataFrame([{
                "resume_content": json.dumps(resume),
                "tob_resume_id": "x",
                "updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            }]))
            resume = step_two.parse(resume)
            return resume
        except Exception as e:
            # Best-effort: log the failure and fall through to the next attempt.
            cron_logger.error("Resume parser error: "+str(e))
    return {}
|
||||
|
||||
|
||||
def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
"""
|
||||
The supported file formats are pdf, docx and txt.
|
||||
To maximize the effectiveness, parse the resume correctly,
|
||||
please visit https://github.com/infiniflow/ragflow, and sign in to our demo website
|
||||
to get token. It's FREE!
|
||||
Set INFINIFLOW_SERVER and INFINIFLOW_TOKEN in '.env' file or
|
||||
using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN in docker container.
|
||||
To maximize the effectiveness and parse the resume correctly, please contact us: https://github.com/infiniflow/ragflow
|
||||
"""
|
||||
if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE):
|
||||
raise NotImplementedError("file type not supported yet(pdf supported)")
|
||||
|
||||
url = os.environ.get("INFINIFLOW_SERVER")
|
||||
token = os.environ.get("INFINIFLOW_TOKEN")
|
||||
if not url or not token:
|
||||
stat_logger.warning(
|
||||
"INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow, and sign in the our demo web site to get token. It's FREE! Using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.")
|
||||
return []
|
||||
|
||||
if not binary:
|
||||
with open(filename, "rb") as f:
|
||||
binary = f.read()
|
||||
|
||||
def remote_call():
|
||||
nonlocal filename, binary
|
||||
for _ in range(3):
|
||||
try:
|
||||
res = requests.post(url + "/v1/layout/resume/", files=[(filename, binary)],
|
||||
headers={"Authorization": token}, timeout=180)
|
||||
res = res.json()
|
||||
if res["retcode"] != 0:
|
||||
raise RuntimeError(res["retmsg"])
|
||||
return res["data"]
|
||||
except RuntimeError as e:
|
||||
raise e
|
||||
except Exception as e:
|
||||
cron_logger.error("resume parsing:" + str(e))
|
||||
|
||||
callback(0.2, "Resume parsing is going on...")
|
||||
resume = remote_call()
|
||||
resume = remote_call(filename, binary)
|
||||
if len(resume.keys()) < 7:
|
||||
callback(-1, "Resume is not successfully parsed.")
|
||||
return []
|
||||
|
||||
@@ -1,3 +1,15 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import copy
|
||||
import re
|
||||
from io import BytesIO
|
||||
@@ -8,11 +20,12 @@ from openpyxl import load_workbook
|
||||
from dateutil.parser import parse as datetime_parse
|
||||
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from rag.parser import is_english, tokenize
|
||||
from rag.nlp import huqie, stemmer
|
||||
from deepdoc.parser import is_english, tokenize
|
||||
from rag.nlp import huqie
|
||||
from deepdoc.parser import ExcelParser
|
||||
|
||||
|
||||
class Excel(object):
|
||||
class Excel(ExcelParser):
|
||||
def __call__(self, fnm, binary=None, callback=None):
|
||||
if not binary:
|
||||
wb = load_workbook(fnm)
|
||||
|
||||
Reference in New Issue
Block a user