Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names and introduce `initLogger`. Per-module imports of a shared `logger` object are replaced with direct use of the standard `logging` module.
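
For illustration, here is a minimal sketch of what an `initLogger`-style helper could look like. The function name matches the PR title, but the signature, parameters, and handler choices below are assumptions, not the actual implementation:

```python
import logging
import os
from logging.handlers import RotatingFileHandler


def initLogger(service_name: str, log_dir: str = "logs") -> None:
    """Hypothetical sketch: route all stdlib logging calls to one
    consistently named file, e.g. logs/<service_name>.log."""
    os.makedirs(log_dir, exist_ok=True)
    logfile = os.path.join(log_dir, f"{service_name}.log")

    # Rotate so each service keeps one consistently named log file
    # instead of growing without bound.
    handler = RotatingFileHandler(logfile, maxBytes=10 * 1024 * 1024, backupCount=5)
    handler.setFormatter(logging.Formatter(
        "%(asctime)s %(levelname)s %(name)s %(message)s"))

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.addHandler(handler)
```

With the root logger configured once at startup, individual modules only need `import logging`; that is the change applied throughout the diff below.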

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
Author: Zhichang Yu
Committed: 2024-11-14 17:13:48 +08:00 (via GitHub)
Commit: 30f6421760 (parent: ab4384e011)
75 changed files with 396 additions and 402 deletions


```diff
@@ -10,6 +10,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import copy
 import re
@@ -17,7 +18,6 @@ from api.db import ParserType
 from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
 from deepdoc.parser import PdfParser, PlainParser
 import numpy as np
-from api.utils.log_utils import logger


 class Pdf(PdfParser):
@@ -41,7 +41,7 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.63, "Layout analysis finished")
-        logger.info(f"layouts cost: {timer() - start}s")
+        logging.debug(f"layouts cost: {timer() - start}s")
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()
@@ -53,7 +53,7 @@ class Pdf(PdfParser):
         # clean mess
         if column_width < self.page_images[0].size[0] / zoomin / 2:
-            logger.info("two_column................... {} {}".format(column_width,
+            logging.debug("two_column................... {} {}".format(column_width,
                 self.page_images[0].size[0] / zoomin / 2))
             self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
         for b in self.boxes:
@@ -115,8 +115,8 @@ class Pdf(PdfParser):
                 from_page, min(
                     to_page, self.total_page)))
         for b in self.boxes:
-            logger.info("{} {}".format(b["text"], b.get("layoutno")))
-        logger.info("{}".format(tbls))
+            logging.debug("{} {}".format(b["text"], b.get("layoutno")))
+        logging.debug("{}".format(tbls))

         return {
             "title": title,
@@ -157,7 +157,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
     # is it English
     eng = lang.lower() == "english"  # pdf_parser.is_english
-    logger.info("It's English.....{}".format(eng))
+    logging.debug("It's English.....{}".format(eng))

     res = tokenize_table(paper["tables"], doc, eng)
@@ -184,7 +184,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
             sid += 1
         sec_ids.append(sid)
-        logger.info("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))
+        logging.debug("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))

     chunks = []
     last_sid = -2
```
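
For context, a hypothetical entrypoint using an `initLogger` like the sketch above; the import path is an assumption inferred from the removed `from api.utils.log_utils import logger` line, and the service name is made up:

```python
import logging

from api.utils.log_utils import initLogger  # assumed location, not shown in this excerpt

if __name__ == "__main__":
    initLogger("task_executor")      # hypothetical service name -> logs/task_executor.log
    logging.info("service started")  # handled by the root logger's file handler
```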