Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names and introduce `initLogger`. Per-module imports of a shared `logger` object are replaced with direct use of the standard `logging` module.
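
For illustration, here is a minimal sketch of what an `initLogger`-style helper could look like. The function name matches the PR title, but the signature, parameters, and handler choices below are assumptions, not the actual implementation:

```python
import logging
import os
from logging.handlers import RotatingFileHandler


def initLogger(service_name: str, log_dir: str = "logs") -> None:
    """Hypothetical sketch: route all stdlib logging calls to one
    consistently named file, e.g. logs/<service_name>.log."""
    os.makedirs(log_dir, exist_ok=True)
    logfile = os.path.join(log_dir, f"{service_name}.log")

    # Rotate so each service keeps one consistently named log file
    # instead of growing without bound.
    handler = RotatingFileHandler(logfile, maxBytes=10 * 1024 * 1024, backupCount=5)
    handler.setFormatter(logging.Formatter(
        "%(asctime)s %(levelname)s %(name)s %(message)s"))

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.addHandler(handler)
```

With the root logger configured once at startup, individual modules only need `import logging`; that is the change applied throughout the diff below.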

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
Author: Zhichang Yu
Committed: 2024-11-14 17:13:48 +08:00 (via GitHub)
Commit: 30f6421760 (parent: ab4384e011)
75 changed files with 396 additions and 402 deletions


```diff
@@ -10,6 +10,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import copy
 import re
@@ -17,7 +18,6 @@ from api.db import ParserType
 from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
 from deepdoc.parser import PdfParser, PlainParser
 import numpy as np
-from api.utils.log_utils import logger


 class Pdf(PdfParser):
@@ -41,7 +41,7 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.63, "Layout analysis finished")
-        logger.info(f"layouts cost: {timer() - start}s")
+        logging.debug(f"layouts cost: {timer() - start}s")
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()
@@ -53,7 +53,7 @@ class Pdf(PdfParser):
         # clean mess
         if column_width < self.page_images[0].size[0] / zoomin / 2:
-            logger.info("two_column................... {} {}".format(column_width,
+            logging.debug("two_column................... {} {}".format(column_width,
                 self.page_images[0].size[0] / zoomin / 2))
             self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
         for b in self.boxes:
@@ -115,8 +115,8 @@ class Pdf(PdfParser):
                 from_page, min(
                     to_page, self.total_page)))
         for b in self.boxes:
-            logger.info("{} {}".format(b["text"], b.get("layoutno")))
-        logger.info("{}".format(tbls))
+            logging.debug("{} {}".format(b["text"], b.get("layoutno")))
+        logging.debug("{}".format(tbls))

         return {
             "title": title,
@@ -157,7 +157,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
     # is it English
     eng = lang.lower() == "english"  # pdf_parser.is_english
-    logger.info("It's English.....{}".format(eng))
+    logging.debug("It's English.....{}".format(eng))

     res = tokenize_table(paper["tables"], doc, eng)
@@ -184,7 +184,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
             sid += 1
         sec_ids.append(sid)
-        logger.info("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))
+        logging.debug("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))

     chunks = []
     last_sid = -2
```
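
For context, a hypothetical entrypoint using an `initLogger` like the sketch above; the import path is an assumption inferred from the removed `from api.utils.log_utils import logger` line, and the service name is made up:

```python
import logging

from api.utils.log_utils import initLogger  # assumed location, not shown in this excerpt

if __name__ == "__main__":
    initLogger("task_executor")      # hypothetical service name -> logs/task_executor.log
    logging.info("service started")  # handled by the root logger's file handler
```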