Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery to delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fixed KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
This commit is contained in:
Zhichang Yu
2024-11-12 14:59:41 +08:00
committed by GitHub
parent 00b6000b76
commit f4c52371ab
42 changed files with 2647 additions and 1878 deletions

View File

@@ -25,6 +25,7 @@ import roman_numbers as r
from word2number import w2n
from cn2an import cn2an
from PIL import Image
import json
all_codecs = [
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@@ -51,12 +52,12 @@ def find_codec(blob):
try:
blob[:1024].decode(c)
return c
except Exception as e:
except Exception:
pass
try:
blob.decode(c)
return c
except Exception as e:
except Exception:
pass
return "utf-8"
@@ -241,7 +242,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
d["image"], poss = pdf_parser.crop(ck, need_position=True)
add_positions(d, poss)
ck = pdf_parser.remove_tag(ck)
except NotImplementedError as e:
except NotImplementedError:
pass
tokenize(d, ck, eng)
res.append(d)
@@ -289,13 +290,16 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
def add_positions(d, poss):
if not poss:
return
d["page_num_int"] = []
d["position_int"] = []
d["top_int"] = []
page_num_list = []
position_list = []
top_list = []
for pn, left, right, top, bottom in poss:
d["page_num_int"].append(int(pn + 1))
d["top_int"].append(int(top))
d["position_int"].append((int(pn + 1), int(left), int(right), int(top), int(bottom)))
page_num_list.append(int(pn + 1))
top_list.append(int(top))
position_list.append((int(pn + 1), int(left), int(right), int(top), int(bottom)))
d["page_num_list"] = json.dumps(page_num_list)
d["position_list"] = json.dumps(position_list)
d["top_list"] = json.dumps(top_list)
def remove_contents_table(sections, eng=False):