Refine some document APIs. (#53)

Add naive chunking method to RAG
2024-02-02 19:21:37 +08:00
parent 7b71fb2db6
commit 51482f3e2a
13 changed files with 447 additions and 268 deletions
--- a/rag/parser/docx_parser.py
+++ b/rag/parser/docx_parser.py
@@ -98,8 +98,19 @@ class HuDocxParser:
            return lines
        return ["\n".join(lines)]

-    def __call__(self, fnm):
+    def __call__(self, fnm, from_page=0, to_page=100000):
+        """Return (secs, tbls) for a .docx path or bytes; tbls covers every table, ignoring the page range."""
        self.doc = Document(fnm) if isinstance(fnm, str) else Document(BytesIO(fnm))
-        secs = [(p.text, p.style.name) for p in self.doc.paragraphs]
+        # secs: (text, style name) of non-blank paragraphs starting on pages [from_page, to_page).
+        pn, secs = 0, []  # pn: zero-based page index, bumped for each page break seen so far
+        for p in self.doc.paragraphs:
+            if pn >= to_page: break  # pn never decreases, so no later paragraph can qualify
+            if pn >= from_page and p.text.strip(): secs.append((p.text, p.style.name))
+            for run in p.runs:
+                xml = run._element.xml  # read once; both checks below inspect the same markup
+                # At most one page per run: a rendered break, or an explicit page-type w:br.
+                if 'lastRenderedPageBreak' in xml or ('w:br' in xml and 'type="page"' in xml):
+                    pn += 1
+
        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls