const getImageName = (prefix: string, length: number) =>
  new Array(length)
    .fill(0)
    .map((x, idx) => `chunk-method/${prefix}-0${idx + 1}`);
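
// Illustrative note (not part of the original file): getImageName('book', 4)
// evaluates to ['chunk-method/book-01', 'chunk-method/book-02',
// 'chunk-method/book-03', 'chunk-method/book-04'], i.e. the preview image
// paths that ImageMap below associates with the 'book' chunk method.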

export const ImageMap = {
  book: getImageName('book', 4),
  laws: getImageName('law', 4),
  manual: getImageName('manual', 4),
  media: getImageName('media', 2),
  naive: getImageName('naive', 2),
  paper: getImageName('paper', 2),
  presentation: getImageName('presentation', 2),
  qa: getImageName('qa', 2),
  resume: getImageName('resume', 2),
  table: getImageName('table', 2),
};

export const TextMap = {
  book: {
    title: '',
    description: `Supported file formats are docx, excel, pdf, txt.
Since a book is long and not all of its parts are useful, if it's a PDF,
please set up the page ranges for every book in order to eliminate negative effects and save computing time for analysis.`,
  },
  laws: {
    title: '',
    description: `Supported file formats are docx, pdf, txt.`,
  },
  manual: { title: '', description: `Only pdf is supported.` },
  media: { title: '', description: '' },
  naive: {
    title: '',
    description: `Supported file formats are docx, pdf, txt.
This method applies the naive way to chunk files.
Successive text will be sliced into pieces using the 'delimiter'.
Next, these successive pieces are merged into chunks whose token number is no more than 'Max token number'.`,
  },
  paper: {
    title: '',
    description: `Only pdf is supported.
The special part is that the abstract of the paper will be treated as an entire chunk and will not be split into smaller pieces.`,
  },
  presentation: {
    title: '',
    description: `The supported file formats are pdf, pptx.
Every page will be treated as a chunk, and the thumbnail of every page will be stored.
PPT files will be parsed with this method automatically; setting it up for every PPT file is not necessary.`,
  },
  qa: {
    title: '',
    description: `Excel and csv(txt) format files are supported.
If the file is in Excel format, it should have two columns, question and answer, without a header row.
The question column comes before the answer column.
It's fine for the file to have multiple sheets as long as the columns are correctly composed.

If it's in csv format, it should be UTF-8 encoded. Use TAB as the delimiter to separate question and answer.

All malformed lines will be ignored.
Every pair of Q&A will be treated as a chunk.`,
  },
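  // Illustrative example (not from the original file) of one csv/txt line for
  // the 'qa' method above; a TAB separates the question from the answer:
  //   What is a chunk?\tA piece of a document that is indexed and retrieved as a unit.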
  resume: {
    title: '',
    description: `The supported file formats are pdf, docx and txt.`,
  },
  table: {
    title: '',
    description: `Excel and csv(txt) format files are supported.
For csv or txt files, the delimiter between columns is TAB.
The first line must be the column headers.
Column headers must be meaningful terms so that our NLP model can understand them.
It's good to enumerate some synonyms separated by a slash '/', and even better to
enumerate values using brackets, like 'gender/sex(male, female)'.
Here are some examples of headers:
1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
2. 姓名/名字\t电话/手机/微信\t最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)
Every row in the table will be treated as a chunk.

visual:
Image files are supported. Video is coming soon.
If the picture has text in it, OCR is applied to extract the text as its description.
If the text extracted by OCR is not enough, a visual LLM is used to get the description.`,
  },
};
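
// ---------------------------------------------------------------------------
// Illustrative sketch only: the function below is not part of the original
// constants file and is not used by the app. It shows, under the assumption of
// a simple whitespace-based token counter, the 'naive' behaviour described in
// TextMap.naive above: slice text on a delimiter, then merge successive pieces
// into chunks whose token count stays within a maximum.
export const naiveChunkSketch = (
  text: string,
  delimiter: string,
  maxTokens: number,
  // Placeholder token counter (an assumption, not the real tokenizer).
  countTokens: (s: string) => number = (s) => s.split(/\s+/).filter(Boolean).length,
): string[] => {
  const chunks: string[] = [];
  let current = '';
  for (const piece of text.split(delimiter)) {
    const candidate = current ? current + delimiter + piece : piece;
    if (current && countTokens(candidate) > maxTokens) {
      // The next piece would push the chunk over the limit, so close the chunk.
      chunks.push(current);
      current = piece;
    } else {
      current = candidate;
    }
  }
  if (current) chunks.push(current);
  return chunks;
};
// For example, naiveChunkSketch('a. b. c. d.', '. ', 2) returns
// ['a. b', 'c. d.'] with the default counter.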