Skip to content

Commit

Permalink
Change embedding (#1428)
Browse files Browse the repository at this point in the history
* fix: text splitter

* perf: embedding model
  • Loading branch information
c121914yu authored May 9, 2024
1 parent 434af56 commit 5e250b2
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 11 deletions.
32 changes: 30 additions & 2 deletions files/helm/fastgpt/templates/configmap-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,41 @@ data:
}
],
"vectorModels": [
{
"model": "text-embedding-3-large",
"name": "Embedding-2",
"avatar": "/imgs/model/openai.svg",
"charsPointsPrice": 0,
"defaultToken": 512,
"maxToken": 3000,
"weight": 100,
"dbConfig": {},
"queryConfig": {},
"defaultConfig": {
"dimensions": 1024
}
},
{
"model": "text-embedding-3-small",
"name": "Embedding-2",
"avatar": "/imgs/model/openai.svg",
"charsPointsPrice": 0,
"defaultToken": 512,
"maxToken": 3000,
"weight": 100,
"dbConfig": {},
"queryConfig": {}
},
{
"model": "text-embedding-ada-002",
"name": "Embedding-2",
"avatar": "/imgs/model/openai.svg",
"charsPointsPrice": 0,
"defaultToken": 700,
"defaultToken": 512,
"maxToken": 3000,
"weight": 100
"weight": 100,
"dbConfig": {},
"queryConfig": {}
}
],
"reRankModels": [],
Expand Down
40 changes: 35 additions & 5 deletions packages/global/common/string/textSplitter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,47 @@ type SplitResponse = {

/**
 * Check whether a string has the structure of a Markdown table.
 *
 * The string qualifies only when all of the following hold:
 * - it contains at least two lines;
 * - the first (header) line starts and ends with `|`;
 * - the second line is a separator row made of `|`, `-`, optional `:` and
 *   whitespace (for example `| --- | :---: |`);
 * - every remaining non-empty line starts and ends with `|`.
 *
 * @param str - Text to inspect.
 * @returns `true` when the text matches the table structure above.
 */
const strIsMdTable = (str: string): boolean => {
  // Fast reject: a table cannot exist without the cell delimiter.
  if (!str.includes('|')) {
    return false;
  }

  const lines = str.split('\n');

  // A table needs at least a header row and a separator row.
  if (lines.length < 2) {
    return false;
  }

  const isPipeDelimited = (line: string) => line.startsWith('|') && line.endsWith('|');

  // trim() also removes a trailing '\r' left over from CRLF line endings.
  const headerLine = lines[0].trim();
  if (!isPipeDelimited(headerLine)) {
    return false;
  }

  // The separator row must consist only of cells like `---`, `:---`, `---:`.
  const separatorLine = lines[1].trim();
  const separatorRegex = /^(\|[\s:]*-+[\s:]*)+\|$/;
  if (!separatorRegex.test(separatorLine)) {
    return false;
  }

  // Blank lines are tolerated; any other line must look like a table row.
  for (let i = 2; i < lines.length; i++) {
    const dataLine = lines[i].trim();
    if (dataLine && !isPipeDelimited(dataLine)) {
      return false;
    }
  }

  return true;
};
const markdownTableSplit = (props: SplitProps): SplitResponse => {
let { text = '', chunkLen } = props;
const splitText2Lines = text.split('\n');
const header = splitText2Lines[0];

const headerSize = header.split('|').length - 2;
const mdSplitString = `| ${new Array(headerSize)

const mdSplitString = `| ${new Array(headerSize > 0 ? headerSize : 1)
.fill(0)
.map(() => '---')
.join(' | ')} |`;
Expand Down Expand Up @@ -304,7 +334,7 @@ export const splitText2Chunks = (props: SplitProps): SplitResponse => {
const splitWithCustomSign = text.split(CUSTOM_SPLIT_SIGN);

const splitResult = splitWithCustomSign.map((item) => {
if (strIsMdTable(text)) {
if (strIsMdTable(item)) {
return markdownTableSplit(props);
}

Expand Down
2 changes: 1 addition & 1 deletion packages/global/core/ai/model.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ export const defaultQAModels: LLMModelItemType[] = [

export const defaultVectorModels: VectorModelItemType[] = [
{
model: 'text-embedding-ada-002',
model: 'text-embedding-3-small',
name: 'Embedding-2',
charsPointsPrice: 0,
defaultToken: 500,
Expand Down
2 changes: 1 addition & 1 deletion packages/service/core/dataset/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ const DatasetSchema = new Schema({
vectorModel: {
type: String,
required: true,
default: 'text-embedding-ada-002'
default: 'text-embedding-3-small'
},
agentModel: {
type: String,
Expand Down
25 changes: 25 additions & 0 deletions projects/app/data/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,31 @@
}
],
"vectorModels": [
{
"model": "text-embedding-3-large",
"name": "Embedding-2",
"avatar": "/imgs/model/openai.svg",
"charsPointsPrice": 0,
"defaultToken": 512,
"maxToken": 3000,
"weight": 100,
"dbConfig": {},
"queryConfig": {},
"defaultConfig": {
"dimensions": 1024
}
},
{
"model": "text-embedding-3-small",
"name": "Embedding-2",
"avatar": "/imgs/model/openai.svg",
"charsPointsPrice": 0,
"defaultToken": 512,
"maxToken": 3000,
"weight": 100,
"dbConfig": {},
"queryConfig": {}
},
{
"model": "text-embedding-ada-002",
"name": "Embedding-2",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
csvFormat: true
});
// split chunks (5 chunk)
const sliceRawText = 10 * chunkSize;
const { chunks } = splitText2Chunks({
text: rawText.slice(0, sliceRawText),
text: rawText,
chunkLen: chunkSize,
overlapRatio,
customReg: customSplitChar ? [customSplitChar] : []
Expand Down

0 comments on commit 5e250b2

Please sign in to comment.