Skip to content

Commit

Permalink
4.8-alpha fix (#1424)
Browse files Browse the repository at this point in the history
  • Loading branch information
c121914yu authored May 9, 2024
1 parent 6463427 commit 434af56
Show file tree
Hide file tree
Showing 19 changed files with 252 additions and 145 deletions.
85 changes: 74 additions & 11 deletions packages/global/common/string/textSplitter.ts
Original file line number Diff line number Diff line change
@@ -1,24 +1,61 @@
import { getErrText } from '../error/utils';
import { replaceRegChars } from './tools';

/**
* text split into chunks
* chunkLen - one chunk len. max: 3500
* overlapLen - The size of the before and after Text
* chunkLen > overlapLen
* markdown
*/
export const splitText2Chunks = (props: {
export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';

type SplitProps = {
text: string;
chunkLen: number;
overlapRatio?: number;
customReg?: string[];
}): {
};

type SplitResponse = {
chunks: string[];
chars: number;
overlapRatio?: number;
} => {
};

// Check whether a string looks like a markdown table: true as soon as any
// single line starts with `|` and ends with `|` (trailing `\r` allowed).
const strIsMdTable = (str: string) => /^(\|.*\|[\r]*)$/m.test(str);
/**
 * Split a markdown table into chunks of whole rows.
 *
 * The first line of `text` is used as the header; the second line is assumed
 * to be the table's separator row and is replaced by a generated
 * `| --- | ... |` row. Every chunk starts with the header and that separator,
 * followed by as many data rows as fit before the chunk would exceed
 * `chunkLen * 1.2` characters.
 *
 * @param props - `text` is the table source, `chunkLen` the target chunk length.
 * @returns the chunks and the total number of characters across them.
 *          Empty / whitespace-only text yields no chunks.
 */
const markdownTableSplit = (props: SplitProps): SplitResponse => {
  const { text = '', chunkLen } = props;

  if (!text.trim()) {
    return { chunks: [], chars: 0 };
  }

  const lines = text.split('\n');
  const header = lines[0];

  // A first line without surrounding pipes would produce a negative size and
  // make `new Array` throw a RangeError, so clamp to zero columns.
  const headerSize = Math.max(header.split('|').length - 2, 0);
  const mdSplitString = `| ${new Array(headerSize).fill('---').join(' | ')} |`;
  const chunkPrefix = `${header}\n${mdSplitString}\n`;
  const maxChunkLen = chunkLen * 1.2;

  const chunks: string[] = [];
  let chunk = chunkPrefix;
  let rowsInChunk = 0;

  // Index 0 is the header and index 1 the original separator row.
  for (let i = 2; i < lines.length; i++) {
    const row = lines[i];

    // Blank lines carry no table data.
    if (!row.trim()) continue;

    // Only flush a chunk that already holds rows; an oversized first row must
    // not leave a header-only chunk behind.
    if (rowsInChunk > 0 && chunk.length + row.length > maxChunkLen) {
      chunks.push(chunk);
      chunk = chunkPrefix;
      rowsInChunk = 0;
    }

    chunk += `${row}\n`;
    rowsInChunk++;
  }

  // Flush the trailing chunk. A table without data rows still yields its
  // header so the content is not silently dropped.
  if (rowsInChunk > 0 || chunks.length === 0) {
    chunks.push(chunk);
  }

  return {
    chunks,
    chars: chunks.reduce((sum, item) => sum + item.length, 0)
  };
};

const commonSplit = (props: SplitProps): SplitResponse => {
let { text = '', chunkLen, overlapRatio = 0.2, customReg = [] } = props;

const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
const overlapLen = Math.round(chunkLen * overlapRatio);
Expand Down Expand Up @@ -253,3 +290,29 @@ export const splitText2Chunks = (props: {
throw new Error(getErrText(err));
}
};

/**
 * Split text into chunks.
 *
 * The text is first cut at every `CUSTOM_SPLIT_SIGN`. Each resulting segment is
 * then split on its own: with `markdownTableSplit` when `strIsMdTable` reports
 * it as a markdown table, otherwise with `commonSplit`.
 *
 * @param props - `text` to split, plus the options forwarded unchanged to the
 *   segment splitter: `chunkLen` (length of one chunk; max 3500 per the
 *   original note, and must be larger than the overlap), `overlapRatio` and
 *   `customReg`.
 * @returns the chunks of all segments in order, and the sum of the `chars`
 *   reported for each segment.
 */
export const splitText2Chunks = (props: SplitProps): SplitResponse => {
  const { text = '' } = props;

  const splitResult = text.split(CUSTOM_SPLIT_SIGN).map((item) => {
    // Classify and split the segment itself. Passing the whole `props` here
    // would re-split the full text (split sign included) once per segment and
    // duplicate every chunk.
    const segmentProps: SplitProps = { ...props, text: item };

    return strIsMdTable(item) ? markdownTableSplit(segmentProps) : commonSplit(segmentProps);
  });

  return {
    chunks: splitResult.flatMap((item) => item.chunks),
    chars: splitResult.reduce((sum, item) => sum + item.chars, 0)
  };
};
2 changes: 1 addition & 1 deletion packages/service/common/string/tiktoken/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ export const countGptMessagesTokens = (

// 检测是否有内存泄漏
// addLog.info(`Count token time: ${Date.now() - start}, token: ${data}`);
// console.log(Object.keys(global.tiktokenWorker.callbackMap));
// console.log(process.memoryUsage());
};

worker.postMessage({
Expand Down
4 changes: 2 additions & 2 deletions packages/service/common/vectorStore/controller.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@ export type InsertVectorProps = {
export type EmbeddingRecallProps = {
teamId: string;
datasetIds: string[];
similarity?: number;
efSearch?: number;
// similarity?: number;
// efSearch?: number;
};
8 changes: 3 additions & 5 deletions packages/service/common/vectorStore/pg/controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -129,17 +129,15 @@ export const embeddingRecall = async (
): Promise<{
results: EmbeddingRecallItemType[];
}> => {
const { teamId, datasetIds, vectors, limit, similarity = 0, retry = 2, efSearch = 100 } = props;
const { datasetIds, vectors, limit, retry = 2 } = props;

try {
const results: any = await PgClient.query(
`BEGIN;
SET LOCAL hnsw.ef_search = ${efSearch};
SET LOCAL hnsw.ef_search = ${global.systemEnv?.pgHNSWEfSearch || 100};
select id, collection_id, vector <#> '[${vectors[0]}]' AS score
from ${PgDatasetTableName}
where team_id='${teamId}'
AND dataset_id IN (${datasetIds.map((id) => `'${String(id)}'`).join(',')})
AND vector <#> '[${vectors[0]}]' < -${similarity}
where dataset_id IN (${datasetIds.map((id) => `'${String(id)}'`).join(',')})
order by score limit ${limit};
COMMIT;`
);
Expand Down
3 changes: 1 addition & 2 deletions packages/service/core/dataset/search/controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
teamId,
datasetIds,
vectors,
limit,
efSearch: global.systemEnv?.pgHNSWEfSearch
limit
});

// get q and a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ const completions = async ({
});
const answer = data.choices?.[0].message?.content || '';

console.log(JSON.stringify(chats2GPTMessages({ messages, reserveId: false }), null, 2));
console.log(answer, '----');
// console.log(JSON.stringify(chats2GPTMessages({ messages, reserveId: false }), null, 2));
// console.log(answer, '----');

const id =
agents.find((item) => answer.includes(item.key))?.key ||
Expand Down
10 changes: 7 additions & 3 deletions packages/service/worker/file/extension/csv.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,13 @@ export const readCsvRawText = async (params: ReadRawTextByBuffer): Promise<ReadF

const header = csvArr[0];

const formatText = header
? csvArr.map((item) => item.map((item, i) => `${header[i]}:${item}`).join('\n')).join('\n')
: '';
// format to md table
const formatText = `| ${header.join(' | ')} |
| ${header.map(() => '---').join(' | ')} |
${csvArr
.slice(1)
.map((row) => `| ${row.map((item) => item.replace(/\n/g, '\\n')).join(' | ')} |`)
.join('\n')}`;

return {
rawText,
Expand Down
27 changes: 14 additions & 13 deletions packages/service/worker/file/extension/xlsx.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { CUSTOM_SPLIT_SIGN } from '@fastgpt/global/common/string/textSplitter';
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
import xlsx from 'node-xlsx';
import Papa from 'papaparse';
Expand All @@ -18,25 +19,25 @@ export const readXlsxRawText = async ({
});

const rawText = format2Csv.map((item) => item.csvText).join('\n');

const formatText = format2Csv
.map((item) => {
const csvArr = Papa.parse(item.csvText).data as string[][];
const header = csvArr[0];

const formatText = header
? csvArr
.map((item) =>
item
.map((item, i) => (item ? `${header[i]}:${item}` : ''))
.filter(Boolean)
.join('\n')
)
.join('\n')
: '';

return `${item.title}\n${formatText}`;
if (!header) return;

const formatText = `| ${header.join(' | ')} |
| ${header.map(() => '---').join(' | ')} |
${csvArr
.slice(1)
.map((row) => `| ${row.map((item) => item.replace(/\n/g, '\\n')).join(' | ')} |`)
.join('\n')}`;

return formatText;
})
.join('\n');
.filter(Boolean)
.join(CUSTOM_SPLIT_SIGN);

return {
rawText: rawText,
Expand Down
2 changes: 1 addition & 1 deletion packages/service/worker/file/read.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,5 +67,5 @@ parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
});
}

global?.close?.();
process.exit();
});
3 changes: 1 addition & 2 deletions packages/service/worker/htmlStr2Md/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,5 @@ parentPort?.on('message', (params: { html: string }) => {
data: error
});
}

global?.close?.();
process.exit();
});
Loading

0 comments on commit 434af56

Please sign in to comment.