4.8-alpha fix (#1424)

labring · May 9, 2024 · 434af56 · 434af56
1 parent 6463427
commit 434af56
Show file tree

Hide file tree

Showing 19 changed files with 252 additions and 145 deletions.
diff --git a/packages/global/common/string/textSplitter.ts b/packages/global/common/string/textSplitter.ts
@@ -1,24 +1,61 @@
 import { getErrText } from '../error/utils';
 import { replaceRegChars } from './tools';
 
-/**
- * text split into chunks
- * chunkLen - one chunk len. max: 3500
- * overlapLen - The size of the before and after Text
- * chunkLen > overlapLen
- * markdown
- */
-export const splitText2Chunks = (props: {
+// Sentinel separating independently-split segments of a text: splitText2Chunks splits on it, and the xlsx reader joins per-sheet tables with it.
+export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
+
+type SplitProps = {
   text: string;
   chunkLen: number;
   overlapRatio?: number;
   customReg?: string[];
-}): {
+};
+
+type SplitResponse = {
   chunks: string[];
   chars: number;
-  overlapRatio?: number;
-} => {
+};
+
+// True when at least one line of `str` looks like a markdown table row: starts with `|` and ends with `|` (an optional trailing `\r` is allowed).
+const strIsMdTable = (str: string) => /^(\|.*\|[\r]*)$/m.test(str);
+/**
+ * Split a markdown table into chunks, repeating the header row and a
+ * generated `| --- |` separator row at the top of every chunk.
+ *
+ * Line 0 of `text` is used as the header and line 1 is assumed to be the
+ * separator row (it is skipped and regenerated); data rows start at line 2.
+ * A chunk is closed once adding the next row would exceed `chunkLen * 1.2`
+ * characters. A single row longer than that limit still gets its own chunk.
+ *
+ * @returns the chunks and the total number of characters across all chunks.
+ */
+const markdownTableSplit = (props: SplitProps): SplitResponse => {
+  const { text = '', chunkLen } = props;
+
+  // Nothing to split: avoid emitting a chunk that only holds a fake header.
+  if (!text.trim()) {
+    return { chunks: [], chars: 0 };
+  }
+
+  const lines = text.split('\n');
+  const header = lines[0];
+
+  // `| a | b |`.split('|') yields ['', ' a ', ' b ', ''], hence the "- 2".
+  // Clamp to 1: a first line without pipes would give a negative size and
+  // `new Array(-1)` throws a RangeError.
+  const headerSize = Math.max(header.split('|').length - 2, 1);
+  const mdSplitString = `| ${new Array(headerSize).fill('---').join(' | ')} |`;
+  const chunkPrefix = `${header}\n${mdSplitString}\n`;
+
+  const chunks: string[] = [];
+  let chunk = chunkPrefix;
+
+  for (let i = 2; i < lines.length; i++) {
+    const row = lines[i];
+    // Only flush a chunk that already holds data rows, so an oversized row
+    // never produces a header-only chunk.
+    const hasRows = chunk.length > chunkPrefix.length;
+    if (hasRows && chunk.length + row.length > chunkLen * 1.2) {
+      chunks.push(chunk);
+      chunk = chunkPrefix;
+    }
+    chunk += `${row}\n`;
+  }
+
+  // Flush the trailing chunk; without this the last rows (or the whole table,
+  // when it fits in one chunk) would be dropped.
+  chunks.push(chunk);
+
+  return {
+    chunks,
+    chars: chunks.reduce((sum, item) => sum + item.length, 0)
+  };
+};
+
+const commonSplit = (props: SplitProps): SplitResponse => {
   let { text = '', chunkLen, overlapRatio = 0.2, customReg = [] } = props;
+
   const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
   const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
   const overlapLen = Math.round(chunkLen * overlapRatio);
@@ -253,3 +290,29 @@ export const splitText2Chunks = (props: {
     throw new Error(getErrText(err));
   }
 };
+
+/**
+ * text split into chunks
+ * chunkLen - one chunk len. max: 3500
+ * overlapLen - The size of the before and after Text
+ * chunkLen > overlapLen
+ * markdown
+ *
+ * The text is first cut on CUSTOM_SPLIT_SIGN. Each segment is then split on
+ * its own: segments that look like a markdown table go through
+ * markdownTableSplit, everything else through commonSplit. The per-segment
+ * results are concatenated in order.
+ */
+export const splitText2Chunks = (props: SplitProps): SplitResponse => {
+  const { text = '' } = props;
+
+  const splitResult = text.split(CUSTOM_SPLIT_SIGN).map((item) => {
+    // Split the segment itself, not the whole text: passing `props` through
+    // unchanged would re-split the full text once per segment.
+    const itemProps: SplitProps = { ...props, text: item };
+
+    return strIsMdTable(item) ? markdownTableSplit(itemProps) : commonSplit(itemProps);
+  });
+
+  return {
+    chunks: splitResult.flatMap((item) => item.chunks),
+    chars: splitResult.reduce((sum, item) => sum + item.chars, 0)
+  };
+};
diff --git a/packages/service/common/string/tiktoken/index.ts b/packages/service/common/string/tiktoken/index.ts
@@ -57,7 +57,7 @@ export const countGptMessagesTokens = (
 
       // 检测是否有内存泄漏
       // addLog.info(`Count token time: ${Date.now() - start}, token: ${data}`);
-      // console.log(Object.keys(global.tiktokenWorker.callbackMap));
+      // console.log(process.memoryUsage());
     };
 
     worker.postMessage({

diff --git a/packages/service/common/vectorStore/controller.d.ts b/packages/service/common/vectorStore/controller.d.ts
@@ -15,6 +15,6 @@ export type InsertVectorProps = {
 export type EmbeddingRecallProps = {
   teamId: string;
   datasetIds: string[];
-  similarity?: number;
-  efSearch?: number;
+  // similarity?: number;
+  // efSearch?: number;
 };
diff --git a/packages/service/common/vectorStore/pg/controller.ts b/packages/service/common/vectorStore/pg/controller.ts
@@ -129,17 +129,15 @@ export const embeddingRecall = async (
 ): Promise<{
   results: EmbeddingRecallItemType[];
 }> => {
-  const { teamId, datasetIds, vectors, limit, similarity = 0, retry = 2, efSearch = 100 } = props;
+  const { datasetIds, vectors, limit, retry = 2 } = props;
 
   try {
     const results: any = await PgClient.query(
       `BEGIN;
-        SET LOCAL hnsw.ef_search = ${efSearch};
+        SET LOCAL hnsw.ef_search = ${global.systemEnv?.pgHNSWEfSearch || 100};
         select id, collection_id, vector <#> '[${vectors[0]}]' AS score 
           from ${PgDatasetTableName} 
-          where team_id='${teamId}' 
-              AND dataset_id IN (${datasetIds.map((id) => `'${String(id)}'`).join(',')})
-              AND vector <#> '[${vectors[0]}]' < -${similarity}
+          where dataset_id IN (${datasetIds.map((id) => `'${String(id)}'`).join(',')})
           order by score limit ${limit};
         COMMIT;`
     );

diff --git a/packages/service/core/dataset/search/controller.ts b/packages/service/core/dataset/search/controller.ts
@@ -85,8 +85,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
       teamId,
       datasetIds,
       vectors,
-      limit,
-      efSearch: global.systemEnv?.pgHNSWEfSearch
+      limit
     });
 
     // get q and a

diff --git a/packages/service/core/workflow/dispatch/agent/classifyQuestion.ts b/packages/service/core/workflow/dispatch/agent/classifyQuestion.ts
@@ -127,8 +127,8 @@ const completions = async ({
   });
   const answer = data.choices?.[0].message?.content || '';
 
-  console.log(JSON.stringify(chats2GPTMessages({ messages, reserveId: false }), null, 2));
-  console.log(answer, '----');
+  // console.log(JSON.stringify(chats2GPTMessages({ messages, reserveId: false }), null, 2));
+  // console.log(answer, '----');
 
   const id =
     agents.find((item) => answer.includes(item.key))?.key ||

diff --git a/packages/service/worker/file/extension/csv.ts b/packages/service/worker/file/extension/csv.ts
@@ -10,9 +10,13 @@ export const readCsvRawText = async (params: ReadRawTextByBuffer): Promise<ReadF
 
   const header = csvArr[0];
 
-  const formatText = header
-    ? csvArr.map((item) => item.map((item, i) => `${header[i]}:${item}`).join('\n')).join('\n')
-    : '';
+  // format to md table
+  const formatText = `| ${header.join(' | ')} |
+| ${header.map(() => '---').join(' | ')} |
+${csvArr
+  .slice(1)
+  .map((row) => `| ${row.map((item) => item.replace(/\n/g, '\\n')).join(' | ')} |`)
+  .join('\n')}`;
 
   return {
     rawText,

diff --git a/packages/service/worker/file/extension/xlsx.ts b/packages/service/worker/file/extension/xlsx.ts
@@ -1,3 +1,4 @@
+import { CUSTOM_SPLIT_SIGN } from '@fastgpt/global/common/string/textSplitter';
 import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
 import xlsx from 'node-xlsx';
 import Papa from 'papaparse';
@@ -18,25 +19,25 @@ export const readXlsxRawText = async ({
   });
 
   const rawText = format2Csv.map((item) => item.csvText).join('\n');
+
   const formatText = format2Csv
     .map((item) => {
       const csvArr = Papa.parse(item.csvText).data as string[][];
       const header = csvArr[0];
 
-      const formatText = header
-        ? csvArr
-            .map((item) =>
-              item
-                .map((item, i) => (item ? `${header[i]}:${item}` : ''))
-                .filter(Boolean)
-                .join('\n')
-            )
-            .join('\n')
-        : '';
-
-      return `${item.title}\n${formatText}`;
+      if (!header) return;
+
+      const formatText = `| ${header.join(' | ')} |
+      | ${header.map(() => '---').join(' | ')} |
+      ${csvArr
+        .slice(1)
+        .map((row) => `| ${row.map((item) => item.replace(/\n/g, '\\n')).join(' | ')} |`)
+        .join('\n')}`;
+
+      return formatText;
     })
-    .join('\n');
+    .filter(Boolean)
+    .join(CUSTOM_SPLIT_SIGN);
 
   return {
     rawText: rawText,

diff --git a/packages/service/worker/file/read.ts b/packages/service/worker/file/read.ts
@@ -67,5 +67,5 @@ parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
     });
   }
 
-  global?.close?.();
+  process.exit();
 });
diff --git a/packages/service/worker/htmlStr2Md/index.ts b/packages/service/worker/htmlStr2Md/index.ts
@@ -15,6 +15,5 @@ parentPort?.on('message', (params: { html: string }) => {
       data: error
     });
   }
-
-  global?.close?.();
+  process.exit();
 });
-Original file line number
+Diff line change
@@ Expand Up @@
         });
       }
-      global?.close?.();
+      process.exit();
     });