Skip to content

Commit

Permalink
4.6.2-production (#518)
Browse files Browse the repository at this point in the history
  • Loading branch information
c121914yu authored Nov 26, 2023
1 parent 3acbf1a commit f818260
Show file tree
Hide file tree
Showing 30 changed files with 472 additions and 278 deletions.
14 changes: 7 additions & 7 deletions docSite/content/docs/development/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ weight: 520
"qaMaxProcess": 15, // QA 生成最大进程,结合数据库性能和 key 来设置
"pgHNSWEfSearch": 100 // pg vector 索引参数,越大精度高但速度慢
},
"ChatModels": [
"ChatModels": [ // 对话模型
{
"model": "gpt-3.5-turbo-1106",
"name": "GPT35-1106",
Expand Down Expand Up @@ -76,7 +76,7 @@ weight: 520
"defaultSystemChatPrompt": ""
}
],
"QAModels": [
"QAModels": [ // QA 生成模型
{
"model": "gpt-3.5-turbo-16k",
"name": "GPT35-16k",
Expand All @@ -85,14 +85,14 @@ weight: 520
"price": 0
}
],
"CQModels": [
"CQModels": [ // 问题分类模型
{
"model": "gpt-3.5-turbo-1106",
"name": "GPT35-1106",
"maxContext": 16000,
"maxResponse": 4000,
"price": 0,
"functionCall": true,
"functionCall": true, // 是否支持function call, 不支持的模型需要设置为 false,会走提示词生成
"functionPrompt": ""
},
{
Expand All @@ -105,7 +105,7 @@ weight: 520
"functionPrompt": ""
}
],
"ExtractModels": [
"ExtractModels": [ // 内容提取模型
{
"model": "gpt-3.5-turbo-1106",
"name": "GPT35-1106",
Expand All @@ -116,7 +116,7 @@ weight: 520
"functionPrompt": ""
}
],
"QGModels": [
"QGModels": [ // 生成下一步指引
{
"model": "gpt-3.5-turbo-1106",
"name": "GPT35-1106",
Expand All @@ -125,7 +125,7 @@ weight: 520
"price": 0
}
],
"VectorModels": [
"VectorModels": [ // 向量模型
{
"model": "text-embedding-ada-002",
"name": "Embedding-2",
Expand Down
31 changes: 31 additions & 0 deletions docSite/content/docs/installation/upgrading/462.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
---
title: 'V4.6.2(需要初始化)'
description: 'FastGPT V4.6.2'
icon: 'upgrade'
draft: false
toc: true
weight: 834
---

## 1. 执行初始化 API

发起 1 个 HTTP 请求 ({{rootkey}} 替换成环境变量里的 `rootkey`,{{host}} 替换成自己域名)

1. https://xxxxx/api/admin/initv462

```bash
curl --location --request POST 'https://{{host}}/api/admin/initv462' \
--header 'rootkey: {{rootkey}}' \
--header 'Content-Type: application/json'
```

初始化说明:
1. 初始化全文索引

## V4.6.2 功能介绍

1. 新增 - 全文索引(需配合 Rerank 模型,在看怎么放到开源版,模型接口比较特殊)
2. 新增 - 插件来源(预计4.7/4.8版本会正式使用)
3. 优化 - PDF读取
4. 优化 - docx文件读取,转成 markdown 并保留其图片内容
5. 修复和优化 TextSplitter 函数
220 changes: 139 additions & 81 deletions packages/global/common/string/textSplitter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,126 +3,184 @@ import { countPromptTokens } from './tiktoken';

/**
* text split into chunks
* maxLen - one chunk len. max: 3500
* chunkLen - one chunk len. max: 3500
* overlapLen - The size of the before and after Text
* maxLen > overlapLen
* chunkLen > overlapLen
* markdown
*/
export const splitText2Chunks = (props: { text: string; maxLen: number; overlapLen?: number }) => {
const { text = '', maxLen, overlapLen = Math.floor(maxLen * 0.2) } = props;
const tempMarker = 'SPLIT_HERE_SPLIT_HERE';

const stepReg: Record<number, RegExp> = {
0: /^(#\s[^\n]+)\n/gm,
1: /^(##\s[^\n]+)\n/gm,
2: /^(###\s[^\n]+)\n/gm,
3: /^(####\s[^\n]+)\n/gm,

4: /(\n\n)/g,
5: /([\n])/g,
6: /([。]|(?!<[^a-zA-Z])\.\s)/g,
7: /([!?]|!\s|\?\s)/g,
8: /([;]|;\s)/g,
9: /([,]|,\s)/g
export const splitText2Chunks = (props: {
text: string;
chunkLen: number;
overlapRatio?: number;
}): {
chunks: string[];
tokens: number;
} => {
const { text = '', chunkLen, overlapRatio = 0.2 } = props;
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
const overlapLen = Math.round(chunkLen * overlapRatio);

// The larger maxLen is, the next sentence is less likely to trigger splitting
const stepReges: { reg: RegExp; maxLen: number }[] = [
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },

{ reg: /([\n]{2})/g, maxLen: chunkLen * 1.4 },
{ reg: /([\n](?![\*\-|>`0-9]))/g, maxLen: chunkLen * 1.8 }, // (?![\*\-|>`0-9]): markdown special char
{ reg: /([\n])/g, maxLen: chunkLen * 1.4 },

{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.4 },
{ reg: /([!]|!\s)/g, maxLen: chunkLen * 1.4 },
{ reg: /([?]|\?\s)/g, maxLen: chunkLen * 1.6 },
{ reg: /([;]|;\s)/g, maxLen: chunkLen * 1.8 },
{ reg: /([,]|,\s)/g, maxLen: chunkLen * 2 }
];

const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
if (step >= stepReges.length) {
return [text];
}
const isMarkdownSplit = step <= 3;
const { reg } = stepReges[step];

const splitTexts = text
.replace(reg, isMarkdownSplit ? `${splitMarker}$1` : `$1${splitMarker}`)
.split(`${splitMarker}`)
.filter((part) => part.trim());
return splitTexts;
};

const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
const forbidOverlap = step <= 6;
const maxOverlapLen = chunkLen * 0.4;

// step >= stepReges.length: Do not overlap incomplete sentences
if (forbidOverlap || overlapLen === 0 || step >= stepReges.length) return '';

const splitTexts = getSplitTexts({ text, step });
let overlayText = '';

for (let i = splitTexts.length - 1; i >= 0; i--) {
const currentText = splitTexts[i];
const newText = currentText + overlayText;
const newTextLen = newText.length;

if (newTextLen > overlapLen) {
if (newTextLen > maxOverlapLen) {
const text = getOneTextOverlapText({ text: newText, step: step + 1 });
return text || overlayText;
}
return newText;
}

overlayText = newText;
}
return overlayText;
};

const splitTextRecursively = ({
text = '',
step,
lastChunk,
overlayChunk
lastText
}: {
text: string;
step: number;
lastChunk: string;
overlayChunk: string;
}) => {
if (text.length <= maxLen) {
lastText: string;
}): string[] => {
// mini text
if (text.length <= chunkLen) {
return [text];
}
const reg = stepReg[step];
const isMarkdownSplit = step < 4;

if (!reg) {
// use slice-maxLen to split text
// oversize
if (step >= stepReges.length) {
if (text.length < chunkLen * 3) {
return [text];
}
// use slice-chunkLen to split text
const chunks: string[] = [];
let chunk = '';
for (let i = 0; i < text.length; i += maxLen - overlapLen) {
chunk = text.slice(i, i + maxLen);
chunks.push(chunk);
for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
chunks.push(text.slice(i, i + chunkLen));
}
return chunks;
}

const { maxLen } = stepReges[step];
const minChunkLen = chunkLen * 0.7;

// split text by special char
const splitTexts = (() => {
if (!reg.test(text)) {
return [text];
}
return text
.replace(reg, isMarkdownSplit ? `${tempMarker}$1` : `$1${tempMarker}`)
.split(`${tempMarker}`)
.filter((part) => part);
})();
const splitTexts = getSplitTexts({ text, step });

let chunks: string[] = [];
const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) {
let text = splitTexts[i];
let chunkToken = lastChunk.length;
const textToken = text.length;

// next chunk is too large / new chunk is too large(The current chunk must be smaller than maxLen)
if (textToken >= maxLen || chunkToken + textToken > maxLen * 1.4) {
// last chunk is too large, push it to chunks, not add to next chunk
if (chunkToken > maxLen * 0.7) {
chunks.push(lastChunk);
lastChunk = '';
overlayChunk = '';
const currentText = splitTexts[i];
const currentTextLen = currentText.length;
const lastTextLen = lastText.length;
const newText = lastText + currentText;
const newTextLen = lastTextLen + currentTextLen;

// newText is too large(now, The lastText must be smaller than chunkLen)
if (newTextLen > maxLen) {
// lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText)
if (lastTextLen > minChunkLen) {
chunks.push(lastText);
lastText = getOneTextOverlapText({ text: lastText, step }); // next chunk will start with overlayText
i--;
continue;
}
// chunk is small, insert to next chunks

// split new Text, split chunks must will greater 1 (small lastText)
const innerChunks = splitTextRecursively({
text,
text: newText,
step: step + 1,
lastChunk,
overlayChunk
lastText: ''
});
if (innerChunks.length === 0) continue;
chunks = chunks.concat(innerChunks);
lastChunk = '';
overlayChunk = '';
const lastChunk = innerChunks[innerChunks.length - 1];
// last chunk is too small, concat it to lastText
if (lastChunk.length < minChunkLen) {
chunks.push(...innerChunks.slice(0, -1));
lastText = lastChunk;
} else {
chunks.push(...innerChunks);
// compute new overlapText
lastText = getOneTextOverlapText({
text: lastChunk,
step
});
}
continue;
}

// size less than maxLen, push text to last chunk
lastChunk += text;
chunkToken += textToken; // Definitely less than 1.4 * maxLen

// size over lapLen, push it to next chunk
if (
overlapLen !== 0 &&
!isMarkdownSplit &&
chunkToken >= maxLen - overlapLen &&
textToken < overlapLen
) {
overlayChunk += text;
}
if (chunkToken >= maxLen) {
chunks.push(lastChunk);
lastChunk = overlayChunk;
overlayChunk = '';
// size less than chunkLen, push text to last chunk. now, text definitely less than maxLen
lastText = newText;

// If the chunk size reaches, add a chunk
if (newTextLen >= chunkLen) {
chunks.push(lastText);
lastText = getOneTextOverlapText({ text: lastText, step });
}
}

/* If the last chunk is independent, it needs to be push chunks. */
if (lastChunk && chunks[chunks.length - 1] && !chunks[chunks.length - 1].endsWith(lastChunk)) {
chunks.push(lastChunk);
if (lastText && chunks[chunks.length - 1] && !chunks[chunks.length - 1].endsWith(lastText)) {
if (lastText.length < chunkLen * 0.4) {
chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText;
} else {
chunks.push(lastText);
}
}

return chunks;
};

try {
const chunks = splitTextRecursively({ text, step: 0, lastChunk: '', overlayChunk: '' });
const chunks = splitTextRecursively({
text,
step: 0,
lastText: ''
});

const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);

Expand Down
4 changes: 2 additions & 2 deletions packages/service/common/response/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,13 @@ export function responseWriteController({
readStream: any;
}) {
res.on('drain', () => {
readStream.resume();
readStream?.resume?.();
});

return (text: string | Buffer) => {
const writeResult = res.write(text);
if (!writeResult) {
readStream?.pause();
readStream?.pause?.();
}
};
}
Expand Down
2 changes: 1 addition & 1 deletion pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit f818260

Please sign in to comment.