ChatGPTNextWeb · Dakai · Oct 13, 2024 · Oct 13, 2024 · Oct 13, 2024 · Oct 13, 2024
diff --git a/app/client/api.ts b/app/client/api.ts
@@ -39,6 +39,7 @@ export interface MultimodalContent {
 export interface RequestMessage {
   role: MessageRole;
   content: string | MultimodalContent[];
+  audio_url?: string; // optional URL of an audio clip attached to the message
 }
 
 export interface LLMConfig {

diff --git a/app/components/chat.module.scss b/app/components/chat.module.scss
@@ -443,6 +443,10 @@
   transition: all ease 0.3s;
 }
 
+.audio-message { // added to the message bubble in chat.tsx when audio_url is set
+  min-width: 350px;
+}
+
 .chat-message-item-image {
   width: 100%;
   margin-top: 10px;
@@ -471,6 +475,10 @@
   border: rgba($color: #888, $alpha: 0.2) 1px solid;
 }
 
+.chat-message-item-audio { // class of the <audio> player rendered in chat.tsx
+  margin-top: 10px;
+  width: 100%;
+}
 
 @media only screen and (max-width: 600px) {
   $calc-image-width: calc(100vw/3*2/var(--image-count));
@@ -519,7 +527,7 @@
   background-color: var(--second);
 
   &:hover {
-    min-width: 0;
+    //min-width: 350px;
   }
 }
 
@@ -693,4 +701,4 @@
 .shortcut-key span {
   font-size: 12px;
   color: var(--black);
-}
+}
diff --git a/app/components/chat.tsx b/app/components/chat.tsx
@@ -117,7 +117,7 @@ import { MultimodalContent } from "../client/api";
 
 const localStorage = safeLocalStorage();
 import { ClientApi } from "../client/api";
-import { createTTSPlayer } from "../utils/audio";
+import { createTTSPlayer, arrayBufferToWav } from "../utils/audio";
 import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts";
 
 const ttsPlayer = createTTSPlayer();
@@ -1121,6 +1121,14 @@ function _Chat() {
     );
   };
 
+  const updateMessageAudio = (msgId?: string, audio_url?: string) => {
+    chatStore.updateCurrentSession((session) => {
+      session.messages = session.messages.map((m) => // only the message whose id matches is replaced
+        m.id === msgId ? { ...m, audio_url } : m, // copy; an undefined audio_url overwrites too
+      );
+    });
+  };
+
   const onDelete = (msgId: string) => {
     deleteMessage(msgId);
   };
@@ -1197,7 +1205,7 @@ function _Chat() {
   const accessStore = useAccessStore();
   const [speechStatus, setSpeechStatus] = useState(false);
   const [speechLoading, setSpeechLoading] = useState(false);
-  async function openaiSpeech(text: string) {
+  async function openaiSpeech(text: string): Promise<string | undefined> {
     if (speechStatus) {
       ttsPlayer.stop();
       setSpeechStatus(false);
@@ -1227,16 +1235,22 @@ function _Chat() {
         });
       }
       setSpeechStatus(true);
-      ttsPlayer
-        .play(audioBuffer, () => {
-          setSpeechStatus(false);
-        })
-        .catch((e) => {
-          console.error("[OpenAI Speech]", e);
-          showToast(prettyObject(e));
+      try {
+        const waveFile = arrayBufferToWav(audioBuffer);
+        const audioFile = new Blob([waveFile], { type: "audio/wav" });
+
+        const audioUrl: string = await uploadImageRemote(audioFile);
+        await ttsPlayer.play(audioBuffer, () => {
           setSpeechStatus(false);
-        })
-        .finally(() => setSpeechLoading(false));
+        });
+        return audioUrl;
+      } catch (e) {
+        console.error("[Speech Error]", e);
+        showToast(prettyObject(e));
+        setSpeechStatus(false);
+      } finally {
+        setSpeechLoading(false);
+      }
     }
   }
 
@@ -1793,9 +1807,12 @@ function _Chat() {
                                       <SpeakIcon />
                                     )
                                   }
-                                  onClick={() =>
-                                    openaiSpeech(getMessageTextContent(message))
-                                  }
+                                  onClick={async () => {
+                                    // No URL (stop/error): keep stored audio
+                                    const text = getMessageTextContent(message);
+                                    const url = await openaiSpeech(text);
+                                    if (url) updateMessageAudio(message.id, url);
+                                  }}
                                 />
                               )}
                             </>
@@ -1830,7 +1847,11 @@ function _Chat() {
                       ))}
                     </div>
                   )}
-                  <div className={styles["chat-message-item"]}>
+                  <div
+                    className={`${styles["chat-message-item"]} ${
+                      message.audio_url ? styles["audio-message"] : ""
+                    }`}
+                  >
                     <Markdown
                       key={message.streaming ? "loading" : "done"}
                       content={getMessageTextContent(message)}
@@ -1879,6 +1900,16 @@ function _Chat() {
                         })}
                       </div>
                     )}
+                    {message.audio_url && ( // clip is uploaded as an "audio/wav" Blob
+                      <audio
+                        preload="auto"
+                        controls
+                        className={styles["chat-message-item-audio"]}
+                      >
+                        <source type="audio/wav" src={message.audio_url} />
+                        Sorry, your browser does not support HTML5 audio.
+                      </audio>
+                    )}
                   </div>
 
                   <div className={styles["chat-message-action-date"]}>

diff --git a/app/icons/play.svg b/app/icons/play.svg
diff --git a/app/icons/stop.svg b/app/icons/stop.svg
diff --git a/app/styles/globals.scss b/app/styles/globals.scss
@@ -399,3 +399,19 @@ pre {
 .copyable {
   user-select: text;
 }
+
+// Compact height for every native <audio> player (this rule is global).
+audio {
+  height: 35px;
+}
+// One rule per engine: a selector list containing a pseudo-element the
+// browser does not recognise is dropped as a whole, so mixing the -webkit-
+// and -moz- selectors in a single list can disable the rule everywhere.
+audio::-webkit-media-controls-play-button,
+audio::-webkit-media-controls-panel {
+  background: none;
+}
+audio::-moz-media-controls-play-button,
+audio::-moz-media-controls-panel {
+  background: none;
+}
diff --git a/app/utils/audio.ts b/app/utils/audio.ts
@@ -43,3 +43,75 @@ export function createTTSPlayer(): TTSPlayer {
 
   return { init, play, stop };
 }
+
+/**
+ * Wraps raw PCM bytes in a 44-byte RIFF/WAVE header.
+ *
+ * The input is copied verbatim after the header and is never decoded, so it
+ * must already be uncompressed PCM that matches the header fields; compressed
+ * audio (e.g. MP3) would not produce a valid WAV file.
+ *
+ * @param buffer - Raw PCM sample data.
+ * @param format - Optional overrides; defaults are mono, 24 kHz, 16-bit.
+ * @returns A new ArrayBuffer: the header followed by the bytes of `buffer`.
+ */
+export function arrayBufferToWav(
+  buffer: ArrayBuffer,
+  format: {
+    numOfChannels?: number;
+    sampleRate?: number;
+    bitsPerSample?: number;
+  } = {},
+): ArrayBuffer {
+  const { numOfChannels = 1, sampleRate = 24000, bitsPerSample = 16 } = format;
+
+  const bytesPerSample = bitsPerSample / 8;
+  const blockAlign = numOfChannels * bytesPerSample;
+  const byteRate = sampleRate * blockAlign;
+
+  // WAV header size is 44 bytes
+  const wavHeaderSize = 44;
+  const dataSize = buffer.byteLength;
+  const totalSize = wavHeaderSize + dataSize;
+
+  const wavBuffer = new ArrayBuffer(totalSize);
+  const view = new DataView(wavBuffer);
+
+  // RIFF chunk descriptor
+  writeString(view, 0, "RIFF");
+  view.setUint32(4, totalSize - 8, true); // File size minus RIFF header
+  writeString(view, 8, "WAVE");
+
+  // FMT sub-chunk
+  writeString(view, 12, "fmt ");
+  view.setUint32(16, 16, true); // Sub-chunk size (16 for PCM)
+  view.setUint16(20, 1, true); // Audio format (1 for PCM)
+  view.setUint16(22, numOfChannels, true); // Number of channels
+  view.setUint32(24, sampleRate, true); // Sample rate
+  view.setUint32(28, byteRate, true); // Byte rate
+  view.setUint16(32, blockAlign, true); // Block align
+  view.setUint16(34, bitsPerSample, true); // Bits per sample
+
+  // Data sub-chunk
+  writeString(view, 36, "data");
+  view.setUint32(40, dataSize, true); // Data size
+
+  // Copy the PCM samples in right after the header
+  new Uint8Array(wavBuffer).set(new Uint8Array(buffer), wavHeaderSize);
+
+  return wavBuffer;
+}
+
+/**
+ * Writes `text` into `view` one byte per character, starting at `offset`.
+ *
+ * @throws Error if the text would run past the end of the view.
+ */
+function writeString(view: DataView, offset: number, text: string) {
+  if (offset + text.length > view.byteLength) {
+    throw new Error("String is too long for the available space in DataView");
+  }
+  for (let i = 0; i < text.length; i++) {
+    view.setUint8(offset + i, text.charCodeAt(i));
+  }
+}
diff --git a/package.json b/package.json
@@ -33,8 +33,8 @@
     "html-to-image": "^1.11.11",
     "idb-keyval": "^6.2.1",
     "lodash-es": "^4.17.21",
-    "mermaid": "^10.6.1",
     "markdown-to-txt": "^2.0.1",
+    "mermaid": "^10.6.1",
     "nanoid": "^5.0.3",
     "next": "^14.1.1",
     "node-fetch": "^3.3.1",