feat: 增加语音识别功能和相关UI改进

在备忘录编辑器中添加语音识别功能,支持中文转写,并优化了录音按钮的UI。更新了index.html以支持新的资源和样式。
pull/4439/head
nick 4 weeks ago
parent 7a1c4e7e3e
commit 2ba2f47030

@ -1,11 +1,25 @@
<!DOCTYPE html>
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon.png" />
<link rel="icon" type="image/webp" href="/logo.webp" />
<link rel="manifest" href="/site.webmanifest" />
<meta name="theme-color" media="(prefers-color-scheme: light)" content="#f4f4f5" />
<meta name="theme-color" media="(prefers-color-scheme: dark)" content="#18181b" />
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no" />
<!-- memos.metadata.head -->
<title>Memos</title>
<script type="module" crossorigin src="/assets/index-BPauSa-q.js"></script>
<link rel="modulepreload" crossorigin href="/assets/mui-vendor-Bq4rR2hV.js">
<link rel="modulepreload" crossorigin href="/assets/utils-vendor-CofsvS4N.js">
<link rel="modulepreload" crossorigin href="/assets/mermaid-vendor-CTsb84w1.js">
<link rel="modulepreload" crossorigin href="/assets/katex-vendor-ChWnQ-fc.js">
<link rel="modulepreload" crossorigin href="/assets/leaflet-vendor-DFXsBYSp.js">
<link rel="stylesheet" crossorigin href="/assets/index-Dpyk-JdG.css">
</head>
<body>
No embeddable frontend found.
<body class="text-base w-full min-h-svh bg-zinc-50 dark:bg-zinc-900">
<div id="root" class="relative w-full min-h-full"></div>
<!-- memos.metadata.body -->
</body>
</html>

@ -1,17 +1,169 @@
import { Button } from "@usememos/mui";
import { MicIcon, StopCircleIcon } from "lucide-react";
import { useCallback, useContext, useState } from "react";
import { useCallback, useContext, useState, useRef } from "react";
import toast from "react-hot-toast";
import { resourceStore } from "@/store/v2";
import { Resource } from "@/types/proto/api/v1/resource_service";
import { useTranslate } from "@/utils/i18n";
import { MemoEditorContext } from "../types";
// Minimal hand-rolled typings for the Web Speech API (not part of every TS DOM
// lib build). Shapes mirror lib.dom.d.ts conventions for event-handler
// properties, hence the `=> any` return types on the handlers.
// NOTE(review): the `I` prefix is unconventional in TS; renaming would touch
// every usage site, so it is left as-is here.
interface ISpeechRecognition extends EventTarget {
// Keep recognizing after each utterance instead of stopping at the first result.
continuous: boolean;
// Deliver provisional (not-yet-final) transcripts while the user speaks.
interimResults: boolean;
// BCP-47 language tag, e.g. "zh-CN".
lang: string;
start(): void;
stop(): void;
abort(): void;
onstart: ((this: ISpeechRecognition, ev: Event) => any) | null;
onresult: ((this: ISpeechRecognition, ev: SpeechRecognitionEvent) => any) | null;
onerror: ((this: ISpeechRecognition, ev: SpeechRecognitionErrorEvent) => any) | null;
onend: ((this: ISpeechRecognition, ev: Event) => any) | null;
}
// Result event: `results` accumulates over the session; `resultIndex` marks the
// first entry that changed since the previous event.
interface SpeechRecognitionEvent extends Event {
resultIndex: number;
results: SpeechRecognitionResultList;
}
// Error event: `error` is a short error-code string such as "not-allowed".
interface SpeechRecognitionErrorEvent extends Event {
error: string;
}
// Expose both the standard and the WebKit-prefixed constructors on `window` so
// the feature detection and construction below type-check.
declare global {
interface Window {
SpeechRecognition: new () => ISpeechRecognition;
webkitSpeechRecognition: new () => ISpeechRecognition;
}
}
// Toolbar button that records microphone audio and, in parallel, live-transcribes
// speech into the memo editor via the Web Speech API.
const RecordAudioButton = () => {
const t = useTranslate();
const context = useContext(MemoEditorContext);
// Whether the MediaRecorder session is active (drives the mic/stop icon swap).
const [isRecording, setIsRecording] = useState(false);
const [mediaRecorder, setMediaRecorder] = useState<MediaRecorder | null>(null);
// Whether speech recognition is currently running (drives the green tint).
const [isTranscribing, setIsTranscribing] = useState(false);
const speechRecognitionRef = useRef<ISpeechRecognition | null>(null);
// Transcription bookkeeping: finalized text, still-provisional (interim) text,
// and the editor offset where transcribed text is spliced in.
const interimTranscriptRef = useRef<string>('');
const finalTranscriptRef = useRef<string>('');
const insertPositionRef = useRef<number>(0);
// Feature-detect the Web Speech API: either the standard constructor or the
// WebKit-prefixed one must be present on `window`.
const isSpeechRecognitionSupported = (): boolean =>
  "SpeechRecognition" in window || "webkitSpeechRecognition" in window;
// Build and configure a SpeechRecognition instance wired to the memo editor.
// Returns null when the browser does not support the Web Speech API.
const initSpeechRecognition = useCallback(() => {
if (!isSpeechRecognitionSupported()) {
return null;
}
// Prefer the standard constructor, fall back to the WebKit-prefixed one.
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
const recognition = new SpeechRecognition();
recognition.continuous = true;
recognition.interimResults = true;
recognition.lang = 'zh-CN'; // Default to Chinese; adjust as needed.
recognition.onstart = () => {
setIsTranscribing(true);
console.log('语音识别已开始'); // "speech recognition started" — debug trace; consider removing
// Remember where transcription starts: all transcript text is spliced in at
// the end of whatever content the editor held at this moment.
if (context.editorRef?.current) {
const editor = context.editorRef.current;
const currentContent = editor.getContent();
insertPositionRef.current = currentContent.length;
// Reset transcript bookkeeping for the new session.
interimTranscriptRef.current = '';
finalTranscriptRef.current = '';
}
};
recognition.onresult = (event: SpeechRecognitionEvent) => {
let interimTranscript = '';
let finalTranscript = '';
// Partition the changed results (from resultIndex onward) into finalized
// text and still-provisional text.
for (let i = event.resultIndex; i < event.results.length; i++) {
const transcript = event.results[i][0].transcript;
if (event.results[i].isFinal) {
finalTranscript += transcript;
} else {
interimTranscript += transcript;
}
}
if (context.editorRef?.current) {
const editor = context.editorRef.current;
const currentContent = editor.getContent();
// Length of the previously inserted transcript, which must be removed
// before the refreshed transcript is re-inserted.
// NOTE(review): this does not count the single space that may be prepended
// below, so the removal slice can be off by one character on events after
// a space was added — verify against a multi-result session.
const oldTextLength = finalTranscriptRef.current.length + interimTranscriptRef.current.length;
// Cut the stale transcript out of the editor content.
if (oldTextLength > 0) {
const newContent = currentContent.slice(0, insertPositionRef.current) +
currentContent.slice(insertPositionRef.current + oldTextLength);
editor.setContent(newContent);
}
// Accumulate finalized text; interim text is replaced wholesale each event.
if (finalTranscript) {
finalTranscriptRef.current += finalTranscript;
}
interimTranscriptRef.current = interimTranscript;
// Re-insert the up-to-date transcript (final + interim).
const newTranscriptText = finalTranscriptRef.current + interimTranscript;
if (newTranscriptText) {
const contentBeforeInsert = editor.getContent();
let textToInsert = newTranscriptText;
// Prepend a space when the character just before the insertion point is
// non-whitespace, so the transcript does not run into existing text.
if (insertPositionRef.current > 0 &&
contentBeforeInsert[insertPositionRef.current - 1] &&
!contentBeforeInsert[insertPositionRef.current - 1].match(/[\s\n]/)) {
textToInsert = ' ' + textToInsert;
}
// Splice the transcript into the editor at the recorded position.
const newContent = contentBeforeInsert.slice(0, insertPositionRef.current) +
textToInsert +
contentBeforeInsert.slice(insertPositionRef.current);
editor.setContent(newContent);
// Move the caret to the end of the inserted transcript.
const cursorPosition = insertPositionRef.current + textToInsert.length;
editor.setCursorPosition(cursorPosition);
}
}
};
recognition.onerror = (event: SpeechRecognitionErrorEvent) => {
console.error('语音识别错误:', event.error); // "speech recognition error"
if (event.error === 'not-allowed') {
// Microphone permission denied — reuse the localized message.
toast.error(t("message.microphone-not-available"));
} else {
// NOTE(review): this toast is hard-coded Chinese, unlike the localized
// branch above — consider routing it through i18n as well.
toast.error(`语音识别错误: ${event.error}`);
}
};
recognition.onend = () => {
setIsTranscribing(false);
console.log('语音识别已结束'); // "speech recognition ended" — debug trace
// Clear transcript bookkeeping when the session ends.
interimTranscriptRef.current = '';
finalTranscriptRef.current = '';
};
return recognition;
}, [t, context]);
// 检测浏览器支持的音频格式
const getSupportedMimeType = () => {
@ -73,6 +225,9 @@ const RecordAudioButton = () => {
}),
});
context.setResourceList([...context.resourceList, resource]);
// 录音完成提示
toast.success(`录音和转写已完成`);
} catch (error: any) {
console.error(error);
toast.error(error.details);
@ -85,11 +240,23 @@ const RecordAudioButton = () => {
recorder.start(1000);
setMediaRecorder(recorder);
setIsRecording(true);
// 开始语音识别
if (isSpeechRecognitionSupported()) {
const recognition = initSpeechRecognition();
if (recognition) {
speechRecognitionRef.current = recognition;
recognition.start();
}
} else {
toast.error("您的浏览器不支持语音识别功能");
}
} catch (error) {
console.error(error);
toast.error(t("message.microphone-not-available"));
}
}, [context, resourceStore, t]);
}, [context, resourceStore, t, initSpeechRecognition]);
const stopRecording = useCallback(() => {
if (mediaRecorder) {
@ -97,10 +264,23 @@ const RecordAudioButton = () => {
setMediaRecorder(null);
setIsRecording(false);
}
// 停止语音识别
if (speechRecognitionRef.current) {
speechRecognitionRef.current.stop();
speechRecognitionRef.current = null;
}
setIsTranscribing(false);
}, [mediaRecorder]);
return (
<Button className="relative" size="sm" variant="plain" onClick={isRecording ? stopRecording : startRecording}>
<Button
className={`p-0 relative ${isTranscribing ? 'text-green-500' : ''}`}
size="sm"
variant="plain"
onClick={isRecording ? stopRecording : startRecording}
>
{isRecording ? <StopCircleIcon className="w-5 h-5 mx-auto text-red-500" /> : <MicIcon className="w-5 h-5 mx-auto" />}
</Button>
);

@ -476,6 +476,7 @@ const MemoEditor = observer((props: Props) => {
}));
},
memoName,
editorRef,
}}
>
<div

@ -1,6 +1,7 @@
import { createContext } from "react";
import { MemoRelation } from "@/types/proto/api/v1/memo_service";
import { Resource } from "@/types/proto/api/v1/resource_service";
import { EditorRefActions } from "../Editor";
interface Context {
resourceList: Resource[];
@ -8,6 +9,7 @@ interface Context {
setResourceList: (resourceList: Resource[]) => void;
setRelationList: (relationList: MemoRelation[]) => void;
memoName?: string;
editorRef?: React.RefObject<EditorRefActions>;
}
export const MemoEditorContext = createContext<Context>({

Loading…
Cancel
Save