feat: 실시간 자동 화자 감지 시스템 구현 (Spectral Centroid 기반)

- Web Audio API rawAnalyser를 오디오 체인에 삽입 (compressor 이전, 원본 신호 분석)
- Spectral Centroid + VAD 기반 100ms 간격 실시간 화자 분류 엔진 구현
- 500ms 윈도우 다수결 투표로 화자 안정성 확보
- 수동 화자 선택 버튼 제거 → 자동 감지 인디케이터로 대체
- 최대 4명까지 자동 화자 프로필 등록 및 speakers 동기화

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
김보곤
2026-02-11 21:48:20 +09:00
parent 4169b5ce4e
commit 9ae6890141

View File

@@ -257,8 +257,8 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
const [recordingTime, setRecordingTime] = useState(0);
const [localSegments, setLocalSegments] = useState([]);
const [interimText, setInterimText] = useState('');
const [currentSpeakerIdx, setCurrentSpeakerIdx] = useState(0);
const [speakers, setSpeakers] = useState([{ name: '화자 1' }, { name: '화자 2' }]);
const [detectedSpeaker, setDetectedSpeaker] = useState('화자 1');
const [speakers, setSpeakers] = useState([{ name: '화자 1' }]);
const [sttLanguage, setSttLanguage] = useState('ko-KR');
// 편집 상태
@@ -289,6 +289,11 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
const audioContextRef = useRef(null);
const analyserRef = useRef(null);
const gainNodeRef = useRef(null);
const rawAnalyserRef = useRef(null);
const speakerDetectorRef = useRef(null);
const speakerProfilesRef = useRef([]);
const recentFeaturesRef = useRef([]);
const detectedSpeakerRef = useRef('화자 1');
const loadMeeting = useCallback(async () => {
try {
@@ -337,14 +342,122 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
}
}, [localSegments, interimText]);
const currentSpeaker = speakers[currentSpeakerIdx] || speakers[0];
const speakerColor = SPEAKER_COLORS[currentSpeakerIdx % SPEAKER_COLORS.length];
// Resolve the color entry for a speaker by their position in the speakers
// list; any name not found falls back to the first palette entry.
const getSpeakerColor = (name) => {
  const position = speakers.findIndex((speaker) => speaker.name === name);
  if (position < 0) return SPEAKER_COLORS[0];
  return SPEAKER_COLORS[position % SPEAKER_COLORS.length];
};
// ===== 화자 분류 (Spectral Centroid 기반) =====
// ===== Speaker classification (spectral-centroid matching) =====
// Matches an incoming (centroid, spread) feature pair against the known
// speaker profiles. On a close match the profile's running averages are
// updated; otherwise a new speaker is registered (up to 4) and mirrored
// into the `speakers` state. Returns the assigned speaker name.
const classifySpeaker = useCallback((centroid, spread) => {
  const profiles = speakerProfilesRef.current;

  // First voiced frame heard becomes speaker 1.
  if (profiles.length === 0) {
    profiles.push({ centroid, spread, count: 1, name: '화자 1' });
    return '화자 1';
  }

  // Locate the nearest profile by relative centroid distance
  // (first profile wins ties, matching indexOf semantics).
  let bestIdx = 0;
  let bestDist = Infinity;
  profiles.forEach((profile, i) => {
    const dist = Math.abs(profile.centroid - centroid) / profile.centroid;
    if (dist < bestDist) {
      bestDist = dist;
      bestIdx = i;
    }
  });

  if (bestDist < 0.15) {
    // Close enough: fold this frame into the profile's running means.
    const matched = profiles[bestIdx];
    matched.centroid = (matched.centroid * matched.count + centroid) / (matched.count + 1);
    matched.spread = (matched.spread * matched.count + spread) / (matched.count + 1);
    matched.count += 1;
    return matched.name;
  }

  if (profiles.length < 4) {
    // No close profile and room left on the roster: register a new speaker.
    const newName = `화자 ${profiles.length + 1}`;
    profiles.push({ centroid, spread, count: 1, name: newName });
    setSpeakers(prev => (
      prev.find(s => s.name === newName) ? prev : [...prev, { name: newName }]
    ));
    return newName;
  }

  // Roster full: assign to the nearest existing profile without updating it.
  return profiles[bestIdx].name;
}, []);
// ===== 화자 감지 (100ms 간격 호출) =====
// ===== Speaker detection (invoked every 100ms while recording) =====
// Reads the raw (pre-compressor) analyser, gates on RMS energy (VAD),
// computes spectral centroid/spread over the voice band, classifies the
// frame ONCE, then majority-votes over the last 500ms of stored labels.
const detectSpeaker = useCallback(() => {
  const analyser = rawAnalyserRef.current;
  if (!analyser) return;

  const sampleRate = audioContextRef.current?.sampleRate || 48000;
  const fftSize = analyser.fftSize;
  const binCount = analyser.frequencyBinCount; // fftSize / 2
  const binHz = sampleRate / fftSize; // Hz covered by each bin

  // Frequency-domain magnitudes (0~255 per bin).
  const freqData = new Uint8Array(binCount);
  analyser.getByteFrequencyData(freqData);

  // Time-domain samples for voice-activity detection.
  const timeData = new Uint8Array(fftSize);
  analyser.getByteTimeDomainData(timeData);

  // RMS of the normalized waveform (VAD).
  let rmsSum = 0;
  for (let i = 0; i < timeData.length; i++) {
    const v = (timeData[i] - 128) / 128;
    rmsSum += v * v;
  }
  const rms = Math.sqrt(rmsSum / timeData.length);

  // VAD: ignore silent frames entirely.
  if (rms < 0.015) return;

  // Restrict analysis to the human-voice band (85Hz ~ 1000Hz).
  const minBin = Math.floor(85 / binHz);
  const maxBin = Math.min(Math.ceil(1000 / binHz), binCount - 1);

  // Spectral centroid: amplitude-weighted mean frequency.
  let weightedSum = 0;
  let amplitudeSum = 0;
  for (let i = minBin; i <= maxBin; i++) {
    const freq = i * binHz;
    const amp = freqData[i];
    weightedSum += freq * amp;
    amplitudeSum += amp;
  }
  if (amplitudeSum < 10) return; // not enough energy to classify
  const centroid = weightedSum / amplitudeSum;

  // Spectral spread: amplitude-weighted std-dev around the centroid.
  let spreadSum = 0;
  for (let i = minBin; i <= maxBin; i++) {
    const freq = i * binHz;
    const amp = freqData[i];
    spreadSum += Math.pow(freq - centroid, 2) * amp;
  }
  const spread = Math.sqrt(spreadSum / amplitudeSum);

  // Classify this frame exactly once and freeze the resulting label.
  // BUGFIX: previously every retained feature was re-run through
  // classifySpeaker on each tick; since classifySpeaker mutates the
  // profile running means/counts (and may register new speakers), each
  // feature was folded into the profiles ~5x, inflating counts and
  // skewing the averages. Classifying once per frame keeps profiles
  // consistent; the vote below uses the stored labels instead.
  const label = classifySpeaker(centroid, spread);

  // Maintain a 500ms sliding window of labeled features.
  const now = Date.now();
  const cutoff = now - 500;
  const features = recentFeaturesRef.current.filter(f => f.timestamp > cutoff);
  features.push({ centroid, spread, timestamp: now, label });
  recentFeaturesRef.current = features;

  // Majority vote over the window's frozen labels.
  const counts = {};
  for (const f of features) {
    counts[f.label] = (counts[f.label] || 0) + 1;
  }
  const winner = Object.entries(counts).sort((a, b) => b[1] - a[1])[0][0];
  setDetectedSpeaker(winner);
}, [classifySpeaker]);
// ===== 녹음 시작 =====
const startRecording = async () => {
try {
@@ -386,8 +499,15 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
// MediaStreamDestination: 처리된 스트림
const destination = audioCtx.createMediaStreamDestination();
// 체인 연결: source → gain → compressor → analyser → destination
source.connect(gainNode);
// rawAnalyser: 화자 감지용 (compressor 이전, 원본 신호 분석)
const rawAnalyser = audioCtx.createAnalyser();
rawAnalyser.fftSize = 2048;
rawAnalyser.smoothingTimeConstant = 0.3;
rawAnalyserRef.current = rawAnalyser;
// 체인 연결: source → rawAnalyser(화자감지) → gain → compressor → analyser → destination
source.connect(rawAnalyser);
rawAnalyser.connect(gainNode);
gainNode.connect(compressor);
compressor.connect(analyser);
analyser.connect(destination);
@@ -429,7 +549,7 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
const now = Date.now();
const startMs = startTimeRef.current ? now - startTimeRef.current : 0;
setLocalSegments(prev => [...prev, {
speaker_name: currentSpeaker.name,
speaker_name: detectedSpeakerRef.current,
text: text.trim(),
start_time_ms: startMs,
end_time_ms: null,
@@ -461,6 +581,13 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
startTimeRef.current = Date.now();
setRecordingTime(0);
setIsRecording(true);
// 화자 감지 시작
speakerProfilesRef.current = [];
recentFeaturesRef.current = [];
setDetectedSpeaker('화자 1');
setSpeakers([{ name: '화자 1' }]);
speakerDetectorRef.current = setInterval(detectSpeaker, 100);
} catch (e) {
showToast('마이크 접근 권한이 필요합니다.', 'error');
}
@@ -469,6 +596,7 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
// isRecording ref (onend에서 접근)
const isRecordingRef = useRef(false);
useEffect(() => { isRecordingRef.current = isRecording; }, [isRecording]);
useEffect(() => { detectedSpeakerRef.current = detectedSpeaker; }, [detectedSpeaker]);
// ===== 녹음 중지 =====
const stopRecording = async () => {
@@ -485,12 +613,19 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
streamRef.current = null;
}
// 화자 감지 중지
if (speakerDetectorRef.current) {
clearInterval(speakerDetectorRef.current);
speakerDetectorRef.current = null;
}
// AudioContext 정리
if (audioContextRef.current) {
audioContextRef.current.close().catch(() => {});
audioContextRef.current = null;
analyserRef.current = null;
gainNodeRef.current = null;
rawAnalyserRef.current = null;
}
// MediaRecorder 중지 → blob 생성
@@ -613,17 +748,6 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
}
};
// ===== Manual speaker switching =====
// Select the active speaker by index.
const switchSpeaker = (idx) => {
  setCurrentSpeakerIdx(idx);
};

// Append a new speaker slot, capped at 4 speakers total.
const addSpeaker = () => {
  if (speakers.length >= 4) return;
  const nextNumber = speakers.length + 1;
  setSpeakers(prev => [...prev, { name: `화자 ${nextNumber}` }]);
};
// ===== 제목 인라인 편집 =====
const saveTitle = async () => {
if (!titleValue.trim()) return;
@@ -881,7 +1005,7 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
{/* Left: Transcript */}
<div className="flex-1 overflow-y-auto p-4" ref={transcriptRef}>
{activeTab === 'conversation' ? (
<ConversationView segments={segments} interimText={interimText} isRecording={isRecording} currentSpeaker={currentSpeaker} getSpeakerColor={getSpeakerColor} editing={editingSegments} onEditText={handleEditText} onEditSpeaker={handleEditSpeaker} onDeleteSegment={handleDeleteSegment} speakers={speakers} />
<ConversationView segments={segments} interimText={interimText} isRecording={isRecording} detectedSpeaker={detectedSpeaker} getSpeakerColor={getSpeakerColor} editing={editingSegments} onEditText={handleEditText} onEditSpeaker={handleEditSpeaker} onDeleteSegment={handleDeleteSegment} speakers={speakers} />
) : (
<ScriptView segments={segments} interimText={interimText} isRecording={isRecording} />
)}
@@ -899,13 +1023,11 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
<RecordingControlBar
isRecording={isRecording}
recordingTime={recordingTime}
currentSpeakerIdx={currentSpeakerIdx}
speakers={speakers}
detectedSpeaker={detectedSpeaker}
speakerCount={speakers.length}
sttLanguage={sttLanguage}
onStart={startRecording}
onStop={stopRecording}
onSwitchSpeaker={switchSpeaker}
onAddSpeaker={addSpeaker}
onLanguageChange={setSttLanguage}
onSummarize={handleSummarize}
onDiarize={handleDiarize}
@@ -918,6 +1040,7 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
micGain={micGain}
onMicGainChange={setMicGain}
gainNodeRef={gainNodeRef}
getSpeakerColor={getSpeakerColor}
/>
</div>
);
@@ -945,7 +1068,7 @@ className={className}
}
// ========== ConversationView ==========
function ConversationView({ segments, interimText, isRecording, currentSpeaker, getSpeakerColor, editing, onEditText, onEditSpeaker, onDeleteSegment, speakers }) {
function ConversationView({ segments, interimText, isRecording, detectedSpeaker, getSpeakerColor, editing, onEditText, onEditSpeaker, onDeleteSegment, speakers }) {
if (segments.length === 0 && !interimText) {
return (
<div className="flex flex-col items-center justify-center h-full text-gray-400">
@@ -1047,9 +1170,9 @@ className="w-full text-sm text-gray-800 leading-relaxed bg-white/70 border borde
{isRecording && interimText && (
<div className="rounded-lg border-2 border-dashed border-gray-300 p-3 bg-white">
<div className="flex items-center gap-2 mb-1.5">
<span className={`inline-flex items-center px-2 py-0.5 rounded-full text-xs font-medium ${getSpeakerColor(currentSpeaker.name).badge}`}>
<span className={`inline-flex items-center px-2 py-0.5 rounded-full text-xs font-medium ${getSpeakerColor(detectedSpeaker).badge}`}>
<span className={`w-2 h-2 rounded-full bg-red-500 mr-1 animate-pulse`}></span>
{currentSpeaker.name}
{detectedSpeaker}
</span>
<span className="text-xs text-gray-400">인식 ...</span>
</div>
@@ -1209,7 +1332,9 @@ function AudioLevelMeter({ analyserRef, isRecording }) {
}
// ========== RecordingControlBar ==========
function RecordingControlBar({ isRecording, recordingTime, currentSpeakerIdx, speakers, sttLanguage, onStart, onStop, onSwitchSpeaker, onAddSpeaker, onLanguageChange, onSummarize, onDiarize, saving, summarizing, diarizing, hasSegments, hasAudio, analyserRef, micGain, onMicGainChange, gainNodeRef }) {
function RecordingControlBar({ isRecording, recordingTime, detectedSpeaker, speakerCount, sttLanguage, onStart, onStop, onLanguageChange, onSummarize, onDiarize, saving, summarizing, diarizing, hasSegments, hasAudio, analyserRef, micGain, onMicGainChange, gainNodeRef, getSpeakerColor }) {
const detectedColor = getSpeakerColor ? getSpeakerColor(detectedSpeaker) : SPEAKER_COLORS[0];
return (
<div className="bg-white border-t shadow-lg px-4 py-3 flex-shrink-0">
<div className="flex items-center justify-between gap-3">
@@ -1220,20 +1345,25 @@ function RecordingControlBar({ isRecording, recordingTime, currentSpeakerIdx, sp
</select>
</div>
{/* Speaker Selection */}
<div className="flex items-center gap-1">
{speakers.map((sp, idx) => {
const c = SPEAKER_COLORS[idx % SPEAKER_COLORS.length];
return (
<button key={idx} onClick={() => onSwitchSpeaker(idx)} className={`px-3 py-1.5 rounded-full text-xs font-medium transition ${currentSpeakerIdx === idx ? `${c.badge} ring-2 ring-offset-1 ring-blue-400` : 'bg-gray-100 text-gray-600 hover:bg-gray-200'}`}>
{sp.name}
</button>
);
})}
{speakers.length < 4 && (
<button onClick={onAddSpeaker} disabled={isRecording} className="px-2 py-1.5 rounded-full text-xs text-gray-400 hover:text-gray-600 hover:bg-gray-100 disabled:opacity-50" title="화자 추가">+</button>
)}
</div>
{/* Auto Speaker Detection Indicator */}
{isRecording ? (
<div className="flex items-center gap-2">
<div className="flex items-center gap-1.5 px-3 py-1.5 rounded-full bg-emerald-50 border border-emerald-200">
<svg className="w-3.5 h-3.5 text-emerald-500" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 11a7 7 0 01-7 7m0 0a7 7 0 01-7-7m7 7v4m0 0H8m4 0h4m-4-8a3 3 0 01-3-3V5a3 3 0 116 0v6a3 3 0 01-3 3z" /></svg>
<span className="text-xs font-medium text-emerald-700">화자 자동 감지</span>
</div>
<span className={`inline-flex items-center px-2.5 py-1 rounded-full text-xs font-medium ${detectedColor.badge} transition-all`}>
<span className={`w-2 h-2 rounded-full ${detectedColor.dot} mr-1 animate-pulse`}></span>
{detectedSpeaker}
</span>
<span className="text-xs text-gray-400">{speakerCount} 감지</span>
</div>
) : (
<div className="flex items-center gap-1.5 px-3 py-1.5 rounded-full bg-gray-50 border border-gray-200">
<svg className="w-3.5 h-3.5 text-gray-400" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 11a7 7 0 01-7 7m0 0a7 7 0 01-7-7m7 7v4m0 0H8m4 0h4m-4-8a3 3 0 01-3-3V5a3 3 0 116 0v6a3 3 0 01-3 3z" /></svg>
<span className="text-xs text-gray-500">화자 자동 감지</span>
</div>
)}
{/* Mic Gain Slider */}
{isRecording && (
@@ -1284,12 +1414,10 @@ className="w-16 h-1 accent-blue-500"
{/* Action Buttons */}
<div className="flex items-center gap-2">
{/* 자동 화자 분리 버튼 */}
<button onClick={onDiarize} disabled={diarizing || summarizing || isRecording || !hasAudio} className="bg-indigo-600 text-white px-3 py-1.5 rounded-lg hover:bg-indigo-700 transition text-xs font-medium disabled:opacity-50 disabled:cursor-not-allowed flex items-center gap-1" title="업로드된 오디오에서 AI로 자동 화자 구분">
<svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M17 20h5v-2a3 3 0 00-5.356-1.857M17 20H7m10 0v-2c0-.656-.126-1.283-.356-1.857M7 20H2v-2a3 3 0 015.356-1.857M7 20v-2c0-.656.126-1.283.356-1.857m0 0a5.002 5.002 0 019.288 0M15 7a3 3 0 11-6 0 3 3 0 016 0z" /></svg>
화자 분리
</button>
{/* AI 요약 버튼 */}
<button onClick={onSummarize} disabled={summarizing || diarizing || isRecording || !hasSegments} className="bg-purple-600 text-white px-3 py-1.5 rounded-lg hover:bg-purple-700 transition text-xs font-medium disabled:opacity-50 disabled:cursor-not-allowed flex items-center gap-1">
<svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9.663 17h4.673M12 3v1m6.364 1.636l-.707.707M21 12h-1M4 12H3m3.343-5.657l-.707-.707m2.828 9.9a5 5 0 117.072 0l-.548.547A3.374 3.374 0 0014 18.469V19a2 2 0 11-4 0v-.531c0-.895-.356-1.754-.988-2.386l-.548-.547z" /></svg>
AI 요약