feat: 실시간 자동 화자 감지 시스템 구현 (Spectral Centroid 기반)

- Web Audio API rawAnalyser를 오디오 체인에 삽입 (compressor 이전, 원본 신호 분석)
- Spectral Centroid + VAD 기반 100ms 간격 실시간 화자 분류 엔진 구현
- 500ms 윈도우 다수결 투표로 화자 안정성 확보
- 수동 화자 선택 버튼 제거 → 자동 감지 인디케이터로 대체
- 최대 4명까지 자동 화자 프로필 등록 및 speakers 동기화

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
김보곤
2026-02-11 21:48:20 +09:00
parent 4169b5ce4e
commit 9ae6890141

View File

@@ -257,8 +257,8 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
const [recordingTime, setRecordingTime] = useState(0);
const [localSegments, setLocalSegments] = useState([]);
const [interimText, setInterimText] = useState('');
const [currentSpeakerIdx, setCurrentSpeakerIdx] = useState(0);
const [speakers, setSpeakers] = useState([{ name: '화자 1' }, { name: '화자 2' }]);
const [detectedSpeaker, setDetectedSpeaker] = useState('화자 1');
const [speakers, setSpeakers] = useState([{ name: '화자 1' }]);
const [sttLanguage, setSttLanguage] = useState('ko-KR');
// 편집 상태
@@ -289,6 +289,11 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
const audioContextRef = useRef(null);
const analyserRef = useRef(null);
const gainNodeRef = useRef(null);
const rawAnalyserRef = useRef(null);
const speakerDetectorRef = useRef(null);
const speakerProfilesRef = useRef([]);
const recentFeaturesRef = useRef([]);
const detectedSpeakerRef = useRef('화자 1');
const loadMeeting = useCallback(async () => {
try {
@@ -337,14 +342,122 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
}
}, [localSegments, interimText]);
const currentSpeaker = speakers[currentSpeakerIdx] || speakers[0];
const speakerColor = SPEAKER_COLORS[currentSpeakerIdx % SPEAKER_COLORS.length];
// Resolve the color entry for a speaker by their position in the speakers
// list; any name not found falls back to the first palette entry.
const getSpeakerColor = (name) => {
  const position = speakers.findIndex((speaker) => speaker.name === name);
  if (position < 0) return SPEAKER_COLORS[0];
  return SPEAKER_COLORS[position % SPEAKER_COLORS.length];
};
// ===== 화자 분류 (Spectral Centroid 기반) =====
// ===== Speaker classification (spectral-centroid matching) =====
// Matches an incoming (centroid, spread) feature pair against the known
// speaker profiles. On a close match the profile's running averages are
// updated; otherwise a new speaker is registered (up to 4) and mirrored
// into the `speakers` state. Returns the assigned speaker name.
const classifySpeaker = useCallback((centroid, spread) => {
  const profiles = speakerProfilesRef.current;

  // First voiced frame heard becomes speaker 1.
  if (profiles.length === 0) {
    profiles.push({ centroid, spread, count: 1, name: '화자 1' });
    return '화자 1';
  }

  // Locate the nearest profile by relative centroid distance
  // (first profile wins ties, matching indexOf semantics).
  let bestIdx = 0;
  let bestDist = Infinity;
  profiles.forEach((profile, i) => {
    const dist = Math.abs(profile.centroid - centroid) / profile.centroid;
    if (dist < bestDist) {
      bestDist = dist;
      bestIdx = i;
    }
  });

  if (bestDist < 0.15) {
    // Close enough: fold this frame into the profile's running means.
    const matched = profiles[bestIdx];
    matched.centroid = (matched.centroid * matched.count + centroid) / (matched.count + 1);
    matched.spread = (matched.spread * matched.count + spread) / (matched.count + 1);
    matched.count += 1;
    return matched.name;
  }

  if (profiles.length < 4) {
    // No close profile and room left on the roster: register a new speaker.
    const newName = `화자 ${profiles.length + 1}`;
    profiles.push({ centroid, spread, count: 1, name: newName });
    setSpeakers(prev => (
      prev.find(s => s.name === newName) ? prev : [...prev, { name: newName }]
    ));
    return newName;
  }

  // Roster full: assign to the nearest existing profile without updating it.
  return profiles[bestIdx].name;
}, []);
// ===== 화자 감지 (100ms 간격 호출) =====
// ===== Speaker detection (invoked every 100ms while recording) =====
// Reads the raw (pre-compressor) analyser, gates on RMS energy (VAD),
// computes spectral centroid/spread over the voice band, classifies the
// frame ONCE, then majority-votes over the last 500ms of stored labels.
const detectSpeaker = useCallback(() => {
  const analyser = rawAnalyserRef.current;
  if (!analyser) return;

  const sampleRate = audioContextRef.current?.sampleRate || 48000;
  const fftSize = analyser.fftSize;
  const binCount = analyser.frequencyBinCount; // fftSize / 2
  const binHz = sampleRate / fftSize; // Hz covered by each bin

  // Frequency-domain magnitudes (0~255 per bin).
  const freqData = new Uint8Array(binCount);
  analyser.getByteFrequencyData(freqData);

  // Time-domain samples for voice-activity detection.
  const timeData = new Uint8Array(fftSize);
  analyser.getByteTimeDomainData(timeData);

  // RMS of the normalized waveform (VAD).
  let rmsSum = 0;
  for (let i = 0; i < timeData.length; i++) {
    const v = (timeData[i] - 128) / 128;
    rmsSum += v * v;
  }
  const rms = Math.sqrt(rmsSum / timeData.length);

  // VAD: ignore silent frames entirely.
  if (rms < 0.015) return;

  // Restrict analysis to the human-voice band (85Hz ~ 1000Hz).
  const minBin = Math.floor(85 / binHz);
  const maxBin = Math.min(Math.ceil(1000 / binHz), binCount - 1);

  // Spectral centroid: amplitude-weighted mean frequency.
  let weightedSum = 0;
  let amplitudeSum = 0;
  for (let i = minBin; i <= maxBin; i++) {
    const freq = i * binHz;
    const amp = freqData[i];
    weightedSum += freq * amp;
    amplitudeSum += amp;
  }
  if (amplitudeSum < 10) return; // not enough energy to classify
  const centroid = weightedSum / amplitudeSum;

  // Spectral spread: amplitude-weighted std-dev around the centroid.
  let spreadSum = 0;
  for (let i = minBin; i <= maxBin; i++) {
    const freq = i * binHz;
    const amp = freqData[i];
    spreadSum += Math.pow(freq - centroid, 2) * amp;
  }
  const spread = Math.sqrt(spreadSum / amplitudeSum);

  // Classify this frame exactly once and freeze the resulting label.
  // BUGFIX: previously every retained feature was re-run through
  // classifySpeaker on each tick; since classifySpeaker mutates the
  // profile running means/counts (and may register new speakers), each
  // feature was folded into the profiles ~5x, inflating counts and
  // skewing the averages. Classifying once per frame keeps profiles
  // consistent; the vote below uses the stored labels instead.
  const label = classifySpeaker(centroid, spread);

  // Maintain a 500ms sliding window of labeled features.
  const now = Date.now();
  const cutoff = now - 500;
  const features = recentFeaturesRef.current.filter(f => f.timestamp > cutoff);
  features.push({ centroid, spread, timestamp: now, label });
  recentFeaturesRef.current = features;

  // Majority vote over the window's frozen labels.
  const counts = {};
  for (const f of features) {
    counts[f.label] = (counts[f.label] || 0) + 1;
  }
  const winner = Object.entries(counts).sort((a, b) => b[1] - a[1])[0][0];
  setDetectedSpeaker(winner);
}, [classifySpeaker]);
// ===== 녹음 시작 =====
const startRecording = async () => {
try {
@@ -386,8 +499,15 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
// MediaStreamDestination: 처리된 스트림
const destination = audioCtx.createMediaStreamDestination();
// 체인 연결: source → gain → compressor → analyser → destination
source.connect(gainNode);
// rawAnalyser: 화자 감지용 (compressor 이전, 원본 신호 분석)
const rawAnalyser = audioCtx.createAnalyser();
rawAnalyser.fftSize = 2048;
rawAnalyser.smoothingTimeConstant = 0.3;
rawAnalyserRef.current = rawAnalyser;
// 체인 연결: source → rawAnalyser(화자감지) → gain → compressor → analyser → destination
source.connect(rawAnalyser);
rawAnalyser.connect(gainNode);
gainNode.connect(compressor);
compressor.connect(analyser);
analyser.connect(destination);
@@ -429,7 +549,7 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
const now = Date.now();
const startMs = startTimeRef.current ? now - startTimeRef.current : 0;
setLocalSegments(prev => [...prev, {
speaker_name: currentSpeaker.name,
speaker_name: detectedSpeakerRef.current,
text: text.trim(),
start_time_ms: startMs,
end_time_ms: null,
@@ -461,6 +581,13 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
startTimeRef.current = Date.now();
setRecordingTime(0);
setIsRecording(true);
// 화자 감지 시작
speakerProfilesRef.current = [];
recentFeaturesRef.current = [];
setDetectedSpeaker('화자 1');
setSpeakers([{ name: '화자 1' }]);
speakerDetectorRef.current = setInterval(detectSpeaker, 100);
} catch (e) {
showToast('마이크 접근 권한이 필요합니다.', 'error');
}
@@ -469,6 +596,7 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
// isRecording ref (onend에서 접근)
const isRecordingRef = useRef(false);
useEffect(() => { isRecordingRef.current = isRecording; }, [isRecording]);
useEffect(() => { detectedSpeakerRef.current = detectedSpeaker; }, [detectedSpeaker]);
// ===== 녹음 중지 =====
const stopRecording = async () => {
@@ -485,12 +613,19 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
streamRef.current = null;
}
// 화자 감지 중지
if (speakerDetectorRef.current) {
clearInterval(speakerDetectorRef.current);
speakerDetectorRef.current = null;
}
// AudioContext 정리
if (audioContextRef.current) {
audioContextRef.current.close().catch(() => {});
audioContextRef.current = null;
analyserRef.current = null;
gainNodeRef.current = null;
rawAnalyserRef.current = null;
}
// MediaRecorder 중지 → blob 생성
@@ -613,17 +748,6 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
}
};
// ===== Manual speaker switching =====
// Select the active speaker by index.
const switchSpeaker = (idx) => {
  setCurrentSpeakerIdx(idx);
};

// Append a new speaker slot, capped at 4 speakers total.
const addSpeaker = () => {
  if (speakers.length >= 4) return;
  const nextNumber = speakers.length + 1;
  setSpeakers(prev => [...prev, { name: `화자 ${nextNumber}` }]);
};
// ===== 제목 인라인 편집 =====
const saveTitle = async () => {
if (!titleValue.trim()) return;
@@ -881,7 +1005,7 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
{/* Left: Transcript */}
<div className="flex-1 overflow-y-auto p-4" ref={transcriptRef}>
{activeTab === 'conversation' ? (
<ConversationView segments={segments} interimText={interimText} isRecording={isRecording} currentSpeaker={currentSpeaker} getSpeakerColor={getSpeakerColor} editing={editingSegments} onEditText={handleEditText} onEditSpeaker={handleEditSpeaker} onDeleteSegment={handleDeleteSegment} speakers={speakers} />
<ConversationView segments={segments} interimText={interimText} isRecording={isRecording} detectedSpeaker={detectedSpeaker} getSpeakerColor={getSpeakerColor} editing={editingSegments} onEditText={handleEditText} onEditSpeaker={handleEditSpeaker} onDeleteSegment={handleDeleteSegment} speakers={speakers} />
) : (
<ScriptView segments={segments} interimText={interimText} isRecording={isRecording} />
)}
@@ -899,13 +1023,11 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
<RecordingControlBar
isRecording={isRecording}
recordingTime={recordingTime}
currentSpeakerIdx={currentSpeakerIdx}
speakers={speakers}
detectedSpeaker={detectedSpeaker}
speakerCount={speakers.length}
sttLanguage={sttLanguage}
onStart={startRecording}
onStop={stopRecording}
onSwitchSpeaker={switchSpeaker}
onAddSpeaker={addSpeaker}
onLanguageChange={setSttLanguage}
onSummarize={handleSummarize}
onDiarize={handleDiarize}
@@ -918,6 +1040,7 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
micGain={micGain}
onMicGainChange={setMicGain}
gainNodeRef={gainNodeRef}
getSpeakerColor={getSpeakerColor}
/>
</div>
);
@@ -945,7 +1068,7 @@ className={className}
}
// ========== ConversationView ==========
function ConversationView({ segments, interimText, isRecording, currentSpeaker, getSpeakerColor, editing, onEditText, onEditSpeaker, onDeleteSegment, speakers }) {
function ConversationView({ segments, interimText, isRecording, detectedSpeaker, getSpeakerColor, editing, onEditText, onEditSpeaker, onDeleteSegment, speakers }) {
if (segments.length === 0 && !interimText) {
return (
<div className="flex flex-col items-center justify-center h-full text-gray-400">
@@ -1047,9 +1170,9 @@ className="w-full text-sm text-gray-800 leading-relaxed bg-white/70 border borde
{isRecording && interimText && (
<div className="rounded-lg border-2 border-dashed border-gray-300 p-3 bg-white">
<div className="flex items-center gap-2 mb-1.5">
<span className={`inline-flex items-center px-2 py-0.5 rounded-full text-xs font-medium ${getSpeakerColor(currentSpeaker.name).badge}`}>
<span className={`inline-flex items-center px-2 py-0.5 rounded-full text-xs font-medium ${getSpeakerColor(detectedSpeaker).badge}`}>
<span className={`w-2 h-2 rounded-full bg-red-500 mr-1 animate-pulse`}></span>
{currentSpeaker.name}
{detectedSpeaker}
</span>
<span className="text-xs text-gray-400">인식 ...</span>
</div>
@@ -1209,7 +1332,9 @@ function AudioLevelMeter({ analyserRef, isRecording }) {
}
// ========== RecordingControlBar ==========
function RecordingControlBar({ isRecording, recordingTime, currentSpeakerIdx, speakers, sttLanguage, onStart, onStop, onSwitchSpeaker, onAddSpeaker, onLanguageChange, onSummarize, onDiarize, saving, summarizing, diarizing, hasSegments, hasAudio, analyserRef, micGain, onMicGainChange, gainNodeRef }) {
function RecordingControlBar({ isRecording, recordingTime, detectedSpeaker, speakerCount, sttLanguage, onStart, onStop, onLanguageChange, onSummarize, onDiarize, saving, summarizing, diarizing, hasSegments, hasAudio, analyserRef, micGain, onMicGainChange, gainNodeRef, getSpeakerColor }) {
const detectedColor = getSpeakerColor ? getSpeakerColor(detectedSpeaker) : SPEAKER_COLORS[0];
return (
<div className="bg-white border-t shadow-lg px-4 py-3 flex-shrink-0">
<div className="flex items-center justify-between gap-3">
@@ -1220,20 +1345,25 @@ function RecordingControlBar({ isRecording, recordingTime, currentSpeakerIdx, sp
</select>
</div>
{/* Speaker Selection */}
<div className="flex items-center gap-1">
{speakers.map((sp, idx) => {
const c = SPEAKER_COLORS[idx % SPEAKER_COLORS.length];
return (
<button key={idx} onClick={() => onSwitchSpeaker(idx)} className={`px-3 py-1.5 rounded-full text-xs font-medium transition ${currentSpeakerIdx === idx ? `${c.badge} ring-2 ring-offset-1 ring-blue-400` : 'bg-gray-100 text-gray-600 hover:bg-gray-200'}`}>
{sp.name}
</button>
);
})}
{speakers.length < 4 && (
<button onClick={onAddSpeaker} disabled={isRecording} className="px-2 py-1.5 rounded-full text-xs text-gray-400 hover:text-gray-600 hover:bg-gray-100 disabled:opacity-50" title="화자 추가">+</button>
)}
</div>
{/* Auto Speaker Detection Indicator */}
{isRecording ? (
<div className="flex items-center gap-2">
<div className="flex items-center gap-1.5 px-3 py-1.5 rounded-full bg-emerald-50 border border-emerald-200">
<svg className="w-3.5 h-3.5 text-emerald-500" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 11a7 7 0 01-7 7m0 0a7 7 0 01-7-7m7 7v4m0 0H8m4 0h4m-4-8a3 3 0 01-3-3V5a3 3 0 116 0v6a3 3 0 01-3 3z" /></svg>
<span className="text-xs font-medium text-emerald-700">화자 자동 감지</span>
</div>
<span className={`inline-flex items-center px-2.5 py-1 rounded-full text-xs font-medium ${detectedColor.badge} transition-all`}>
<span className={`w-2 h-2 rounded-full ${detectedColor.dot} mr-1 animate-pulse`}></span>
{detectedSpeaker}
</span>
<span className="text-xs text-gray-400">{speakerCount} 감지</span>
</div>
) : (
<div className="flex items-center gap-1.5 px-3 py-1.5 rounded-full bg-gray-50 border border-gray-200">
<svg className="w-3.5 h-3.5 text-gray-400" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 11a7 7 0 01-7 7m0 0a7 7 0 01-7-7m7 7v4m0 0H8m4 0h4m-4-8a3 3 0 01-3-3V5a3 3 0 116 0v6a3 3 0 01-3 3z" /></svg>
<span className="text-xs text-gray-500">화자 자동 감지</span>
</div>
)}
{/* Mic Gain Slider */}
{isRecording && (
@@ -1284,12 +1414,10 @@ className="w-16 h-1 accent-blue-500"
{/* Action Buttons */}
<div className="flex items-center gap-2">
{/* 자동 화자 분리 버튼 */}
<button onClick={onDiarize} disabled={diarizing || summarizing || isRecording || !hasAudio} className="bg-indigo-600 text-white px-3 py-1.5 rounded-lg hover:bg-indigo-700 transition text-xs font-medium disabled:opacity-50 disabled:cursor-not-allowed flex items-center gap-1" title="업로드된 오디오에서 AI로 자동 화자 구분">
<svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M17 20h5v-2a3 3 0 00-5.356-1.857M17 20H7m10 0v-2c0-.656-.126-1.283-.356-1.857M7 20H2v-2a3 3 0 015.356-1.857M7 20v-2c0-.656.126-1.283.356-1.857m0 0a5.002 5.002 0 019.288 0M15 7a3 3 0 11-6 0 3 3 0 016 0z" /></svg>
화자 분리
</button>
{/* AI 요약 버튼 */}
<button onClick={onSummarize} disabled={summarizing || diarizing || isRecording || !hasSegments} className="bg-purple-600 text-white px-3 py-1.5 rounded-lg hover:bg-purple-700 transition text-xs font-medium disabled:opacity-50 disabled:cursor-not-allowed flex items-center gap-1">
<svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9.663 17h4.673M12 3v1m6.364 1.636l-.707.707M21 12h-1M4 12H3m3.343-5.657l-.707-.707m2.828 9.9a5 5 0 117.072 0l-.548.547A3.374 3.374 0 0014 18.469V19a2 2 0 11-4 0v-.531c0-.895-.356-1.754-.988-2.386l-.548-.547z" /></svg>
AI 요약