feat: 실시간 자동 화자 감지 시스템 구현 (Spectral Centroid 기반)
- Web Audio API rawAnalyser를 오디오 체인에 삽입 (compressor 이전, 원본 신호 분석) - Spectral Centroid + VAD 기반 100ms 간격 실시간 화자 분류 엔진 구현 - 500ms 윈도우 다수결 투표로 화자 안정성 확보 - 수동 화자 선택 버튼 제거 → 자동 감지 인디케이터로 대체 - 최대 4명까지 자동 화자 프로필 등록 및 speakers 동기화 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -257,8 +257,8 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
|
||||
const [recordingTime, setRecordingTime] = useState(0);
|
||||
const [localSegments, setLocalSegments] = useState([]);
|
||||
const [interimText, setInterimText] = useState('');
|
||||
const [currentSpeakerIdx, setCurrentSpeakerIdx] = useState(0);
|
||||
const [speakers, setSpeakers] = useState([{ name: '화자 1' }, { name: '화자 2' }]);
|
||||
const [detectedSpeaker, setDetectedSpeaker] = useState('화자 1');
|
||||
const [speakers, setSpeakers] = useState([{ name: '화자 1' }]);
|
||||
const [sttLanguage, setSttLanguage] = useState('ko-KR');
|
||||
|
||||
// 편집 상태
|
||||
@@ -289,6 +289,11 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
|
||||
const audioContextRef = useRef(null);
|
||||
const analyserRef = useRef(null);
|
||||
const gainNodeRef = useRef(null);
|
||||
const rawAnalyserRef = useRef(null);
|
||||
const speakerDetectorRef = useRef(null);
|
||||
const speakerProfilesRef = useRef([]);
|
||||
const recentFeaturesRef = useRef([]);
|
||||
const detectedSpeakerRef = useRef('화자 1');
|
||||
|
||||
const loadMeeting = useCallback(async () => {
|
||||
try {
|
||||
@@ -337,14 +342,122 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
|
||||
}
|
||||
}, [localSegments, interimText]);
|
||||
|
||||
// Derived view state: the currently selected speaker object (falls back to the
// first speaker if the index is stale) and its palette color, cycled modulo
// the fixed SPEAKER_COLORS palette.
const currentSpeaker = speakers[currentSpeakerIdx] || speakers[0];
const speakerColor = SPEAKER_COLORS[currentSpeakerIdx % SPEAKER_COLORS.length];
|
||||
|
||||
// Resolve a speaker's palette color by display name. Unknown names fall back
// to the first palette entry; known names cycle through SPEAKER_COLORS.
const getSpeakerColor = (name) => {
  const position = speakers.findIndex((sp) => sp.name === name);
  if (position < 0) return SPEAKER_COLORS[0];
  return SPEAKER_COLORS[position % SPEAKER_COLORS.length];
};
|
||||
|
||||
// ===== Speaker classification (Spectral Centroid based) =====
// Assigns an observed (centroid, spread) feature pair to the closest known
// speaker profile (relative centroid distance < 15%), folding the observation
// into that profile's running mean. When nothing is close enough, registers a
// new speaker — up to 4 — and mirrors it into the `speakers` state. Past the
// cap, the nearest existing profile wins.
const classifySpeaker = useCallback((centroid, spread) => {
  const profiles = speakerProfilesRef.current;

  // Bootstrap: the very first voiced frame defines speaker 1.
  if (profiles.length === 0) {
    profiles.push({ centroid, spread, count: 1, name: '화자 1' });
    return '화자 1';
  }

  // Nearest profile by relative centroid distance (first match wins ties).
  let bestIdx = 0;
  let bestDist = Infinity;
  profiles.forEach((p, i) => {
    const d = Math.abs(p.centroid - centroid) / p.centroid;
    if (d < bestDist) {
      bestDist = d;
      bestIdx = i;
    }
  });

  // Close enough: update the matched profile's running means and vote for it.
  if (bestDist < 0.15) {
    const matched = profiles[bestIdx];
    matched.centroid = (matched.centroid * matched.count + centroid) / (matched.count + 1);
    matched.spread = (matched.spread * matched.count + spread) / (matched.count + 1);
    matched.count++;
    return matched.name;
  }

  // No match and still under the 4-speaker cap: register a new speaker and
  // sync it into the speakers list (idempotent on name).
  if (profiles.length < 4) {
    const newName = `화자 ${profiles.length + 1}`;
    profiles.push({ centroid, spread, count: 1, name: newName });
    setSpeakers(prev => (prev.find(s => s.name === newName) ? prev : [...prev, { name: newName }]));
    return newName;
  }

  // Cap reached: fall back to the nearest existing profile.
  return profiles[bestIdx].name;
}, []);
|
||||
|
||||
// ===== Speaker detection (invoked every 100 ms while recording) =====
// Reads the raw (pre-compressor) analyser, gates on a simple RMS VAD,
// computes spectral centroid/spread over the voice band (85–1000 Hz), and
// publishes the majority-vote speaker over a 500 ms sliding window.
//
// Fix: each feature is classified exactly ONCE, when it is pushed into the
// window. The previous code re-ran classifySpeaker over the entire window on
// every tick, but classifySpeaker mutates the speaker profiles (running
// means, counts, new-speaker registration), so every feature was folded into
// the profiles ~5 times over its 500 ms lifetime, skewing the profiles.
const detectSpeaker = useCallback(() => {
  const analyser = rawAnalyserRef.current;
  if (!analyser) return;

  const sampleRate = audioContextRef.current?.sampleRate || 48000;
  const fftSize = analyser.fftSize;
  const binCount = analyser.frequencyBinCount; // fftSize / 2
  const binHz = sampleRate / fftSize; // Hz per frequency bin

  // Frequency-domain magnitudes (0–255 per bin).
  const freqData = new Uint8Array(binCount);
  analyser.getByteFrequencyData(freqData);

  // Time-domain samples for voice-activity detection.
  const timeData = new Uint8Array(fftSize);
  analyser.getByteTimeDomainData(timeData);

  // RMS over the frame (byte samples are centered at 128).
  let rmsSum = 0;
  for (let i = 0; i < timeData.length; i++) {
    const v = (timeData[i] - 128) / 128;
    rmsSum += v * v;
  }
  const rms = Math.sqrt(rmsSum / timeData.length);

  // VAD gate: ignore frames with no speech energy.
  if (rms < 0.015) return;

  // Voice-band bins (85 Hz – 1000 Hz).
  const minBin = Math.floor(85 / binHz);
  const maxBin = Math.min(Math.ceil(1000 / binHz), binCount - 1);

  // Spectral centroid: amplitude-weighted mean frequency.
  let weightedSum = 0;
  let amplitudeSum = 0;
  for (let i = minBin; i <= maxBin; i++) {
    const freq = i * binHz;
    const amp = freqData[i];
    weightedSum += freq * amp;
    amplitudeSum += amp;
  }

  if (amplitudeSum < 10) return; // too little band energy to be meaningful

  const centroid = weightedSum / amplitudeSum;

  // Spectral spread: amplitude-weighted std-dev around the centroid.
  let spreadSum = 0;
  for (let i = minBin; i <= maxBin; i++) {
    const freq = i * binHz;
    spreadSum += Math.pow(freq - centroid, 2) * freqData[i];
  }
  const spread = Math.sqrt(spreadSum / amplitudeSum);

  // Classify once and remember the vote alongside the feature.
  const speaker = classifySpeaker(centroid, spread);

  const now = Date.now();
  recentFeaturesRef.current.push({ centroid, spread, speaker, timestamp: now });

  // Keep only the last 500 ms of features.
  const cutoff = now - 500;
  const recent = recentFeaturesRef.current.filter(f => f.timestamp > cutoff);
  recentFeaturesRef.current = recent;
  if (recent.length === 0) return;

  // Majority vote over the stored per-feature votes.
  const counts = {};
  recent.forEach(f => { counts[f.speaker] = (counts[f.speaker] || 0) + 1; });
  const winner = Object.entries(counts).sort((a, b) => b[1] - a[1])[0][0];

  setDetectedSpeaker(winner);
}, [classifySpeaker]);
|
||||
|
||||
// ===== 녹음 시작 =====
|
||||
const startRecording = async () => {
|
||||
try {
|
||||
@@ -386,8 +499,15 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
|
||||
// MediaStreamDestination: 처리된 스트림
|
||||
const destination = audioCtx.createMediaStreamDestination();
|
||||
|
||||
// 체인 연결: source → gain → compressor → analyser → destination
|
||||
source.connect(gainNode);
|
||||
// rawAnalyser: 화자 감지용 (compressor 이전, 원본 신호 분석)
|
||||
const rawAnalyser = audioCtx.createAnalyser();
|
||||
rawAnalyser.fftSize = 2048;
|
||||
rawAnalyser.smoothingTimeConstant = 0.3;
|
||||
rawAnalyserRef.current = rawAnalyser;
|
||||
|
||||
// 체인 연결: source → rawAnalyser(화자감지) → gain → compressor → analyser → destination
|
||||
source.connect(rawAnalyser);
|
||||
rawAnalyser.connect(gainNode);
|
||||
gainNode.connect(compressor);
|
||||
compressor.connect(analyser);
|
||||
analyser.connect(destination);
|
||||
@@ -429,7 +549,7 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
|
||||
const now = Date.now();
|
||||
const startMs = startTimeRef.current ? now - startTimeRef.current : 0;
|
||||
setLocalSegments(prev => [...prev, {
|
||||
speaker_name: currentSpeaker.name,
|
||||
speaker_name: detectedSpeakerRef.current,
|
||||
text: text.trim(),
|
||||
start_time_ms: startMs,
|
||||
end_time_ms: null,
|
||||
@@ -461,6 +581,13 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
|
||||
startTimeRef.current = Date.now();
|
||||
setRecordingTime(0);
|
||||
setIsRecording(true);
|
||||
|
||||
// 화자 감지 시작
|
||||
speakerProfilesRef.current = [];
|
||||
recentFeaturesRef.current = [];
|
||||
setDetectedSpeaker('화자 1');
|
||||
setSpeakers([{ name: '화자 1' }]);
|
||||
speakerDetectorRef.current = setInterval(detectSpeaker, 100);
|
||||
} catch (e) {
|
||||
showToast('마이크 접근 권한이 필요합니다.', 'error');
|
||||
}
|
||||
@@ -469,6 +596,7 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
|
||||
// isRecording ref (onend에서 접근)
|
||||
const isRecordingRef = useRef(false);
|
||||
useEffect(() => { isRecordingRef.current = isRecording; }, [isRecording]);
|
||||
useEffect(() => { detectedSpeakerRef.current = detectedSpeaker; }, [detectedSpeaker]);
|
||||
|
||||
// ===== 녹음 중지 =====
|
||||
const stopRecording = async () => {
|
||||
@@ -485,12 +613,19 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
|
||||
streamRef.current = null;
|
||||
}
|
||||
|
||||
// 화자 감지 중지
|
||||
if (speakerDetectorRef.current) {
|
||||
clearInterval(speakerDetectorRef.current);
|
||||
speakerDetectorRef.current = null;
|
||||
}
|
||||
|
||||
// AudioContext 정리
|
||||
if (audioContextRef.current) {
|
||||
audioContextRef.current.close().catch(() => {});
|
||||
audioContextRef.current = null;
|
||||
analyserRef.current = null;
|
||||
gainNodeRef.current = null;
|
||||
rawAnalyserRef.current = null;
|
||||
}
|
||||
|
||||
// MediaRecorder 중지 → blob 생성
|
||||
@@ -613,17 +748,6 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
|
||||
}
|
||||
};
|
||||
|
||||
// ===== 화자 전환 =====
|
||||
const switchSpeaker = (idx) => {
|
||||
setCurrentSpeakerIdx(idx);
|
||||
};
|
||||
|
||||
const addSpeaker = () => {
|
||||
if (speakers.length >= 4) return;
|
||||
const newIdx = speakers.length;
|
||||
setSpeakers(prev => [...prev, { name: `화자 ${newIdx + 1}` }]);
|
||||
};
|
||||
|
||||
// ===== 제목 인라인 편집 =====
|
||||
const saveTitle = async () => {
|
||||
if (!titleValue.trim()) return;
|
||||
@@ -881,7 +1005,7 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
|
||||
{/* Left: Transcript */}
|
||||
<div className="flex-1 overflow-y-auto p-4" ref={transcriptRef}>
|
||||
{activeTab === 'conversation' ? (
|
||||
<ConversationView segments={segments} interimText={interimText} isRecording={isRecording} currentSpeaker={currentSpeaker} getSpeakerColor={getSpeakerColor} editing={editingSegments} onEditText={handleEditText} onEditSpeaker={handleEditSpeaker} onDeleteSegment={handleDeleteSegment} speakers={speakers} />
|
||||
<ConversationView segments={segments} interimText={interimText} isRecording={isRecording} detectedSpeaker={detectedSpeaker} getSpeakerColor={getSpeakerColor} editing={editingSegments} onEditText={handleEditText} onEditSpeaker={handleEditSpeaker} onDeleteSegment={handleDeleteSegment} speakers={speakers} />
|
||||
) : (
|
||||
<ScriptView segments={segments} interimText={interimText} isRecording={isRecording} />
|
||||
)}
|
||||
@@ -899,13 +1023,11 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
|
||||
<RecordingControlBar
|
||||
isRecording={isRecording}
|
||||
recordingTime={recordingTime}
|
||||
currentSpeakerIdx={currentSpeakerIdx}
|
||||
speakers={speakers}
|
||||
detectedSpeaker={detectedSpeaker}
|
||||
speakerCount={speakers.length}
|
||||
sttLanguage={sttLanguage}
|
||||
onStart={startRecording}
|
||||
onStop={stopRecording}
|
||||
onSwitchSpeaker={switchSpeaker}
|
||||
onAddSpeaker={addSpeaker}
|
||||
onLanguageChange={setSttLanguage}
|
||||
onSummarize={handleSummarize}
|
||||
onDiarize={handleDiarize}
|
||||
@@ -918,6 +1040,7 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
|
||||
micGain={micGain}
|
||||
onMicGainChange={setMicGain}
|
||||
gainNodeRef={gainNodeRef}
|
||||
getSpeakerColor={getSpeakerColor}
|
||||
/>
|
||||
</div>
|
||||
);
|
||||
@@ -945,7 +1068,7 @@ className={className}
|
||||
}
|
||||
|
||||
// ========== ConversationView ==========
|
||||
function ConversationView({ segments, interimText, isRecording, currentSpeaker, getSpeakerColor, editing, onEditText, onEditSpeaker, onDeleteSegment, speakers }) {
|
||||
function ConversationView({ segments, interimText, isRecording, detectedSpeaker, getSpeakerColor, editing, onEditText, onEditSpeaker, onDeleteSegment, speakers }) {
|
||||
if (segments.length === 0 && !interimText) {
|
||||
return (
|
||||
<div className="flex flex-col items-center justify-center h-full text-gray-400">
|
||||
@@ -1047,9 +1170,9 @@ className="w-full text-sm text-gray-800 leading-relaxed bg-white/70 border borde
|
||||
{isRecording && interimText && (
|
||||
<div className="rounded-lg border-2 border-dashed border-gray-300 p-3 bg-white">
|
||||
<div className="flex items-center gap-2 mb-1.5">
|
||||
<span className={`inline-flex items-center px-2 py-0.5 rounded-full text-xs font-medium ${getSpeakerColor(currentSpeaker.name).badge}`}>
|
||||
<span className={`inline-flex items-center px-2 py-0.5 rounded-full text-xs font-medium ${getSpeakerColor(detectedSpeaker).badge}`}>
|
||||
<span className={`w-2 h-2 rounded-full bg-red-500 mr-1 animate-pulse`}></span>
|
||||
{currentSpeaker.name}
|
||||
{detectedSpeaker}
|
||||
</span>
|
||||
<span className="text-xs text-gray-400">인식 중...</span>
|
||||
</div>
|
||||
@@ -1209,7 +1332,9 @@ function AudioLevelMeter({ analyserRef, isRecording }) {
|
||||
}
|
||||
|
||||
// ========== RecordingControlBar ==========
|
||||
function RecordingControlBar({ isRecording, recordingTime, currentSpeakerIdx, speakers, sttLanguage, onStart, onStop, onSwitchSpeaker, onAddSpeaker, onLanguageChange, onSummarize, onDiarize, saving, summarizing, diarizing, hasSegments, hasAudio, analyserRef, micGain, onMicGainChange, gainNodeRef }) {
|
||||
function RecordingControlBar({ isRecording, recordingTime, detectedSpeaker, speakerCount, sttLanguage, onStart, onStop, onLanguageChange, onSummarize, onDiarize, saving, summarizing, diarizing, hasSegments, hasAudio, analyserRef, micGain, onMicGainChange, gainNodeRef, getSpeakerColor }) {
|
||||
const detectedColor = getSpeakerColor ? getSpeakerColor(detectedSpeaker) : SPEAKER_COLORS[0];
|
||||
|
||||
return (
|
||||
<div className="bg-white border-t shadow-lg px-4 py-3 flex-shrink-0">
|
||||
<div className="flex items-center justify-between gap-3">
|
||||
@@ -1220,20 +1345,25 @@ function RecordingControlBar({ isRecording, recordingTime, currentSpeakerIdx, sp
|
||||
</select>
|
||||
</div>
|
||||
|
||||
{/* Speaker Selection */}
|
||||
<div className="flex items-center gap-1">
|
||||
{speakers.map((sp, idx) => {
|
||||
const c = SPEAKER_COLORS[idx % SPEAKER_COLORS.length];
|
||||
return (
|
||||
<button key={idx} onClick={() => onSwitchSpeaker(idx)} className={`px-3 py-1.5 rounded-full text-xs font-medium transition ${currentSpeakerIdx === idx ? `${c.badge} ring-2 ring-offset-1 ring-blue-400` : 'bg-gray-100 text-gray-600 hover:bg-gray-200'}`}>
|
||||
{sp.name}
|
||||
</button>
|
||||
);
|
||||
})}
|
||||
{speakers.length < 4 && (
|
||||
<button onClick={onAddSpeaker} disabled={isRecording} className="px-2 py-1.5 rounded-full text-xs text-gray-400 hover:text-gray-600 hover:bg-gray-100 disabled:opacity-50" title="화자 추가">+</button>
|
||||
)}
|
||||
</div>
|
||||
{/* Auto Speaker Detection Indicator */}
|
||||
{isRecording ? (
|
||||
<div className="flex items-center gap-2">
|
||||
<div className="flex items-center gap-1.5 px-3 py-1.5 rounded-full bg-emerald-50 border border-emerald-200">
|
||||
<svg className="w-3.5 h-3.5 text-emerald-500" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 11a7 7 0 01-7 7m0 0a7 7 0 01-7-7m7 7v4m0 0H8m4 0h4m-4-8a3 3 0 01-3-3V5a3 3 0 116 0v6a3 3 0 01-3 3z" /></svg>
|
||||
<span className="text-xs font-medium text-emerald-700">화자 자동 감지</span>
|
||||
</div>
|
||||
<span className={`inline-flex items-center px-2.5 py-1 rounded-full text-xs font-medium ${detectedColor.badge} transition-all`}>
|
||||
<span className={`w-2 h-2 rounded-full ${detectedColor.dot} mr-1 animate-pulse`}></span>
|
||||
{detectedSpeaker}
|
||||
</span>
|
||||
<span className="text-xs text-gray-400">{speakerCount}명 감지</span>
|
||||
</div>
|
||||
) : (
|
||||
<div className="flex items-center gap-1.5 px-3 py-1.5 rounded-full bg-gray-50 border border-gray-200">
|
||||
<svg className="w-3.5 h-3.5 text-gray-400" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 11a7 7 0 01-7 7m0 0a7 7 0 01-7-7m7 7v4m0 0H8m4 0h4m-4-8a3 3 0 01-3-3V5a3 3 0 116 0v6a3 3 0 01-3 3z" /></svg>
|
||||
<span className="text-xs text-gray-500">화자 자동 감지</span>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Mic Gain Slider */}
|
||||
{isRecording && (
|
||||
@@ -1284,12 +1414,10 @@ className="w-16 h-1 accent-blue-500"
|
||||
|
||||
{/* Action Buttons */}
|
||||
<div className="flex items-center gap-2">
|
||||
{/* 자동 화자 분리 버튼 */}
|
||||
<button onClick={onDiarize} disabled={diarizing || summarizing || isRecording || !hasAudio} className="bg-indigo-600 text-white px-3 py-1.5 rounded-lg hover:bg-indigo-700 transition text-xs font-medium disabled:opacity-50 disabled:cursor-not-allowed flex items-center gap-1" title="업로드된 오디오에서 AI로 자동 화자 구분">
|
||||
<svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M17 20h5v-2a3 3 0 00-5.356-1.857M17 20H7m10 0v-2c0-.656-.126-1.283-.356-1.857M7 20H2v-2a3 3 0 015.356-1.857M7 20v-2c0-.656.126-1.283.356-1.857m0 0a5.002 5.002 0 019.288 0M15 7a3 3 0 11-6 0 3 3 0 016 0z" /></svg>
|
||||
화자 분리
|
||||
</button>
|
||||
{/* AI 요약 버튼 */}
|
||||
<button onClick={onSummarize} disabled={summarizing || diarizing || isRecording || !hasSegments} className="bg-purple-600 text-white px-3 py-1.5 rounded-lg hover:bg-purple-700 transition text-xs font-medium disabled:opacity-50 disabled:cursor-not-allowed flex items-center gap-1">
|
||||
<svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9.663 17h4.673M12 3v1m6.364 1.636l-.707.707M21 12h-1M4 12H3m3.343-5.657l-.707-.707m2.828 9.9a5 5 0 117.072 0l-.548.547A3.374 3.374 0 0014 18.469V19a2 2 0 11-4 0v-.531c0-.895-.356-1.754-.988-2.386l-.548-.547z" /></svg>
|
||||
AI 요약
|
||||
|
||||
Reference in New Issue
Block a user