feat: 최고 품질 음성 녹음 시스템 구축 (STT V2 + Chirp 2 + Web Audio)

- 프론트엔드: Web Audio API 전처리 파이프라인 (GainNode + DynamicsCompressor + AnalyserNode)
- 프론트엔드: VU 미터 실시간 레벨 표시 + 마이크 감도 슬라이더 (0.5x~3.0x)
- 프론트엔드: getUserMedia constraints 강화 + MediaRecorder 128kbps Opus
- 백엔드: Google STT V2 API + Chirp 2 모델 batchRecognize 메서드 추가
- 백엔드: V2→V1 자동 폴백 래퍼 (speechToTextWithDiarizationAuto)
- 백엔드: Speech Adaptation 도메인 용어 힌트 (블라인드/스크린 등 22개)
- 백엔드: V2 SentencePiece 토큰 자동 감지 분기 처리
- 설정: config/services.php에 google.location 추가

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
김보곤
2026-02-11 21:01:37 +09:00
parent 55b5ab6d1d
commit 7f1f7165a5
4 changed files with 543 additions and 9 deletions

View File

@@ -545,6 +545,364 @@ private function cleanSttText(string $text): string
return trim(preg_replace('/\s{2,}/', ' ', $cleaned));
}
/**
 * Speech-to-Text V2 API with the Chirp 2 model.
 *
 * Submits a batchRecognize job for an audio file already uploaded to GCS,
 * waits for the long-running operation to finish, and converts the result
 * into speaker segments via parseV2Result().
 *
 * @param  string $gcsUri        gs:// URI of the uploaded audio file
 * @param  string $languageCode  BCP-47 language code (default ko-KR)
 * @param  int    $minSpeakers   diarization lower bound
 * @param  int    $maxSpeakers   diarization upper bound
 * @param  array  $phraseHints   domain phrases for Speech Adaptation
 * @return array|null ['segments' => [...], 'full_transcript' => '...', 'speaker_count' => int]
 */
public function speechToTextV2(
    string $gcsUri,
    string $languageCode = 'ko-KR',
    int $minSpeakers = 2,
    int $maxSpeakers = 6,
    array $phraseHints = []
): ?array {
    $token = $this->getAccessToken();
    if (! $token) {
        return null;
    }

    $projectId = $this->serviceAccount['project_id'] ?? null;
    if (! $projectId) {
        Log::warning('Google Cloud: STT V2 - project_id 없음, V1 폴백 필요');
        return null;
    }

    $location = config('services.google.location', 'us-central1');

    try {
        $recognitionConfig = [
            'configMask' => 'auto_decoding_config,model,language_codes,features',
            'auto_decoding_config' => (object) [],
            'model' => 'chirp_2',
            'language_codes' => [$languageCode],
            'features' => [
                'enableAutomaticPunctuation' => true,
                'enableWordTimeOffsets' => true,
                'diarizationConfig' => [
                    'minSpeakerCount' => $minSpeakers,
                    'maxSpeakerCount' => $maxSpeakers,
                ],
            ],
        ];

        // Speech Adaptation: bias recognition toward domain-specific phrases.
        if (! empty($phraseHints)) {
            $phrases = [];
            foreach ($phraseHints as $hint) {
                $phrases[] = ['value' => $hint, 'boost' => 10.0];
            }
            $recognitionConfig['adaptation'] = [
                'phraseSets' => [
                    ['inlinePhraseSet' => ['phrases' => $phrases]],
                ],
            ];
            // The adaptation field must be listed in the config mask to take effect.
            $recognitionConfig['configMask'] .= ',adaptation';
        }

        $payload = [
            'config' => $recognitionConfig,
            'files' => [
                ['uri' => $gcsUri],
            ],
            // Inline response: the transcript is returned in the operation result
            // instead of being written to a GCS output bucket.
            'recognitionOutputConfig' => [
                'inlineResponseConfig' => (object) [],
            ],
        ];

        $url = "https://speech.googleapis.com/v2/projects/{$projectId}/locations/{$location}/recognizers/_:batchRecognize";

        Log::info('Google Cloud: STT V2 (Chirp 2) 요청 시작', [
            'gcsUri' => $gcsUri,
            'model' => 'chirp_2',
            'language' => $languageCode,
            'phraseHints' => count($phraseHints),
        ]);

        $response = Http::withToken($token)
            ->timeout(30)
            ->post($url, $payload);

        if (! $response->successful()) {
            Log::error('Google Cloud: STT V2 요청 실패', [
                'status' => $response->status(),
                'response' => $response->body(),
            ]);
            return null;
        }

        $operationName = $response->json()['name'] ?? null;
        if (! $operationName) {
            Log::error('Google Cloud: STT V2 작업 이름 없음');
            return null;
        }

        Log::info('Google Cloud: STT V2 작업 시작됨', ['operationName' => $operationName]);

        $operationResult = $this->waitForV2Operation($operationName);

        return $operationResult ? $this->parseV2Result($operationResult, $gcsUri) : null;
    } catch (\Exception $e) {
        Log::error('Google Cloud: STT V2 예외', ['error' => $e->getMessage()]);
        return null;
    }
}
/**
 * Poll a Speech V2 long-running operation until it completes.
 *
 * Sleeps 5 seconds between polls; with the default 60 attempts this blocks
 * for up to ~5 minutes before giving up.
 *
 * @param  string $operationName fully-qualified operation resource name
 * @param  int    $maxAttempts   polling attempts before declaring a timeout
 * @return array|null the finished operation payload, or null on error/timeout
 */
private function waitForV2Operation(string $operationName, int $maxAttempts = 60): ?array
{
    $token = $this->getAccessToken();
    if (! $token) {
        return null;
    }

    $attempt = 0;
    while ($attempt++ < $maxAttempts) {
        sleep(5);

        $response = Http::withToken($token)
            ->get("https://speech.googleapis.com/v2/{$operationName}");

        // A transient HTTP failure just consumes one attempt and retries.
        if (! $response->successful()) {
            continue;
        }

        $result = $response->json();

        // Keep polling until the operation reports done.
        if (empty($result['done'])) {
            continue;
        }

        if (isset($result['error'])) {
            Log::error('Google Cloud: STT V2 작업 실패', ['error' => $result['error']]);
            return null;
        }

        Log::info('Google Cloud: STT V2 작업 완료');
        return $result;
    }

    Log::error('Google Cloud: STT V2 작업 타임아웃');
    return null;
}
/**
 * Parse a V2 batchRecognize operation result into speaker segments.
 *
 * Handles two word-token formats: plain words (Chirp 2) and SentencePiece
 * tokens (a leading ▁ marks a word boundary). Words are grouped into
 * segments whenever the diarization speaker tag changes.
 *
 * @param  array  $operationResult completed operation payload from waitForV2Operation()
 * @param  string $gcsUri          audio URI, used as the key into the batch results
 * @return array|null ['segments' => [...], 'full_transcript' => '...', 'speaker_count' => int]
 */
private function parseV2Result(array $operationResult, string $gcsUri): ?array
{
    // V2 response shape: response.results[uri].transcript.results[].alternatives[].words[]
    $batchResults = $operationResult['response']['results'] ?? [];
    // Look up the result keyed by the GCS URI.
    $transcriptData = $batchResults[$gcsUri] ?? null;
    // If the URI key does not match exactly, fall back to the first entry.
    if (! $transcriptData && ! empty($batchResults)) {
        $transcriptData = reset($batchResults);
    }
    if (! $transcriptData) {
        Log::warning('Google Cloud: STT V2 결과 없음');
        return null;
    }
    $results = $transcriptData['transcript']['results'] ?? [];
    if (empty($results)) {
        Log::warning('Google Cloud: STT V2 transcript 결과 없음');
        return null;
    }
    // The last result carries the full word-level diarization information.
    $lastResult = end($results);
    $words = $lastResult['alternatives'][0]['words'] ?? [];
    if (empty($words)) {
        // No word-level output: fall back to the plain transcript as a single segment.
        $transcript = '';
        foreach ($results as $res) {
            $transcript .= ($res['alternatives'][0]['transcript'] ?? '') . ' ';
        }
        $transcript = trim($transcript);
        return [
            'segments' => [[
                'speaker_name' => '화자 1',
                'speaker_label' => '1',
                'text' => $transcript,
                'start_time_ms' => 0,
                'end_time_ms' => null,
                'is_manual_speaker' => false,
            ]],
            'full_transcript' => '[화자 1] ' . $transcript,
            'speaker_count' => 1,
        ];
    }
    // Detect SentencePiece tokens (presence of the ▁ / U+2581 marker).
    // NOTE(review): the character class also matches a plain ASCII underscore,
    // so a transcript that legitimately contains '_' flips this mode and gets
    // its underscores stripped below — confirm this is intended.
    $hasSentencePiece = false;
    foreach ($words as $w) {
        if (preg_match('/[\x{2581}_]/u', $w['word'] ?? '')) {
            $hasSentencePiece = true;
            break;
        }
    }
    // Group word-level speaker info into segments: a new segment starts on
    // every speaker-tag change; the first segment always starts at 0 ms.
    $segments = [];
    $currentSpeaker = null;
    $currentTokens = [];  // accumulator for SentencePiece mode
    $currentWords = [];   // accumulator for plain-word mode
    $segmentStartMs = 0;
    foreach ($words as $word) {
        $speakerTag = $word['speakerTag'] ?? 0;
        $wordText = $word['word'] ?? '';
        // V2 uses startOffset/endOffset; startTime/endTime kept as a fallback.
        $startMs = $this->parseGoogleTimeToMs($word['startOffset'] ?? $word['startTime'] ?? '0s');
        // $endMs is currently unused in the loop; the tail segment recomputes
        // its end from the last word instead.
        $endMs = $this->parseGoogleTimeToMs($word['endOffset'] ?? $word['endTime'] ?? '0s');
        if ($hasSentencePiece) {
            // SentencePiece handling: a leading ▁ marks the start of a new word.
            $isNewWord = preg_match('/^[\x{2581}_]/u', $wordText);
            $cleanToken = preg_replace('/[\x{2581}_]/u', '', $wordText);
            if (trim($cleanToken) === '') {
                continue;
            }
            // Speaker changed: flush the accumulated tokens as one segment.
            if ($speakerTag !== $currentSpeaker && $currentSpeaker !== null && ! empty($currentTokens)) {
                $segments[] = [
                    'speaker_name' => '화자 ' . $currentSpeaker,
                    'speaker_label' => (string) $currentSpeaker,
                    'text' => $this->joinSentencePieceTokens($currentTokens),
                    'start_time_ms' => $segmentStartMs,
                    'end_time_ms' => $startMs,
                    'is_manual_speaker' => false,
                ];
                $currentTokens = [];
                $segmentStartMs = $startMs;
            }
            $currentSpeaker = $speakerTag;
            $currentTokens[] = ['text' => $cleanToken, 'new_word' => (bool) $isNewWord];
        } else {
            // Plain word handling (Chirp 2): same flush-on-speaker-change logic.
            if ($speakerTag !== $currentSpeaker && $currentSpeaker !== null && ! empty($currentWords)) {
                $segments[] = [
                    'speaker_name' => '화자 ' . $currentSpeaker,
                    'speaker_label' => (string) $currentSpeaker,
                    'text' => implode(' ', $currentWords),
                    'start_time_ms' => $segmentStartMs,
                    'end_time_ms' => $startMs,
                    'is_manual_speaker' => false,
                ];
                $currentWords = [];
                $segmentStartMs = $startMs;
            }
            $currentSpeaker = $speakerTag;
            $currentWords[] = trim($wordText);
        }
    }
    // Flush the final segment; its end time comes from the last word's offset.
    if ($hasSentencePiece && ! empty($currentTokens)) {
        $lastWord = end($words);
        $segments[] = [
            'speaker_name' => '화자 ' . $currentSpeaker,
            'speaker_label' => (string) $currentSpeaker,
            'text' => $this->joinSentencePieceTokens($currentTokens),
            'start_time_ms' => $segmentStartMs,
            'end_time_ms' => $this->parseGoogleTimeToMs($lastWord['endOffset'] ?? $lastWord['endTime'] ?? '0s'),
            'is_manual_speaker' => false,
        ];
    } elseif (! $hasSentencePiece && ! empty($currentWords)) {
        $lastWord = end($words);
        $segments[] = [
            'speaker_name' => '화자 ' . $currentSpeaker,
            'speaker_label' => (string) $currentSpeaker,
            'text' => implode(' ', $currentWords),
            'start_time_ms' => $segmentStartMs,
            'end_time_ms' => $this->parseGoogleTimeToMs($lastWord['endOffset'] ?? $lastWord['endTime'] ?? '0s'),
            'is_manual_speaker' => false,
        ];
    }
    // Build the full transcript, one "[speaker] text" line per segment.
    $fullTranscript = '';
    foreach ($segments as $seg) {
        $fullTranscript .= "[{$seg['speaker_name']}] {$seg['text']}\n";
    }
    $speakerCount = count(array_unique(array_column($segments, 'speaker_label')));
    Log::info('Google Cloud: STT V2 파싱 완료', [
        'segments' => count($segments),
        'speakers' => $speakerCount,
        'sentencePiece' => $hasSentencePiece,
    ]);
    return [
        'segments' => $segments,
        'full_transcript' => trim($fullTranscript),
        'speaker_count' => $speakerCount,
    ];
}
/**
 * Try STT V2 + Chirp 2 first; automatically fall back to V1 + latest_long.
 *
 * @param  string $gcsUri        gs:// URI of the uploaded audio file
 * @param  string $languageCode  BCP-47 language code (default ko-KR)
 * @param  int    $minSpeakers   diarization lower bound
 * @param  int    $maxSpeakers   diarization upper bound
 * @param  array  $phraseHints   domain phrases for Speech Adaptation (V2 only)
 * @return array|null ['segments' => [...], 'full_transcript' => '...', 'speaker_count' => int, 'engine' => 'v2'|'v1']
 */
public function speechToTextWithDiarizationAuto(
    string $gcsUri,
    string $languageCode = 'ko-KR',
    int $minSpeakers = 2,
    int $maxSpeakers = 6,
    array $phraseHints = []
): ?array {
    // V2 requires a project id; without one we go straight to the V1 fallback.
    if ($this->serviceAccount['project_id'] ?? null) {
        Log::info('Google Cloud: STT V2 (Chirp 2) 시도');

        $v2 = $this->speechToTextV2($gcsUri, $languageCode, $minSpeakers, $maxSpeakers, $phraseHints);
        if ($v2 && ! empty($v2['segments'])) {
            $v2['engine'] = 'v2';
            return $v2;
        }

        Log::warning('Google Cloud: STT V2 실패, V1 폴백');
    }

    // Fall back to V1 with the latest_long model (no phrase hints there).
    Log::info('Google Cloud: STT V1 (latest_long) 폴백 실행');

    $v1 = $this->speechToTextWithDiarization($gcsUri, $languageCode, $minSpeakers, $maxSpeakers);
    if ($v1) {
        $v1['engine'] = 'v1';
    }

    return $v1;
}
/**
* GCS 파일 삭제
*/

View File

@@ -208,6 +208,7 @@ public function logSttUsage(int $durationSeconds): void
/**
* 업로드된 오디오에 대해 자동 화자 분리(Speaker Diarization) 실행
* V2 + Chirp 2 우선 시도, 실패 시 V1 + latest_long 자동 폴백
*/
public function processDiarization(MeetingMinute $meeting, int $minSpeakers = 2, int $maxSpeakers = 6): ?array
{
@@ -218,11 +219,12 @@ public function processDiarization(MeetingMinute $meeting, int $minSpeakers = 2,
$meeting->update(['status' => MeetingMinute::STATUS_PROCESSING]);
try {
$result = $this->googleCloudService->speechToTextWithDiarization(
$result = $this->googleCloudService->speechToTextWithDiarizationAuto(
$meeting->audio_gcs_uri,
$meeting->stt_language ?? 'ko-KR',
$minSpeakers,
$maxSpeakers
$maxSpeakers,
$this->getDefaultPhraseHints()
);
if (! $result || empty($result['segments'])) {
@@ -232,6 +234,13 @@ public function processDiarization(MeetingMinute $meeting, int $minSpeakers = 2,
return null;
}
$engine = $result['engine'] ?? 'v1';
Log::info('MeetingMinute: 화자 분리 완료', [
'meeting_id' => $meeting->id,
'engine' => $engine,
'segments' => count($result['segments']),
]);
// 기존 세그먼트 교체
$meeting->segments()->delete();
$fullTranscript = '';
@@ -258,9 +267,10 @@ public function processDiarization(MeetingMinute $meeting, int $minSpeakers = 2,
'status' => MeetingMinute::STATUS_DRAFT,
]);
// STT 사용량 기록
// STT 사용량 기록 (엔진 구분)
if ($meeting->duration_seconds > 0) {
AiTokenHelper::saveSttUsage('회의록-화자분리', $meeting->duration_seconds);
$usageLabel = $engine === 'v2' ? '회의록-화자분리(Chirp2)' : '회의록-화자분리';
AiTokenHelper::saveSttUsage($usageLabel, $meeting->duration_seconds);
}
return [
@@ -279,6 +289,20 @@ public function processDiarization(MeetingMinute $meeting, int $minSpeakers = 2,
}
}
/**
 * Domain vocabulary hints for Speech Adaptation.
 *
 * Product, component, brand, business, and system terms the recognizer
 * should prefer when transcribing meeting audio.
 *
 * @return array<int, string>
 */
private function getDefaultPhraseHints(): array
{
    $productTerms = ['블라인드', '스크린', '롤스크린', '허니콤', '버티컬'];
    $componentTerms = ['원단', '바텀레일', '헤드레일', '브라켓'];
    $brandTerms = ['주일', '경동', '주일블라인드', '경동블라인드'];
    $businessTerms = ['수주', '발주', '납기', '출하', '재고', '원가', '단가'];
    $systemTerms = ['SAM', 'ERP', 'MES'];

    return array_merge($productTerms, $componentTerms, $brandTerms, $businessTerms, $systemTerms);
}
private function buildSummaryPrompt(string $transcript): string
{
return <<<PROMPT

View File

@@ -47,6 +47,7 @@
'google' => [
'credentials_path' => env('GOOGLE_APPLICATION_CREDENTIALS'),
'storage_bucket' => env('GOOGLE_STORAGE_BUCKET'),
'location' => env('GOOGLE_STT_LOCATION', 'us-central1'),
],
/*

View File

@@ -269,6 +269,9 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
const [diarizing, setDiarizing] = useState(false);
const [alertModal, setAlertModal] = useState(null);
// 마이크 감도
const [micGain, setMicGain] = useState(1.5);
// 세그먼트 편집 상태
const [editingSegments, setEditingSegments] = useState(false);
const [editBackup, setEditBackup] = useState([]);
@@ -281,6 +284,9 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
const timerRef = useRef(null);
const transcriptRef = useRef(null);
const startTimeRef = useRef(null);
const audioContextRef = useRef(null);
const analyserRef = useRef(null);
const gainNodeRef = useRef(null);
const loadMeeting = useCallback(async () => {
try {
@@ -340,11 +346,56 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
// ===== 녹음 시작 =====
const startRecording = async () => {
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const stream = await navigator.mediaDevices.getUserMedia({
audio: {
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true,
channelCount: 1,
sampleRate: 48000,
}
});
streamRef.current = stream;
// MediaRecorder
const recorder = new MediaRecorder(stream);
// Web Audio API 전처리 체인
const audioCtx = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 48000 });
audioContextRef.current = audioCtx;
const source = audioCtx.createMediaStreamSource(stream);
// GainNode: 감도 조절 (기본 1.5x)
const gainNode = audioCtx.createGain();
gainNode.gain.value = micGain;
gainNodeRef.current = gainNode;
// DynamicsCompressor: 작은 소리 증폭 + 큰 소리 억제
const compressor = audioCtx.createDynamicsCompressor();
compressor.threshold.value = -50;
compressor.knee.value = 40;
compressor.ratio.value = 12;
compressor.attack.value = 0;
compressor.release.value = 0.25;
// AnalyserNode: VU 미터용
const analyser = audioCtx.createAnalyser();
analyser.fftSize = 2048;
analyserRef.current = analyser;
// MediaStreamDestination: 처리된 스트림
const destination = audioCtx.createMediaStreamDestination();
// 체인 연결: source → gain → compressor → analyser → destination
source.connect(gainNode);
gainNode.connect(compressor);
compressor.connect(analyser);
analyser.connect(destination);
const processedStream = destination.stream;
// MediaRecorder (처리된 스트림 사용)
const mimeType = MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
? 'audio/webm;codecs=opus' : 'audio/webm';
const recorder = new MediaRecorder(processedStream, { mimeType, audioBitsPerSecond: 128000 });
audioChunksRef.current = [];
recorder.ondataavailable = (e) => { if (e.data.size > 0) audioChunksRef.current.push(e.data); };
recorder.start();
@@ -423,6 +474,14 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
streamRef.current = null;
}
// AudioContext 정리
if (audioContextRef.current) {
audioContextRef.current.close().catch(() => {});
audioContextRef.current = null;
analyserRef.current = null;
gainNodeRef.current = null;
}
// MediaRecorder 중지 → blob 생성
const recorder = mediaRecorderRef.current;
if (recorder && recorder.state !== 'inactive') {
@@ -831,6 +890,10 @@ function MeetingDetail({ meetingId, onBack, showToast }) {
diarizing={diarizing}
hasSegments={segments.length > 0}
hasAudio={!!meeting?.audio_gcs_uri}
analyserRef={analyserRef}
micGain={micGain}
onMicGainChange={setMicGain}
gainNodeRef={gainNodeRef}
/>
</div>
);
@@ -1056,8 +1119,73 @@ function SummaryPanel({ meeting, onSummarize, summarizing }) {
);
}
// ========== AudioLevelMeter ==========
function AudioLevelMeter({ analyserRef, isRecording }) {
const canvasRef = useRef(null);
const animFrameRef = useRef(null);
useEffect(() => {
if (!isRecording || !analyserRef?.current) {
if (animFrameRef.current) cancelAnimationFrame(animFrameRef.current);
// 녹음 중지 시 캔버스 클리어
const canvas = canvasRef.current;
if (canvas) {
const ctx = canvas.getContext('2d');
ctx.clearRect(0, 0, canvas.width, canvas.height);
}
return;
}
const analyser = analyserRef.current;
const dataArray = new Uint8Array(analyser.fftSize);
const draw = () => {
analyser.getByteTimeDomainData(dataArray);
// RMS 계산
let sum = 0;
for (let i = 0; i < dataArray.length; i++) {
const v = (dataArray[i] - 128) / 128;
sum += v * v;
}
const rms = Math.sqrt(sum / dataArray.length);
const level = Math.min(1, rms * 3); // 0~1 정규화
const canvas = canvasRef.current;
if (!canvas) return;
const ctx = canvas.getContext('2d');
const w = canvas.width;
const h = canvas.height;
ctx.clearRect(0, 0, w, h);
// 바 그리기
const barWidth = Math.round(w * level);
const gradient = ctx.createLinearGradient(0, 0, w, 0);
gradient.addColorStop(0, '#22c55e');
gradient.addColorStop(0.6, '#eab308');
gradient.addColorStop(1, '#ef4444');
ctx.fillStyle = gradient;
ctx.fillRect(0, 0, barWidth, h);
// 배경 바
ctx.fillStyle = 'rgba(0,0,0,0.06)';
ctx.fillRect(barWidth, 0, w - barWidth, h);
animFrameRef.current = requestAnimationFrame(draw);
};
draw();
return () => {
if (animFrameRef.current) cancelAnimationFrame(animFrameRef.current);
};
}, [isRecording, analyserRef?.current]);
return <canvas ref={canvasRef} width={120} height={12} className="rounded-full" style={{ display: 'block' }} />;
}
// ========== RecordingControlBar ==========
function RecordingControlBar({ isRecording, recordingTime, currentSpeakerIdx, speakers, sttLanguage, onStart, onStop, onSwitchSpeaker, onAddSpeaker, onLanguageChange, onSummarize, onDiarize, saving, summarizing, diarizing, hasSegments, hasAudio }) {
function RecordingControlBar({ isRecording, recordingTime, currentSpeakerIdx, speakers, sttLanguage, onStart, onStop, onSwitchSpeaker, onAddSpeaker, onLanguageChange, onSummarize, onDiarize, saving, summarizing, diarizing, hasSegments, hasAudio, analyserRef, micGain, onMicGainChange, gainNodeRef }) {
return (
<div className="bg-white border-t shadow-lg px-4 py-3 flex-shrink-0">
<div className="flex items-center justify-between gap-3">
@@ -1083,7 +1211,29 @@ function RecordingControlBar({ isRecording, recordingTime, currentSpeakerIdx, sp
)}
</div>
{/* Record Button */}
{/* Mic Gain Slider */}
{isRecording && (
<div className="flex items-center gap-1.5">
<svg className="w-4 h-4 text-gray-400" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 11a7 7 0 01-7 7m0 0a7 7 0 01-7-7m7 7v4m0 0H8m4 0h4m-4-8a3 3 0 01-3-3V5a3 3 0 116 0v6a3 3 0 01-3 3z" /></svg>
<input
type="range"
min="0.5"
max="3.0"
step="0.1"
value={micGain}
onChange={(e) => {
const val = parseFloat(e.target.value);
onMicGainChange(val);
if (gainNodeRef?.current) gainNodeRef.current.gain.value = val;
}}
className="w-16 h-1 accent-blue-500"
title={`감도 ${micGain.toFixed(1)}x`}
/>
<span className="text-xs text-gray-400 min-w-[28px]">{micGain.toFixed(1)}x</span>
</div>
)}
{/* Record Button + VU Meter */}
<div className="flex items-center gap-3">
{isRecording ? (
<button onClick={onStop} disabled={saving} className="bg-red-600 text-white px-5 py-2 rounded-full hover:bg-red-700 transition flex items-center gap-2 text-sm font-medium shadow-lg disabled:opacity-50">
@@ -1096,6 +1246,7 @@ function RecordingControlBar({ isRecording, recordingTime, currentSpeakerIdx, sp
녹음
</button>
)}
{isRecording && <AudioLevelMeter analyserRef={analyserRef} isRecording={isRecording} />}
<span className="text-sm font-mono text-gray-600 min-w-[60px]">{formatDuration(recordingTime)}</span>
{isRecording && <span className="flex items-center gap-1 text-xs text-red-500"><span className="w-2 h-2 bg-red-500 rounded-full animate-pulse"></span>REC</span>}
</div>