feat:회의록 자동 화자 분리(Phase 2) 구현 및 세그먼트 저장 에러 수정

- GoogleCloudService에 speechToTextWithDiarization 메서드 추가
- Google STT V1 diarizationConfig 활성화로 자동 화자 구분
- MeetingMinuteService에 processDiarization 메서드 추가
- POST /{id}/diarize 엔드포인트 및 라우트 추가
- 프론트엔드에 '화자 분리' 버튼 추가 (RecordingControlBar)
- saveSegments 컨트롤러에 try-catch 에러 핸들링 추가
- 빈 텍스트 세그먼트 필터링 로직 추가 (서버/클라이언트 양쪽)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 10:29:16 +09:00
parent 0f312bcf77
commit b2fbd3d113
5 changed files with 418 additions and 20 deletions
--- a/app/Services/GoogleCloudService.php
+++ b/app/Services/GoogleCloudService.php
@@ -288,6 +288,218 @@ private function waitForSttOperation(string $operationName, int $maxAttempts = 6
        return null;
    }

+    /**
+     * Transcribe an audio file referenced by URI with speaker diarization enabled.
+     *
+     * Starts a Speech-to-Text V1 long-running recognize operation, waits for
+     * it to finish and converts the raw operation result into per-speaker
+     * segments.
+     *
+     * @param  string  $gcsUri  Sent to the API as `audio.uri`.
+     * @param  string  $languageCode  Sent to the API as `config.languageCode`.
+     * @param  int  $minSpeakers  Sent as `diarizationConfig.minSpeakerCount`.
+     * @param  int  $maxSpeakers  Sent as `diarizationConfig.maxSpeakerCount`.
+     * @return array|null ['segments' => [...], 'full_transcript' => '...', 'speaker_count' => int],
+     *                    or null when no access token is available, the request
+     *                    fails, the operation yields no result, or an exception is caught.
+     */
+    public function speechToTextWithDiarization(
+        string $gcsUri,
+        string $languageCode = 'ko-KR',
+        int $minSpeakers = 2,
+        int $maxSpeakers = 6
+    ): ?array {
+        $accessToken = $this->getAccessToken();
+        if (! $accessToken) {
+            return null;
+        }
+
+        // The audio format is fixed to WebM/Opus at 48 kHz; word time offsets
+        // are requested because the parser groups words by speaker and timing.
+        $recognitionConfig = [
+            'encoding' => 'WEBM_OPUS',
+            'sampleRateHertz' => 48000,
+            'languageCode' => $languageCode,
+            'enableAutomaticPunctuation' => true,
+            'model' => 'latest_long',
+            'enableWordTimeOffsets' => true,
+            'diarizationConfig' => [
+                'enableSpeakerDiarization' => true,
+                'minSpeakerCount' => $minSpeakers,
+                'maxSpeakerCount' => $maxSpeakers,
+            ],
+        ];
+
+        $payload = [
+            'config' => $recognitionConfig,
+            'audio' => ['uri' => $gcsUri],
+        ];
+
+        try {
+            $startResponse = Http::withToken($accessToken)
+                ->post('https://speech.googleapis.com/v1/speech:longrunningrecognize', $payload);
+
+            if (! $startResponse->successful()) {
+                Log::error('Google Cloud: STT Diarization 요청 실패', ['response' => $startResponse->body()]);
+
+                return null;
+            }
+
+            // The operation name is the handle used to poll for completion.
+            $operationName = $startResponse->json()['name'] ?? null;
+
+            if (! $operationName) {
+                Log::error('Google Cloud: STT Diarization 작업 이름 없음');
+
+                return null;
+            }
+
+            Log::info('Google Cloud: STT Diarization 요청 시작', ['operationName' => $operationName]);
+
+            $operationResult = $this->waitForSttDiarizationOperation($operationName);
+
+            return $operationResult
+                ? $this->parseDiarizationResult($operationResult)
+                : null;
+        } catch (\Exception $e) {
+            Log::error('Google Cloud: STT Diarization 예외', ['error' => $e->getMessage()]);
+
+            return null;
+        }
+    }
+
+    /**
+     * Poll a Speech-to-Text long-running operation until it completes.
+     *
+     * Sleeps 5 seconds before each poll, so the default of 60 attempts waits
+     * for roughly five minutes in total. Unsuccessful poll responses are
+     * logged and retried rather than aborting the wait.
+     *
+     * @param  string  $operationName  Operation name returned by the recognize request.
+     * @param  int  $maxAttempts  Maximum number of polls before giving up.
+     * @return array|null The raw operation payload once `done` is truthy, or
+     *                    null when no token is available, the operation
+     *                    reports an error, or the attempts are exhausted.
+     */
+    private function waitForSttDiarizationOperation(string $operationName, int $maxAttempts = 60): ?array
+    {
+        $token = $this->getAccessToken();
+        if (! $token) {
+            return null;
+        }
+
+        for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
+            sleep(5);
+
+            $response = Http::withToken($token)
+                ->get("https://speech.googleapis.com/v1/operations/{$operationName}");
+
+            if (! $response->successful()) {
+                // Keep retrying, but leave a trace: otherwise a persistent
+                // 4xx/5xx is indistinguishable from a plain timeout in the logs.
+                Log::warning('Google Cloud: STT Diarization 상태 조회 실패', [
+                    'operationName' => $operationName,
+                    'attempt' => $attempt,
+                    'status' => $response->status(),
+                    'response' => $response->body(),
+                ]);
+
+                continue;
+            }
+
+            $result = $response->json();
+
+            // Not finished yet (missing or falsy `done`): poll again.
+            if (empty($result['done'])) {
+                continue;
+            }
+
+            if (isset($result['error'])) {
+                Log::error('Google Cloud: STT Diarization 작업 실패', ['error' => $result['error']]);
+
+                return null;
+            }
+
+            return $result;
+        }
+
+        Log::error('Google Cloud: STT Diarization 작업 타임아웃', [
+            'operationName' => $operationName,
+            'attempts' => $maxAttempts,
+        ]);
+
+        return null;
+    }
+
+    /**
+     * Convert a completed diarization operation payload into per-speaker segments.
+     *
+     * Consecutive words carrying the same `speakerTag` are merged into one
+     * segment. Each segment ends at the end time of its own last word; the
+     * first segment starts at 0 and every later one at its first word's start.
+     *
+     * @param  array  $operationResult  Raw operation payload; results are read from `response.results`.
+     * @return array|null ['segments' => [...], 'full_transcript' => '...', 'speaker_count' => int],
+     *                    or null when the payload contains no results.
+     */
+    private function parseDiarizationResult(array $operationResult): ?array
+    {
+        $results = $operationResult['response']['results'] ?? [];
+
+        if (empty($results)) {
+            return null;
+        }
+
+        // With diarization enabled, the complete word-level list (including
+        // speaker tags) is read from the last result's first alternative.
+        $lastResult = end($results);
+        $words = $lastResult['alternatives'][0]['words'] ?? [];
+
+        if (empty($words)) {
+            // No word-level data: fall back to a single segment built from
+            // the plain transcripts of every result, attributed to speaker 1.
+            $transcript = '';
+            foreach ($results as $res) {
+                $transcript .= ($res['alternatives'][0]['transcript'] ?? '') . ' ';
+            }
+
+            return [
+                'segments' => [[
+                    'speaker_name' => '화자 1',
+                    'speaker_label' => '1',
+                    'text' => trim($transcript),
+                    'start_time_ms' => 0,
+                    'end_time_ms' => null,
+                    'is_manual_speaker' => false,
+                ]],
+                'full_transcript' => '[화자 1] ' . trim($transcript),
+                'speaker_count' => 1,
+            ];
+        }
+
+        // Single place that defines the segment shape.
+        $makeSegment = static function ($speaker, array $segmentWords, int $fromMs, int $toMs): array {
+            return [
+                'speaker_name' => '화자 ' . $speaker,
+                'speaker_label' => (string) $speaker,
+                'text' => trim(implode(' ', $segmentWords)),
+                'start_time_ms' => $fromMs,
+                'end_time_ms' => $toMs,
+                'is_manual_speaker' => false,
+            ];
+        };
+
+        // Group the word-level speaker information into segments.
+        $segments = [];
+        $currentSpeaker = null;
+        $currentWords = [];
+        $segmentStartMs = 0;
+        $segmentEndMs = 0;
+
+        foreach ($words as $word) {
+            $speakerTag = $word['speakerTag'] ?? 0;
+            $startMs = $this->parseGoogleTimeToMs($word['startTime'] ?? '0s');
+            $endMs = $this->parseGoogleTimeToMs($word['endTime'] ?? '0s');
+
+            // Speaker changed: close the running segment at the end of its
+            // own last word, not at the start of the next speaker's word.
+            if ($currentSpeaker !== null && $speakerTag !== $currentSpeaker && ! empty($currentWords)) {
+                $segments[] = $makeSegment($currentSpeaker, $currentWords, $segmentStartMs, $segmentEndMs);
+                $currentWords = [];
+                $segmentStartMs = $startMs;
+            }
+
+            $currentSpeaker = $speakerTag;
+            $currentWords[] = $word['word'] ?? '';
+            $segmentEndMs = $endMs;
+        }
+
+        // Flush the trailing segment.
+        if (! empty($currentWords)) {
+            $segments[] = $makeSegment($currentSpeaker, $currentWords, $segmentStartMs, $segmentEndMs);
+        }
+
+        // Build the full transcript, one "[speaker] text" line per segment.
+        $fullTranscript = '';
+        foreach ($segments as $seg) {
+            $fullTranscript .= "[{$seg['speaker_name']}] {$seg['text']}\n";
+        }
+
+        // Number of distinct speaker labels across all segments.
+        $speakerCount = count(array_unique(array_column($segments, 'speaker_label')));
+
+        return [
+            'segments' => $segments,
+            'full_transcript' => trim($fullTranscript),
+            'speaker_count' => $speakerCount,
+        ];
+    }
+
+    /**
+     * Convert a duration string such as "1.500s" into whole milliseconds.
+     *
+     * @param  string  $timeStr  Digits and dots followed by a trailing "s".
+     * @return int Milliseconds rounded to the nearest integer, or 0 when the
+     *             string does not match the expected format.
+     */
+    private function parseGoogleTimeToMs(string $timeStr): int
+    {
+        $captured = [];
+        $isDuration = preg_match('/^([\d.]+)s$/', $timeStr, $captured) === 1;
+
+        if (! $isDuration) {
+            return 0;
+        }
+
+        $seconds = (float) $captured[1];
+
+        return (int) round($seconds * 1000);
+    }
+
    /**
     * GCS 파일 삭제
     */