diff --git a/app/Services/GoogleCloudService.php b/app/Services/GoogleCloudService.php index 832a2e6e..cd5fc4d5 100644 --- a/app/Services/GoogleCloudService.php +++ b/app/Services/GoogleCloudService.php @@ -432,9 +432,10 @@ private function parseDiarizationResult(array $operationResult): ?array } // word-level 화자 정보를 세그먼트로 그룹핑 + // Google STT의 SentencePiece 토크나이저: ▁(U+2581)는 새 단어 시작 표시 $segments = []; $currentSpeaker = null; - $currentWords = []; + $currentTokens = []; $segmentStartMs = 0; foreach ($words as $word) { @@ -443,36 +444,39 @@ private function parseDiarizationResult(array $operationResult): ?array $startMs = $this->parseGoogleTimeToMs($word['startTime'] ?? '0s'); $endMs = $this->parseGoogleTimeToMs($word['endTime'] ?? '0s'); - // 언더스코어 노이즈 제거 (단어 앞뒤/내부 모두) - $cleanWord = str_replace('_', '', $wordText); - if (trim($cleanWord) === '') { + // SentencePiece: ▁(U+2581) 또는 _로 시작하면 새 단어 + $isNewWord = preg_match('/^[\x{2581}_]/u', $wordText); + + // 모든 구분자 문자 제거: _(U+005F), ▁(U+2581) + $cleanToken = preg_replace('/[\x{2581}_]/u', '', $wordText); + if (trim($cleanToken) === '') { continue; } - if ($speakerTag !== $currentSpeaker && $currentSpeaker !== null && ! empty($currentWords)) { + if ($speakerTag !== $currentSpeaker && $currentSpeaker !== null && ! empty($currentTokens)) { $segments[] = [ 'speaker_name' => '화자 ' . $currentSpeaker, 'speaker_label' => (string) $currentSpeaker, - 'text' => $this->cleanSttText(implode(' ', $currentWords)), + 'text' => $this->joinSentencePieceTokens($currentTokens), 'start_time_ms' => $segmentStartMs, 'end_time_ms' => $startMs, 'is_manual_speaker' => false, ]; - $currentWords = []; + $currentTokens = []; $segmentStartMs = $startMs; } $currentSpeaker = $speakerTag; - $currentWords[] = $cleanWord; + $currentTokens[] = ['text' => $cleanToken, 'new_word' => (bool) $isNewWord]; } // 마지막 세그먼트 - if (! empty($currentWords)) { + if (! empty($currentTokens)) { $lastWord = end($words); $segments[] = [ 'speaker_name' => '화자 ' . $currentSpeaker, 'speaker_label' => (string) $currentSpeaker, - 'text' => $this->cleanSttText(implode(' ', $currentWords)), + 'text' => $this->joinSentencePieceTokens($currentTokens), 'start_time_ms' => $segmentStartMs, 'end_time_ms' => $this->parseGoogleTimeToMs($lastWord['endTime'] ?? '0s'), 'is_manual_speaker' => false, @@ -508,12 +512,35 @@ private function parseGoogleTimeToMs(string $timeStr): int } /** - * STT 텍스트에서 언더스코어 노이즈 제거 + * SentencePiece 토큰 배열을 자연스러운 텍스트로 결합 + * + * ▁(U+2581)가 있던 토큰은 새 단어 시작 → 앞에 공백 추가 + * ▁가 없던 토큰은 이전 단어에 바로 붙임 + */ + private function joinSentencePieceTokens(array $tokens): string + { + $result = ''; + foreach ($tokens as $i => $token) { + if ($i === 0) { + $result = $token['text']; + } elseif ($token['new_word']) { + $result .= ' ' . $token['text']; + } else { + $result .= $token['text']; + } + } + + return trim(preg_replace('/\s{2,}/', ' ', $result)); + } + + /** + * STT 텍스트에서 SentencePiece/언더스코어 노이즈 제거 */ private function cleanSttText(string $text): string { - // 언더스코어 제거 후 연속 공백 정리 - $cleaned = str_replace('_', '', $text); + // ▁(U+2581)를 공백으로, _(U+005F)는 제거, 연속 공백 정리 + $cleaned = preg_replace('/\x{2581}/u', ' ', $text); + $cleaned = str_replace('_', '', $cleaned); return trim(preg_replace('/\s{2,}/', ' ', $cleaned)); } diff --git a/resources/views/juil/meeting-minutes.blade.php b/resources/views/juil/meeting-minutes.blade.php index 3f12612a..ce34bdd6 100644 --- a/resources/views/juil/meeting-minutes.blade.php +++ b/resources/views/juil/meeting-minutes.blade.php @@ -947,7 +947,7 @@ className="w-full text-sm text-gray-800 leading-relaxed bg-white/70 border borde )}