- URL 하드코딩 → .env APP_URL 기반 동적 URL로 변경 - DB 연결 하드코딩 → .env 기반으로 변경 - MySQL strict mode DATE 오류 수정
143 lines
5.2 KiB
PHP
143 lines
5.2 KiB
PHP
<?php
|
|
// chatbot/md_rag/ingest.php
|
|
|
|
|
|
require_once($_SERVER['DOCUMENT_ROOT'] . '/vendor/autoload.php'); // Load Google Client
|
|
|
|
// Long-running batch job: remove the execution time limit and raise the memory ceiling.
ini_set('max_execution_time', 0); // No time limit
ini_set('memory_limit', '512M');

$projectRoot = dirname(__DIR__, 2);

// Fail fast when the API key cannot be read. Continuing with an empty key would
// make every embedding request fail, and the run would still finish by
// overwriting the vectors file with an empty list.
$googleApiKeyFile = $projectRoot . "/apikey/google_vertex_api.txt";
$googleApiKey = is_readable($googleApiKeyFile)
    ? trim((string) file_get_contents($googleApiKeyFile))
    : '';
if ($googleApiKey === '') {
    die("Error: Google API key file is missing or empty.");
}

// --- GCS Configuration ---
$credentialsPath = $projectRoot . "/apikey/google_service_account.json";
$bucketName = 'codebridge-speech-audio-files';
$folderPrefix = 'tenant_knowledge_base/';

// Initialize Google Client (service-account credentials) and the Storage service wrapper.
$client = new Google_Client();
$client->setAuthConfig($credentialsPath);
$client->addScope(Google_Service_Storage::CLOUD_PLATFORM);
$storage = new Google_Service_Storage($client);

// Directories
$dataDir = __DIR__ . '/data';
// The second is_dir() covers the case where another process created the
// directory between the first check and mkdir().
if (!is_dir($dataDir) && !mkdir($dataDir, 0777, true) && !is_dir($dataDir)) {
    die("Error: unable to create data directory.");
}

// Output files: the embedding index and the progress snapshot for this run.
$vectorsFile = $dataDir . '/vectors.json';
$progressFile = $dataDir . '/progress.json';
|
|
|
|
// Helper: Update Progress
|
|
/**
 * Persist the current ingestion progress as a JSON snapshot.
 *
 * Overwrites $file with an object holding the keys `current`, `total`,
 * `last_title` and `start_time`.
 *
 * @param string $file      Path of the progress file to overwrite.
 * @param int    $current   Number of items handled so far.
 * @param int    $total     Total number of items in this run.
 * @param string $lastTitle Human-readable status line.
 * @param int    $startTime Start time of the run (callers in this file pass time()).
 *
 * @return void
 */
function updateProgress($file, $current, $total, $lastTitle, $startTime) {
    // Without this flag a status line containing bytes that are not valid UTF-8
    // makes json_encode() return false, and the progress file would be
    // overwritten with an empty string. With it, the offending value is
    // emitted as null and the snapshot stays valid JSON.
    $snapshot = json_encode([
        'current' => $current,
        'total' => $total,
        'last_title' => $lastTitle,
        'start_time' => $startTime,
    ], JSON_PARTIAL_OUTPUT_ON_ERROR);

    if ($snapshot === false) {
        // Nothing encodable: keep the previous snapshot rather than blanking the file.
        return;
    }

    file_put_contents($file, $snapshot);
}
|
|
|
|
// 1. Scan MD Files from GCS
// Collect the names of all objects under $folderPrefix that end in ".md".
// The listing is paginated, so keep requesting pages until the API stops
// returning a next-page token; reading only the first page would silently
// skip the remaining objects.
$mdFiles = [];
try {
    $listParams = ['prefix' => $folderPrefix];
    do {
        $objects = $storage->objects->listObjects($bucketName, $listParams);

        foreach ($objects->getItems() ?: [] as $object) {
            $name = $object->getName();

            // Filter out the folder itself
            if ($name === $folderPrefix) {
                continue;
            }

            // Only process .md files
            if (substr($name, -3) !== '.md') {
                continue;
            }

            $mdFiles[] = $name;
        }

        $nextPageToken = $objects->getNextPageToken();
        $listParams['pageToken'] = $nextPageToken;
    } while ($nextPageToken);
} catch (Exception $e) {
    die("Error listing GCS files: " . $e->getMessage());
}
|
|
|
|
$total = count($mdFiles);
$startTime = time();

updateProgress($progressFile, 0, $total, "Initialization...", $startTime);

$vectors = [];
$count = 0;

// The embedding endpoint is the same for every chunk, so build it once.
// It carries the API key in the query string: never write this URL to logs.
$embedUrl = "https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:embedContent?key=" . $googleApiKey;

foreach ($mdFiles as $gcsObjectName) {
    $fileName = basename($gcsObjectName); // Display name
    $count++;
    updateProgress($progressFile, $count, $total, "Processing: $fileName", $startTime);

    // Download Content from GCS
    try {
        // 'alt' => 'media' requests the object's content instead of its metadata.
        $content = $storage->objects->get($bucketName, $gcsObjectName, ['alt' => 'media']);

        // Depending on the client configuration the call may hand back a
        // response object rather than a string; unwrap its body in that case.
        if (is_object($content) && method_exists($content, 'getBody')) {
            $content = $content->getBody()->getContents();
        }
    } catch (Exception $e) {
        // Skip on error
        updateProgress($progressFile, $count, $total, "Error downloading $fileName: " . $e->getMessage(), $startTime);
        continue;
    }

    if (!is_string($content)) {
        // Anything that is still not a string cannot be chunked; skip the file
        // instead of letting preg_split() fail on it.
        updateProgress($progressFile, $count, $total, "Error downloading $fileName: unexpected response type", $startTime);
        continue;
    }

    // 2. Chunking Logic
    // Split in front of every Markdown header of level 1-3 (#, ##, ###) that
    // starts a line. A file without such headers stays a single chunk.
    $chunks = preg_split('/^(?=#{1,3}\s)/m', $content);
    if ($chunks === false) {
        // Regex engine failure: fall back to treating the file as one chunk.
        $chunks = [$content];
    }

    foreach ($chunks as $chunkIndex => $chunkText) {
        $chunkText = trim($chunkText);
        if (mb_strlen($chunkText) < 10) {
            continue; // Skip empty/tiny chunks
        }

        // Extract Title from Header if exists, else use Filename.
        // Strip "\r" so files with CRLF line endings do not leak it into the title.
        $firstLine = rtrim(explode("\n", $chunkText, 2)[0], "\r");
        $sectionTitle = $fileName;
        if (preg_match('/^#{1,3}\s+(.*)$/', $firstLine, $matches)) {
            $sectionTitle = trim($matches[1]) . " (" . $fileName . ")";
        }

        // 3. Embed (text-embedding-004 via the generativelanguage.googleapis.com endpoint)
        $payload = json_encode([
            'model' => 'models/text-embedding-004',
            'content' => ['parts' => [['text' => $chunkText]]],
        ]);
        if ($payload === false) {
            // Typically text that is not valid UTF-8; sending an empty body
            // would only produce an API error, so skip the request entirely.
            error_log("ingest: cannot encode chunk $chunkIndex of $fileName: " . json_last_error_msg());
            continue;
        }

        $ch = curl_init($embedUrl);
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $payload,
            CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
            // The script runs without an execution time limit, so a stalled
            // connection would otherwise block the whole run indefinitely.
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_TIMEOUT => 60,
        ]);
        $response = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $curlError = curl_error($ch);
        curl_close($ch);

        $respData = ($response !== false && $httpCode === 200)
            ? json_decode($response, true)
            : null;

        if (isset($respData['embedding']['values'])) {
            $vectors[] = [
                'id' => md5($fileName . $chunkIndex),
                'title' => $sectionTitle,
                'url' => $fileName, // Using filename as URL ID
                'text' => $chunkText,
                'vector' => $respData['embedding']['values'],
            ];
        } else {
            // Record the failure so missing chunks can be diagnosed afterwards.
            error_log("ingest: embedding failed for chunk $chunkIndex of $fileName (HTTP $httpCode) $curlError");
        }

        usleep(100000); // 0.1s rate limit
    }
}
|
|
|
|
// 4. Save Vectors
// Encode first, then publish through a temporary file and rename(), so that a
// failed encode or an interrupted write cannot leave the vectors file empty
// or truncated.
$vectorsJson = json_encode($vectors);
if ($vectorsJson === false) {
    $saveError = "Error encoding vectors: " . json_last_error_msg();
    updateProgress($progressFile, $total, $total, $saveError, $startTime);
    die($saveError);
}

$vectorsTmpFile = $vectorsFile . '.tmp';
if (file_put_contents($vectorsTmpFile, $vectorsJson) === false || !rename($vectorsTmpFile, $vectorsFile)) {
    // Do not leave a stale temporary file behind when publishing failed.
    if (is_file($vectorsTmpFile)) {
        unlink($vectorsTmpFile);
    }
    $saveError = "Error saving vectors file";
    updateProgress($progressFile, $total, $total, $saveError, $startTime);
    die($saveError);
}

updateProgress($progressFile, $total, $total, "Complete! (Local Saved)", $startTime);

echo "Processing Complete. " . count($vectors) . " vectors created.";
|
|
?>
|