Files
sam-kd/chatbot/md_rag/ingest.php

143 lines
5.2 KiB
PHP
Raw Permalink Normal View History

<?php
// chatbot/md_rag/ingest.php
// Bootstrap: Composer autoloader, which provides the Google API client classes.
require_once($_SERVER['DOCUMENT_ROOT'] . '/vendor/autoload.php');

// One HTTP request is made per chunk, so the run can be long: remove the
// execution time limit and raise the memory ceiling for the in-memory vector list.
ini_set('max_execution_time', 0);
ini_set('memory_limit', '512M');

$projectRoot = dirname(__DIR__, 2);

// Load the embedding API key. Fail fast when the key file is missing or empty:
// otherwise every embedding request would be rejected and the run would end by
// replacing the existing vectors file with an empty list.
$googleApiKeyRaw = @file_get_contents($projectRoot . "/apikey/google_vertex_api.txt");
if ($googleApiKeyRaw === false || trim($googleApiKeyRaw) === '') {
    die("Error: Google API key file is missing or empty.");
}
$googleApiKey = trim($googleApiKeyRaw);

// --- GCS Configuration ---
$credentialsPath = $projectRoot . "/apikey/google_service_account.json";
$bucketName = 'codebridge-speech-audio-files';
$folderPrefix = 'tenant_knowledge_base/';

// Initialize the Google client with the service-account credentials and build
// the Cloud Storage service wrapper used for listing and downloading objects.
$client = new Google_Client();
$client->setAuthConfig($credentialsPath);
$client->addScope(Google_Service_Storage::CLOUD_PLATFORM);
$storage = new Google_Service_Storage($client);

// Output directory for the generated vectors and the progress snapshot.
// The trailing is_dir() re-check tolerates another process creating the
// directory between the first check and mkdir().
$dataDir = __DIR__ . '/data';
if (!is_dir($dataDir) && !mkdir($dataDir, 0777, true) && !is_dir($dataDir)) {
    die("Error: could not create data directory.");
}
$vectorsFile = $dataDir . '/vectors.json';
$progressFile = $dataDir . '/progress.json';
/**
 * Write a JSON progress snapshot to disk.
 *
 * The snapshot has the keys `current`, `total`, `last_title` and `start_time`.
 *
 * @param string $file      Path of the progress file to overwrite.
 * @param int    $current   Number of items handled so far.
 * @param int    $total     Total number of items.
 * @param string $lastTitle Status text (may contain file names or error messages).
 * @param int    $startTime Unix timestamp at which the run started.
 */
function updateProgress($file, $current, $total, $lastTitle, $startTime)
{
    // $lastTitle can carry arbitrary bytes (file names, exception messages).
    // Without the substitute flag json_encode() returns false on invalid UTF-8
    // and the progress file would be overwritten with an empty string.
    $snapshot = json_encode([
        'current' => $current,
        'total' => $total,
        'last_title' => $lastTitle,
        'start_time' => $startTime,
    ], JSON_INVALID_UTF8_SUBSTITUTE);

    if ($snapshot === false) {
        // Keep the previous snapshot rather than clobbering it with nothing.
        return;
    }

    // LOCK_EX serialises concurrent writers and lets readers that use flock()
    // avoid observing a half-written file.
    file_put_contents($file, $snapshot, LOCK_EX);
}
// 1. Scan MD Files from GCS
// Collect the object names of every ".md" file under $folderPrefix.
// The listing is paginated: keep requesting pages until the service stops
// returning a next-page token, otherwise objects beyond the first page
// would never be ingested.
$mdFiles = [];
try {
    $listParams = ['prefix' => $folderPrefix];
    do {
        $objects = $storage->objects->listObjects($bucketName, $listParams);
        $items = $objects->getItems();
        if ($items) {
            foreach ($items as $object) {
                $name = $object->getName();
                // Filter out the folder placeholder object itself
                if ($name === $folderPrefix) {
                    continue;
                }
                // Only process .md files
                if (substr($name, -3) !== '.md') {
                    continue;
                }
                $mdFiles[] = $name;
            }
        }
        $pageToken = $objects->getNextPageToken();
        $listParams['pageToken'] = $pageToken;
    } while ($pageToken);
} catch (Exception $e) {
    die("Error listing GCS files: " . $e->getMessage());
}
$total = count($mdFiles);
$startTime = time();
updateProgress($progressFile, 0, $total, "Initialization...", $startTime);

$vectors = [];
$count = 0;

// Embedding endpoint (Generative Language API, model text-embedding-004).
// The URL carries the API key, so it must never be written to logs.
$embedUrl = "https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:embedContent?key=" . $googleApiKey;

/**
 * Request an embedding for one chunk of text.
 *
 * Transport errors, HTTP 429 and HTTP 5xx are retried (3 attempts in total,
 * sleeping 1s then 2s); any other failure is logged and gives up immediately.
 *
 * @param string $text Chunk text to embed.
 * @return array|null The embedding values, or null when no embedding was obtained.
 */
$embedChunk = static function ($text) use ($embedUrl) {
    $payload = json_encode([
        'model' => 'models/text-embedding-004',
        'content' => ['parts' => [['text' => $text]]],
    ]);
    if ($payload === false) {
        // e.g. invalid UTF-8 in the chunk; sending an empty body would be pointless.
        error_log('md_rag ingest: cannot JSON-encode chunk: ' . json_last_error_msg());
        return null;
    }

    $maxAttempts = 3;
    for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
        $ch = curl_init($embedUrl);
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $payload,
            CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
            // The script has no execution time limit, so a stalled connection
            // must be bounded here or it would hang the whole run.
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_TIMEOUT => 60,
        ]);
        $response = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $curlError = curl_error($ch);
        curl_close($ch);

        if ($response !== false && $httpCode === 200) {
            $respData = json_decode($response, true);
            if (isset($respData['embedding']['values'])) {
                return $respData['embedding']['values'];
            }
            error_log('md_rag ingest: embedding response did not contain embedding values');
            return null;
        }

        $retryable = $response === false || $httpCode === 429 || $httpCode >= 500;
        if (!$retryable || $attempt === $maxAttempts) {
            error_log("md_rag ingest: embedding request failed (HTTP $httpCode) $curlError");
            return null;
        }
        sleep($attempt); // simple backoff before the next attempt
    }

    return null;
};

foreach ($mdFiles as $gcsObjectName) {
    $fileName = basename($gcsObjectName); // Display name
    $count++;
    updateProgress($progressFile, $count, $total, "Processing: $fileName", $startTime);

    // Download the object content ('alt' => 'media' requests the file body
    // instead of its metadata). Depending on the client configuration the call
    // yields either the body or a response object exposing getBody().
    try {
        $content = $storage->objects->get($bucketName, $gcsObjectName, ['alt' => 'media']);
        if (is_object($content) && method_exists($content, 'getBody')) {
            $content = $content->getBody()->getContents();
        }
    } catch (Exception $e) {
        // Skip this file on error
        updateProgress($progressFile, $count, $total, "Error downloading $fileName: " . $e->getMessage(), $startTime);
        continue;
    }
    if (!is_string($content)) {
        // Anything else cannot be chunked; skip rather than crash in preg_split().
        updateProgress($progressFile, $count, $total, "Error downloading $fileName: unexpected response type", $startTime);
        continue;
    }

    // 2. Chunking Logic
    // Split in front of every Markdown heading of level 1-3 (#, ##, ###) at
    // the start of a line. A file without such headings stays a single chunk.
    $chunks = preg_split('/^(?=#{1,3}\s)/m', $content);
    if ($chunks === false) {
        updateProgress($progressFile, $count, $total, "Error chunking $fileName", $startTime);
        continue;
    }

    // Path relative to the scanned folder. It equals $fileName for files
    // directly under the prefix (so their chunk IDs are the same as before),
    // but keeps same-named files in different sub-folders from colliding.
    $relativeName = strncmp($gcsObjectName, $folderPrefix, strlen($folderPrefix)) === 0
        ? substr($gcsObjectName, strlen($folderPrefix))
        : $gcsObjectName;

    foreach ($chunks as $chunkIndex => $chunkText) {
        $chunkText = trim($chunkText);
        if (mb_strlen($chunkText) < 10) {
            continue; // Skip empty/tiny chunks
        }

        // Use the heading text as the title when the chunk starts with one,
        // otherwise fall back to the file name. trim() drops the "\r" left
        // behind by CRLF line endings.
        $lines = explode("\n", $chunkText);
        $sectionTitle = $fileName;
        if (preg_match('/^#{1,3}\s+(.*)$/', $lines[0], $matches)) {
            $sectionTitle = trim($matches[1]) . " (" . $fileName . ")";
        }

        // 3. Embed
        $embedding = $embedChunk($chunkText);
        if ($embedding !== null) {
            $vectors[] = [
                'id' => md5($relativeName . $chunkIndex),
                'title' => $sectionTitle,
                'url' => $fileName, // Using filename as URL ID
                'text' => $chunkText,
                'vector' => $embedding,
            ];
        } else {
            error_log("md_rag ingest: no embedding stored for chunk $chunkIndex of $gcsObjectName");
        }

        usleep(100000); // 0.1s pause between requests (rate limiting)
    }
}
// 4. Save Vectors
// Encode first and verify the result: writing the raw return value of a failed
// json_encode() would replace the existing vectors file with an empty one
// while still reporting success.
$vectorsJson = json_encode($vectors);
if ($vectorsJson === false) {
    $encodeError = json_last_error_msg();
    updateProgress($progressFile, $total, $total, "Error encoding vectors: " . $encodeError, $startTime);
    die("Error encoding vectors: " . $encodeError);
}

// Write to a temporary file in the same directory and rename it into place,
// so a reader never sees a partially written vectors file and a failed write
// leaves the previous file untouched.
$vectorsTmpFile = $vectorsFile . '.tmp';
if (file_put_contents($vectorsTmpFile, $vectorsJson, LOCK_EX) === false
    || !rename($vectorsTmpFile, $vectorsFile)) {
    updateProgress($progressFile, $total, $total, "Error writing vectors file", $startTime);
    die("Error writing vectors file.");
}

updateProgress($progressFile, $total, $total, "Complete! (Local Saved)", $startTime);
echo "Processing Complete. " . count($vectors) . " vectors created.";
?>