setAuthConfig($credentialsPath);
$client->addScope(Google_Service_Storage::CLOUD_PLATFORM);
$storage = new Google_Service_Storage($client);

// Directories
// The local ./files directory is no longer used; Markdown sources are read from GCS.
$dataDir = __DIR__ . '/data';
// is_dir() is re-checked after a failed mkdir() so that a directory created
// concurrently by another process is not reported as an error.
if (!is_dir($dataDir) && !mkdir($dataDir, 0777, true) && !is_dir($dataDir)) {
    die("Error creating data directory: " . $dataDir);
}

$vectorsFile = $dataDir . '/vectors.json';
$progressFile = $dataDir . '/progress.json';

/**
 * Helper: write the current progress snapshot to a JSON file.
 *
 * The file is overwritten on every call with the keys `current`, `total`,
 * `last_title` and `start_time`.
 *
 * @param string $file      Path of the progress file to overwrite.
 * @param int    $current   Number of source files handled so far.
 * @param int    $total     Total number of source files to handle.
 * @param string $lastTitle Human-readable status message.
 * @param int    $startTime Unix timestamp taken when the run started.
 */
function updateProgress($file, $current, $total, $lastTitle, $startTime)
{
    // JSON_PARTIAL_OUTPUT_ON_ERROR keeps the snapshot writable even when the
    // status message contains bytes that are not valid UTF-8 (the offending
    // value is encoded as null instead of failing the whole document).
    $snapshot = json_encode([
        'current' => $current,
        'total' => $total,
        'last_title' => $lastTitle,
        'start_time' => $startTime,
    ], JSON_PARTIAL_OUTPUT_ON_ERROR);

    if ($snapshot === false) {
        // Leave the previous snapshot in place rather than truncating the file.
        return;
    }

    // LOCK_EX serialises concurrent writers of the progress file.
    file_put_contents($file, $snapshot, LOCK_EX);
}

// 1. Scan MD Files from GCS
$mdFiles = [];
try {
    $listParams = ['prefix' => $folderPrefix];
    do {
        $objects = $storage->objects->listObjects($bucketName, $listParams);
        $items = $objects->getItems();
        if ($items) {
            foreach ($items as $object) {
                $name = $object->getName();
                // Filter out the folder itself
                if ($name === $folderPrefix) {
                    continue;
                }
                // Only process .md files
                if (substr($name, -3) !== '.md') {
                    continue;
                }
                $mdFiles[] = $name;
            }
        }
        // The listing is paginated: keep requesting pages until the service
        // stops returning a continuation token.
        $nextPageToken = $objects->getNextPageToken();
        $listParams['pageToken'] = $nextPageToken;
    } while ($nextPageToken);
} catch (Exception $e) {
    die("Error listing GCS files: " . $e->getMessage());
}

$total = count($mdFiles);
$startTime = time();
updateProgress($progressFile, 0, $total, "Initialization...", $startTime);

// Embedding request settings. These do not change per chunk, so they are
// built once instead of inside the loops.
$embedUrl = "https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:embedContent?key="
    . rawurlencode($googleApiKey);
$embedHeaders = ['Content-Type: application/json'];
$maxEmbedAttempts = 3;

$vectors = [];
$count = 0;

foreach ($mdFiles as $gcsObjectName) {
    $fileName = basename($gcsObjectName); // Display name
    $count++;
    updateProgress($progressFile, $count, $total, "Processing: $fileName", $startTime);

    // Download Content from GCS
    try {
        // 'alt' => 'media' requests the object's content instead of its metadata.
        // Depending on the client version/config the call yields either the raw
        // string or a PSR-7 response object.
        $content = $storage->objects->get($bucketName, $gcsObjectName, ['alt' => 'media']);

        if (is_object($content) && method_exists($content, 'getBody')) {
            // Casting the body stream to string rewinds it first, so the whole
            // payload is returned even if part of the stream was already read.
            $content = (string) $content->getBody();
        }
    } catch (Exception $e) {
        // Skip on error
        updateProgress($progressFile, $count, $total, "Error downloading $fileName: " . $e->getMessage(), $startTime);
        continue;
    }

    if (!is_string($content)) {
        // Anything other than text here cannot be chunked; skip the file.
        updateProgress($progressFile, $count, $total, "Error downloading $fileName: unexpected response type", $startTime);
        continue;
    }

    // 2. Chunking Logic
    // Simple strategy: split in front of every level 1-3 Markdown header
    // (#, ##, ###) that starts a line. A file without such headers stays a
    // single chunk.
    $chunks = preg_split('/^(?=#{1,3}\s)/m', $content);
    if ($chunks === false) {
        error_log("Chunking failed for $fileName (PCRE error code " . preg_last_error() . ")");
        continue;
    }

    foreach ($chunks as $chunkIndex => $chunkText) {
        $chunkText = trim($chunkText);
        if (mb_strlen($chunkText) < 10) {
            continue; // Skip empty/tiny chunks
        }

        // Extract Title from Header if exists, else use Filename
        $lines = explode("\n", $chunkText);
        $sectionTitle = $fileName;
        if (preg_match('/^#{1,3}\s+(.*)$/', $lines[0], $matches)) {
            // trim() drops the "\r" left behind on CRLF files and trailing spaces.
            $sectionTitle = trim($matches[1]) . " (" . $fileName . ")";
        }

        // 3. Embed (Generative Language API, text-embedding-004)
        $data = ['model' => 'models/text-embedding-004', 'content' => ['parts' => [['text' => $chunkText]]]];
        $payload = json_encode($data);
        if ($payload === false) {
            // Typically invalid UTF-8 in the chunk; sending an empty body would only fail remotely.
            error_log("Skipping chunk $chunkIndex of $fileName: " . json_last_error_msg());
            continue;
        }

        $embedding = null;
        for ($attempt = 1; $attempt <= $maxEmbedAttempts; $attempt++) {
            $ch = curl_init($embedUrl);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_POST, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
            curl_setopt($ch, CURLOPT_HTTPHEADER, $embedHeaders);
            // Bound the request so one stalled connection cannot hang the whole run.
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
            curl_setopt($ch, CURLOPT_TIMEOUT, 30);

            $response = curl_exec($ch);
            $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $curlError = curl_error($ch);
            curl_close($ch);

            if ($response !== false && $httpCode === 200) {
                $respData = json_decode($response, true);
                if (isset($respData['embedding']['values'])) {
                    $embedding = $respData['embedding']['values'];
                } else {
                    error_log("Embedding response for \"$sectionTitle\" had no embedding values");
                }
                break;
            }

            // Transport failures, rate limiting and server errors are worth retrying;
            // other client errors will not succeed on a second try.
            $retryable = $response === false || $httpCode === 429 || $httpCode >= 500;
            if (!$retryable || $attempt === $maxEmbedAttempts) {
                $reason = $response === false ? "cURL error: $curlError" : "HTTP $httpCode";
                error_log("Embedding failed for \"$sectionTitle\" ($reason)");
                break;
            }

            usleep(500000 * $attempt); // linear backoff: 0.5s, 1s, ...
        }

        if ($embedding !== null) {
            $vectors[] = [
                // NOTE(review): the id is derived from the base name only, so two objects
                // with the same file name under different sub-prefixes produce the same
                // ids. Left unchanged so existing ids stay stable.
                'id' => md5($fileName . $chunkIndex),
                'title' => $sectionTitle,
                'url' => $fileName, // Using filename as URL ID
                'text' => $chunkText,
                'vector' => $embedding,
            ];
        }

        usleep(100000); // 0.1s rate limit
    }
}

// 4. Save Vectors
$encodedVectors = json_encode($vectors);
if ($encodedVectors === false) {
    // Writing the false return value would replace the store with an empty file.
    $saveError = "Error saving vectors: " . json_last_error_msg();
    updateProgress($progressFile, $total, $total, $saveError, $startTime);
    die($saveError);
}
if (file_put_contents($vectorsFile, $encodedVectors, LOCK_EX) === false) {
    $saveError = "Error saving vectors: could not write " . $vectorsFile;
    updateProgress($progressFile, $total, $total, $saveError, $startTime);
    die($saveError);
}

updateProgress($progressFile, $total, $total, "Complete! (Local Saved)", $startTime);
echo "Processing Complete. " . count($vectors) . " vectors created.";
?>