$current, 'total' => $total, 'last_title' => $lastTitle, 'start_time' => $startTime ])); }

// 1. Fetch Pages

/**
 * Fetch every page returned by Notion's search endpoint.
 *
 * Pages are requested 100 at a time, most recently edited first, following
 * `next_cursor` until the API reports no further results. The function is
 * best-effort: on a transport failure, a non-200 response, or an undecodable
 * body it logs the problem and returns the pages collected so far.
 *
 * @param string $apiKey Notion token, sent as a Bearer credential.
 * @return array List of decoded page objects (associative arrays).
 */
function fetchAllNotionPages($apiKey)
{
    $url = "https://api.notion.com/v1/search";
    $headers = [
        'Authorization: Bearer ' . $apiKey,
        'Notion-Version: 2022-06-28',
        'Content-Type: application/json'
    ];
    $maxRateLimitRetries = 3;

    $pages = [];
    $nextCursor = null;
    $rateLimitRetries = 0;

    while (true) {
        $data = [
            'filter' => ['value' => 'page', 'property' => 'object'],
            'sort' => ['direction' => 'descending', 'timestamp' => 'last_edited_time'],
            'page_size' => 100
        ];
        if ($nextCursor) {
            $data['start_cursor'] = $nextCursor;
        }

        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($data));
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        // Without timeouts a stalled connection would block the whole run.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 60);

        $response = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $curlError = curl_error($ch);
        curl_close($ch);

        if ($response === false) {
            error_log("Notion search request failed: " . $curlError);
            break;
        }

        // Rate limited: back off a little longer each time, then retry the same cursor.
        if ($httpCode === 429 && $rateLimitRetries < $maxRateLimitRetries) {
            $rateLimitRetries++;
            sleep($rateLimitRetries);
            continue;
        }
        $rateLimitRetries = 0;

        $result = json_decode($response, true);
        if ($httpCode !== 200 || !is_array($result)) {
            error_log("Notion search returned HTTP " . $httpCode . "; stopping with " . count($pages) . " pages collected");
            break;
        }

        if (isset($result['results']) && is_array($result['results'])) {
            // Append in place; array_merge() would re-copy the growing list on every page.
            foreach ($result['results'] as $pageObject) {
                $pages[] = $pageObject;
            }
        }

        $nextCursor = $result['next_cursor'] ?? null;
        // Requiring a cursor prevents re-requesting the first page forever
        // if has_more is set but no cursor is supplied.
        if (empty($result['has_more']) || !$nextCursor) {
            break;
        }

        // Rate limit guard (only needed when another request follows)
        usleep(500000);
    }

    return $pages;
}

// Start
$startTime = time();
updateProgress($progressFile, 0, 0, "Fetching Page List...", $startTime);

$notionpages = fetchAllNotionPages($notionApiKey);
$total = count($notionpages);
updateProgress($progressFile, 0, $total, "Starting Processing...", $startTime);

$notionClient = new NotionClient($notionApiKey);
$count = 0;

// NOTE(review): $vectors is presumably initialised earlier in the file; this
// guard only ensures the saves below never encode an undefined variable.
if (!isset($vectors) || !is_array($vectors)) {
    $vectors = [];
}

// Writes the vectors as JSON. If encoding fails the existing file is left
// untouched: writing json_encode()'s `false` would truncate it to nothing.
$saveVectors = function ($path, $vectorList) {
    $json = json_encode($vectorList);
    if ($json === false) {
        error_log("Vector save skipped, JSON encoding failed: " . json_last_error_msg());
        return false;
    }
    if (file_put_contents($path, $json) === false) {
        error_log("Vector save failed: could not write " . $path);
        return false;
    }
    return true;
};

// Loop-invariant embedding endpoint settings.
$embedUrl = "https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:embedContent?key=" . $googleApiKey;
$embedMaxAttempts = 3;
$chunkSize = 500;

foreach ($notionpages as $page) {
    $pageId = $page['id'];

    // Resume logic (skipping ids already processed) is intentionally not
    // applied: every page is re-processed to ensure freshness, and progress
    // is saved often to limit the loss if the run times out.

    // Title: locate the title property (typed 'title', or named 'Name' /
    // 'title') and join all of its rich-text segments, so a title with inline
    // formatting is not cut off after its first segment.
    $title = "Untitled";
    $properties = (isset($page['properties']) && is_array($page['properties'])) ? $page['properties'] : [];
    foreach ($properties as $propertyName => $property) {
        if (!is_array($property) || !isset($property['title']) || !is_array($property['title'])) {
            continue;
        }
        $isTitleProperty = (($property['type'] ?? null) === 'title')
            || $propertyName === 'Name'
            || $propertyName === 'title';
        if (!$isTitleProperty) {
            continue;
        }
        $joined = '';
        foreach ($property['title'] as $segment) {
            $joined .= $segment['plain_text'] ?? '';
        }
        if ($joined !== '') {
            $title = $joined;
        }
        break;
    }

    // Update Progress
    $count++;
    updateProgress($progressFile, $count, $total, $title, $startTime);

    // Content
    $content = $notionClient->getPageContent($pageId);
    $fullText = "Title: $title\n\n$content";

    // Chunking: always split on characters, never bytes. A byte-wise split can
    // cut a multi-byte character in half, and the resulting invalid UTF-8
    // makes json_encode() fail for both the request and the saved file.
    if (function_exists('mb_str_split')) {
        $chunks = mb_str_split($fullText, $chunkSize);
    } else {
        $chunks = [];
        $textLength = mb_strlen($fullText);
        for ($offset = 0; $offset < $textLength; $offset += $chunkSize) {
            $chunks[] = mb_substr($fullText, $offset, $chunkSize);
        }
    }

    foreach ($chunks as $chunkIndex => $chunkText) {
        if (mb_strlen(trim($chunkText)) < 10) {
            continue;
        }

        // Embed
        $data = ['model' => 'models/text-embedding-004', 'content' => ['parts' => [['text' => $chunkText]]]];
        $payload = json_encode($data);
        if ($payload === false) {
            error_log("Skipping chunk " . $pageId . "_" . $chunkIndex . ": " . json_last_error_msg());
            continue;
        }

        $respData = null;
        for ($attempt = 1; $attempt <= $embedMaxAttempts; $attempt++) {
            $ch = curl_init($embedUrl);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_POST, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
            curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
            curl_setopt($ch, CURLOPT_TIMEOUT, 30);

            $response = curl_exec($ch);
            $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
            curl_close($ch);

            if ($response !== false && $httpCode === 200) {
                $respData = json_decode($response, true);
                break;
            }

            // Retry only failures that may be transient: transport errors,
            // rate limiting, and server-side errors.
            $retryable = ($response === false) || $httpCode === 429 || $httpCode >= 500;
            if (!$retryable || $attempt === $embedMaxAttempts) {
                // The URL is deliberately not logged: it contains the API key.
                error_log("Embedding failed for chunk " . $pageId . "_" . $chunkIndex . " (HTTP " . $httpCode . ")");
                break;
            }
            sleep($attempt);
        }

        if (isset($respData['embedding']['values'])) {
            $vectors[] = [
                'id' => $pageId . "_" . $chunkIndex,
                'title' => $title,
                'url' => $page['url'] ?? '',
                'text' => $chunkText,
                'vector' => $respData['embedding']['values']
            ];
        }

        usleep(100000); // 0.1s delay
    }

    // Save periodically (every 10 pages) to prevent total loss
    if ($count % 10 === 0) {
        $saveVectors($vectorsFile, $vectors);
    }
}

// Final Save
$saveVectors($vectorsFile, $vectors);
updateProgress($progressFile, $total, $total, "Uploading to Google Cloud Storage...", $startTime);

// GCS Upload
require_once 'gcs_helper.php';
try {
    $gcs = new GCSHelper();
    if ($gcs->getBucketName()) {
        $gcs->upload($vectorsFile, 'chatbot/vectors.json');
        updateProgress($progressFile, $total, $total, "Complete! (Saved to GCS)", $startTime);
        echo "Successfully uploaded to GCS: " . $gcs->getBucketName() . "/chatbot/vectors.json";
    } else {
        updateProgress($progressFile, $total, $total, "Complete! (Local Only - No Bucket Config)", $startTime);
    }
} catch (Exception $e) {
    // The local vectors file has already been written; only the upload failed.
    echo "GCS Upload Error: " . $e->getMessage();
    updateProgress($progressFile, $total, $total, "Complete (Local Saved, GCS Error)", $startTime);
}
?>