Files
sam-kd/chatbot/rag/ingest.php
hskwon aca1767eb9 초기 커밋: 5130 레거시 시스템
- URL 하드코딩 → .env APP_URL 기반 동적 URL로 변경
- DB 연결 하드코딩 → .env 기반으로 변경
- MySQL strict mode DATE 오류 수정
2025-12-10 20:14:31 +09:00

178 lines
6.2 KiB
PHP

<?php
// chatbot/rag/ingest.php
//
// Ingestion script for the RAG pipeline: fetches every Notion page,
// chunks the text, embeds each chunk with the Google embedding API and
// writes the resulting vectors to data/vectors.json (then uploads to GCS).

// CLI Environment Check
if (php_sapi_name() !== 'cli') {
    // If run from browser, detach process? For now, we assume user runs or we trigger via CLI.
}
ini_set('max_execution_time', 0); // unlimited — a full ingestion run can take many minutes
ini_set('memory_limit', '512M');

$projectRoot = dirname(__DIR__, 2);

// Load API keys. file_get_contents() returns false for a missing/unreadable
// file, which trim() would silently coerce to "" — fail fast with a clear
// message instead of issuing unauthenticated API calls.
$notionKeyRaw = file_get_contents($projectRoot . "/apikey/notion.txt");
$googleKeyRaw = file_get_contents($projectRoot . "/apikey/google_vertex_api.txt");
if ($notionKeyRaw === false || $googleKeyRaw === false) {
    fwrite(STDERR, "API key file missing under {$projectRoot}/apikey/\n");
    exit(1);
}
$notionApiKey = trim($notionKeyRaw);
$googleApiKey = trim($googleKeyRaw);

require_once dirname(__DIR__) . '/notion_client.php';

// Data Directories
$dataDir = __DIR__ . '/data';
if (!is_dir($dataDir)) mkdir($dataDir, 0777, true);
$vectorsFile = $dataDir . '/vectors.json';
$progressFile = $dataDir . '/progress.json';

// Load existing vectors if any (for resume). Guard against a corrupt or
// empty file: json_decode() returns null there, which would otherwise break
// the array_map() below and every later append to $vectors.
$vectors = [];
if (file_exists($vectorsFile)) {
    $decoded = json_decode(file_get_contents($vectorsFile), true);
    if (is_array($decoded)) $vectors = $decoded;
}

// Page ids that already have at least one stored chunk (vector ids have the
// form "<pageId>_<chunkIndex>"); kept for the currently disabled resume logic.
$processedIds = array_unique(array_map(
    function ($v) { return explode('_', $v['id'])[0]; },
    $vectors
));
// Helper to update progress
// Writes a JSON progress snapshot for a polling reader (progress UI).
// LOCK_EX prevents the reader from observing a partially written file;
// JSON_UNESCAPED_UNICODE keeps Korean page titles human-readable.
function updateProgress($file, $current, $total, $lastTitle, $startTime) {
    file_put_contents($file, json_encode([
        'current' => $current,
        'total' => $total,
        'last_title' => $lastTitle,
        'start_time' => $startTime
    ], JSON_UNESCAPED_UNICODE), LOCK_EX);
}
// 1. Fetch Pages
/**
 * Fetch every page visible to the integration via the Notion search API,
 * following cursor pagination until has_more is false.
 *
 * @param string $apiKey Notion integration token.
 * @return array         Raw Notion page objects (may be empty or partial on
 *                       transport failure — the failure is logged to STDERR).
 */
function fetchAllNotionPages($apiKey) {
    $pages = [];
    $hasMore = true;
    $nextCursor = null;
    while ($hasMore) {
        $url = "https://api.notion.com/v1/search";
        $data = [
            'filter' => ['value' => 'page', 'property' => 'object'],
            'sort' => ['direction' => 'descending', 'timestamp' => 'last_edited_time'],
            'page_size' => 100
        ];
        if ($nextCursor) $data['start_cursor'] = $nextCursor;
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($data));
        curl_setopt($ch, CURLOPT_TIMEOUT, 30); // don't hang forever on a dead connection
        curl_setopt($ch, CURLOPT_HTTPHEADER, [
            'Authorization: Bearer ' . $apiKey,
            'Notion-Version: 2022-06-28',
            'Content-Type: application/json'
        ]);
        $response = curl_exec($ch);
        $curlErr = curl_error($ch);
        curl_close($ch);
        // Stop explicitly on transport failure instead of silently decoding
        // `false` (which previously just ended the loop with no diagnostics).
        if ($response === false) {
            fwrite(STDERR, "Notion search request failed: {$curlErr}\n");
            break;
        }
        $result = json_decode($response, true);
        if (isset($result['results'])) $pages = array_merge($pages, $result['results']);
        $hasMore = $result['has_more'] ?? false;
        $nextCursor = $result['next_cursor'] ?? null;
        // Rate limit guard: 0.5s pause between search requests
        usleep(500000);
    }
    return $pages;
}
// Start: fetch the full page list, then chunk + embed each page.
$startTime = time();
updateProgress($progressFile, 0, 0, "Fetching Page List...", $startTime);

$notionpages = fetchAllNotionPages($notionApiKey);
$total = count($notionpages);
updateProgress($progressFile, 0, $total, "Starting Processing...", $startTime);

$notionClient = new NotionClient($notionApiKey);
$count = 0;
foreach ($notionpages as $index => $page) {
    $pageId = $page['id'];

    // Resume logic intentionally disabled: every page is re-processed so the
    // index stays fresh after edits; the periodic save below limits lost work
    // on a crash. $processedIds (built at startup) is available if skip-based
    // resume is re-enabled later.

    // Title: database pages expose it under the 'Name' property, plain pages
    // under 'title'; anything else falls back to "Untitled".
    $title = "Untitled";
    if (isset($page['properties']['Name']['title'][0]['plain_text'])) {
        $title = $page['properties']['Name']['title'][0]['plain_text'];
    } elseif (isset($page['properties']['title']['title'][0]['plain_text'])) {
        $title = $page['properties']['title']['title'][0]['plain_text'];
    }

    // Update Progress
    $count++;
    updateProgress($progressFile, $count, $total, $title, $startTime);

    // Content
    $content = $notionClient->getPageContent($pageId);
    $fullText = "Title: $title\n\n$content";

    // Chunking into fixed 500-character pieces. mb_str_split keeps multi-byte
    // (Korean) characters intact; str_split is a byte-wise last-resort fallback.
    $chunks = function_exists('mb_str_split') ? mb_str_split($fullText, 500) : str_split($fullText, 500);

    foreach ($chunks as $chunkIndex => $chunkText) {
        if (mb_strlen(trim($chunkText)) < 10) continue; // skip near-empty chunks

        // Embed the chunk via the Google text-embedding-004 endpoint.
        $url = "https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:embedContent?key=" . $googleApiKey;
        $data = ['model' => 'models/text-embedding-004', 'content' => ['parts' => [['text' => $chunkText]]]];
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($data));
        curl_setopt($ch, CURLOPT_TIMEOUT, 30); // one hung request must not wedge the whole run
        curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
        $response = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        // Non-200 or malformed responses drop the chunk silently (best effort);
        // the rest of the page still gets indexed.
        if ($httpCode == 200) {
            $respData = json_decode($response, true);
            if (isset($respData['embedding']['values'])) {
                $vectors[] = [
                    'id' => $pageId . "_" . $chunkIndex, // "<pageId>_<chunkIndex>"
                    'title' => $title,
                    'url' => $page['url'] ?? '',
                    'text' => $chunkText,
                    'vector' => $respData['embedding']['values']
                ];
            }
        }
        usleep(100000); // 0.1s delay between embedding calls (rate-limit guard)
    }

    // Save periodically (every 10 pages) to prevent total loss on a crash.
    // JSON_UNESCAPED_UNICODE keeps Korean text readable and avoids the ~3x
    // size inflation of default \uXXXX escaping; LOCK_EX guards concurrent
    // readers against a half-written file.
    if ($count % 10 == 0) {
        file_put_contents($vectorsFile, json_encode($vectors, JSON_UNESCAPED_UNICODE), LOCK_EX);
    }
}
// Final Save (JSON_UNESCAPED_UNICODE: readable Korean text, smaller file;
// LOCK_EX: no half-written file for concurrent readers).
file_put_contents($vectorsFile, json_encode($vectors, JSON_UNESCAPED_UNICODE), LOCK_EX);
updateProgress($progressFile, $total, $total, "Uploading to Google Cloud Storage...", $startTime);

// GCS Upload. Resolve the helper relative to this script: a bare
// require_once 'gcs_helper.php' depends on include_path / the caller's cwd
// and breaks when the script is launched from another directory.
require_once __DIR__ . '/gcs_helper.php';
try {
    $gcs = new GCSHelper();
    if ($gcs->getBucketName()) {
        $gcs->upload($vectorsFile, 'chatbot/vectors.json');
        updateProgress($progressFile, $total, $total, "Complete! (Saved to GCS)", $startTime);
        echo "Successfully uploaded to GCS: " . $gcs->getBucketName() . "/chatbot/vectors.json";
    } else {
        // No bucket configured — the local vectors.json is still the result.
        updateProgress($progressFile, $total, $total, "Complete! (Local Only - No Bucket Config)", $startTime);
    }
} catch (Exception $e) {
    // Upload failure is non-fatal: the vectors were already saved locally.
    echo "GCS Upload Error: " . $e->getMessage();
    updateProgress($progressFile, $total, $total, "Complete (Local Saved, GCS Error)", $startTime);
}
// Closing ?> intentionally omitted (PSR-12): avoids accidental trailing
// whitespace output from a PHP-only file.