- URL 하드코딩 → .env APP_URL 기반 동적 URL로 변경 - DB 연결 하드코딩 → .env 기반으로 변경 - MySQL strict mode DATE 오류 수정
178 lines
6.2 KiB
PHP
178 lines
6.2 KiB
PHP
<?php
// chatbot/rag/ingest.php
//
// Bootstrap for the ingestion script: lifts execution limits, loads the API
// keys, prepares the data directory and loads any previously saved vectors.

// CLI Environment Check
if (php_sapi_name() !== 'cli') {
    // If run from browser, detach process? For now, we assume user runs or we trigger via CLI.
}

ini_set('max_execution_time', 0); // unlimited
ini_set('memory_limit', '512M');

$projectRoot = dirname(__DIR__, 2);

// Reads and trims an API key file. A missing or empty key would only surface
// later as a stream of failed HTTP requests, so stop right away instead.
$readApiKey = static function (string $path): string {
    $raw = is_readable($path) ? file_get_contents($path) : false;
    $key = ($raw === false) ? '' : trim($raw);
    if ($key === '') {
        error_log("ingest.php: API key file is missing, unreadable or empty: {$path}");
        exit(1);
    }
    return $key;
};

$notionApiKey = $readApiKey($projectRoot . "/apikey/notion.txt");
$googleApiKey = $readApiKey($projectRoot . "/apikey/google_vertex_api.txt");

require_once dirname(__DIR__) . '/notion_client.php';

// Data Directories
$dataDir = __DIR__ . '/data';
if (!is_dir($dataDir)) mkdir($dataDir, 0777, true);

$vectorsFile = $dataDir . '/vectors.json';
$progressFile = $dataDir . '/progress.json';

// Load existing vectors if any (for resume)
$vectors = [];
if (file_exists($vectorsFile)) {
    $decodedVectors = json_decode((string) file_get_contents($vectorsFile), true);
    if (is_array($decodedVectors)) {
        $vectors = $decodedVectors;
    } else {
        // json_decode() yields null for a truncated/corrupt file; feeding that
        // to array_map() below would be a fatal TypeError on PHP 8.
        error_log("ingest.php: ignoring unreadable vectors file: {$vectorsFile}");
    }
}

// Page IDs that already have stored chunks. Chunk IDs have the form
// "<pageId>_<chunkIndex>", so the part before the first underscore is the page.
$processedIds = [];
foreach ($vectors as $vectorKey => $v) {
    if (is_array($v) && isset($v['id'])) {
        $processedIds[$vectorKey] = explode('_', (string) $v['id'])[0];
    }
}
$processedIds = array_unique($processedIds);
|
|
|
|
// Helper to update progress
|
|
/**
 * Overwrites the progress file with a JSON snapshot of the current run.
 *
 * The written object has exactly four keys, in this order:
 * "current", "total", "last_title", "start_time".
 *
 * @param string $file      Path of the progress file to (re)write.
 * @param mixed  $current   Value stored under "current".
 * @param mixed  $total     Value stored under "total".
 * @param mixed  $lastTitle Value stored under "last_title".
 * @param mixed  $startTime Value stored under "start_time".
 */
function updateProgress($file, $current, $total, $lastTitle, $startTime) {
    $snapshot = [];
    $snapshot['current'] = $current;
    $snapshot['total'] = $total;
    $snapshot['last_title'] = $lastTitle;
    $snapshot['start_time'] = $startTime;

    $encodedSnapshot = json_encode($snapshot);
    file_put_contents($file, $encodedSnapshot);
}
|
|
|
|
// 1. Fetch Pages
|
|
/**
 * Collects page objects from Notion's search endpoint, following pagination.
 *
 * Requests pages of up to 100 results, sorted by last edit time (newest
 * first), and keeps following "next_cursor" while the API reports "has_more".
 * On a transport error, a non-200 status or an undecodable body, the failure
 * is logged and paging stops; whatever was collected so far is returned.
 *
 * @param string $apiKey Notion integration token sent as a Bearer credential.
 * @return array List of page objects as decoded from the "results" arrays.
 */
function fetchAllNotionPages($apiKey) {
    $url = "https://api.notion.com/v1/search";
    $headers = [
        'Authorization: Bearer ' . $apiKey,
        'Notion-Version: 2022-06-28',
        'Content-Type: application/json'
    ];

    $pages = [];
    $nextCursor = null;

    do {
        $data = [
            'filter' => ['value' => 'page', 'property' => 'object'],
            'sort' => ['direction' => 'descending', 'timestamp' => 'last_edited_time'],
            'page_size' => 100
        ];
        if ($nextCursor) $data['start_cursor'] = $nextCursor;

        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($data));
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        // The script disables max_execution_time, so without these a stalled
        // connection would block the run forever.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 60);

        $response = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $transportError = curl_error($ch);
        curl_close($ch);

        if ($response === false) {
            error_log("fetchAllNotionPages: request failed: {$transportError}");
            break;
        }

        $result = json_decode($response, true);
        if ($httpCode !== 200 || !is_array($result)) {
            error_log("fetchAllNotionPages: unexpected response (HTTP {$httpCode}); stopping pagination");
            break;
        }

        if (isset($result['results']) && is_array($result['results'])) {
            foreach ($result['results'] as $pageObject) {
                $pages[] = $pageObject;
            }
        }

        $nextCursor = $result['next_cursor'] ?? null;
        // Require a usable cursor as well: "has_more" without one would make
        // the loop request the first page again and never terminate.
        $hasMore = !empty($result['has_more']) && $nextCursor;

        // Rate limit guard (only needed when another request follows)
        if ($hasMore) usleep(500000);
    } while ($hasMore);

    return $pages;
}
|
|
|
|
// Start
|
|
$startTime = time();
updateProgress($progressFile, 0, 0, "Fetching Page List...", $startTime);

$notionpages = fetchAllNotionPages($notionApiKey);
$total = count($notionpages);
updateProgress($progressFile, 0, $total, "Starting Processing...", $startTime);

$notionClient = new NotionClient($notionApiKey);
$count = 0;

// Splits text into pieces of at most $size characters. The fallback walks the
// string with mb_substr() so multi-byte characters are never cut in half
// (a byte-wise str_split() would produce invalid UTF-8).
$splitText = static function (string $text, int $size): array {
    if (function_exists('mb_str_split')) {
        return mb_str_split($text, $size);
    }
    $pieces = [];
    $length = mb_strlen($text);
    for ($offset = 0; $offset < $length; $offset += $size) {
        $pieces[] = mb_substr($text, $offset, $size);
    }
    return $pieces;
};

// Flattens the per-page groups back into the flat list stored on disk.
$flattenVectors = static function (array $grouped): array {
    $flat = [];
    foreach ($grouped as $entries) {
        foreach ($entries as $entry) {
            $flat[] = $entry;
        }
    }
    return $flat;
};

// Writes the vectors via a temp file + rename so an interrupted run cannot
// leave a half-written file, and never overwrites it with a failed encode.
$saveVectors = static function (string $path, array $allVectors): bool {
    $json = json_encode($allVectors);
    if ($json === false) {
        error_log('ingest.php: could not encode vectors: ' . json_last_error_msg());
        return false;
    }
    $tmpPath = $path . '.tmp';
    if (file_put_contents($tmpPath, $json) === false || !rename($tmpPath, $path)) {
        error_log("ingest.php: could not write vectors file: {$path}");
        return false;
    }
    return true;
};

// Group previously stored vectors by page ID (chunk IDs are
// "<pageId>_<chunkIndex>"). Every page is re-embedded below, so appending to
// the old flat list would duplicate each chunk on every run; grouping lets a
// page's old entries be replaced instead. Entries without an ID are kept
// untouched under the '' key.
$vectorsByPage = [];
foreach ((is_array($vectors) ? $vectors : []) as $storedEntry) {
    $ownerId = (is_array($storedEntry) && isset($storedEntry['id']))
        ? explode('_', (string) $storedEntry['id'])[0]
        : '';
    $vectorsByPage[$ownerId][] = $storedEntry;
}

foreach ($notionpages as $page) {
    $pageId = $page['id'];

    // All pages are processed on every run to keep the index fresh; progress
    // is saved periodically so an interrupted run keeps what it has done.

    // Title
    $title = $page['properties']['Name']['title'][0]['plain_text']
        ?? $page['properties']['title']['title'][0]['plain_text']
        ?? "Untitled";

    // Update Progress
    $count++;
    updateProgress($progressFile, $count, $total, $title, $startTime);

    // Content
    $content = $notionClient->getPageContent($pageId);
    $fullText = "Title: $title\n\n$content";

    // Chunking
    $chunks = $splitText($fullText, 500);

    $pageVectors = [];
    $hadFailure = false;

    foreach ($chunks as $chunkIndex => $chunkText) {
        if (mb_strlen(trim($chunkText)) < 10) continue;

        // Embed
        $url = "https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:embedContent?key=" . $googleApiKey;
        $data = ['model' => 'models/text-embedding-004', 'content' => ['parts' => [['text' => $chunkText]]]];

        $payload = json_encode($data);
        if ($payload === false) {
            $hadFailure = true;
            error_log("ingest.php: could not encode chunk {$chunkIndex} of page {$pageId}: " . json_last_error_msg());
            continue;
        }

        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
        curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
        // Execution time is unlimited, so bound each request explicitly.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 60);
        $response = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        $respData = ($response !== false && $httpCode === 200) ? json_decode($response, true) : null;
        if (isset($respData['embedding']['values'])) {
            $pageVectors[] = [
                'id' => $pageId . "_" . $chunkIndex,
                'title' => $title,
                'url' => $page['url'] ?? '',
                'text' => $chunkText,
                'vector' => $respData['embedding']['values']
            ];
        } else {
            $hadFailure = true;
            error_log("ingest.php: embedding failed for chunk {$chunkIndex} of page {$pageId} (HTTP {$httpCode})");
        }
        usleep(100000); // 0.1s delay
    }

    // Replace this page's stored chunks with the fresh ones. If some chunk
    // failed and an earlier complete set exists, keep the earlier set rather
    // than swapping it for a partial one.
    if (!$hadFailure || !isset($vectorsByPage[$pageId])) {
        $vectorsByPage[$pageId] = $pageVectors;
    }

    // Save periodically (every 10 pages) to prevent total loss
    if ($count % 10 == 0) {
        $vectors = $flattenVectors($vectorsByPage);
        $saveVectors($vectorsFile, $vectors);
    }
}

// Final Save
$vectors = $flattenVectors($vectorsByPage);
$saveVectors($vectorsFile, $vectors);
updateProgress($progressFile, $total, $total, "Uploading to Google Cloud Storage...", $startTime);

// GCS Upload
// NOTE(review): relative path, resolved via include_path / script directory — confirm where gcs_helper.php lives.
require_once 'gcs_helper.php';
try {
    $gcs = new GCSHelper();
    if ($gcs->getBucketName()) {
        $gcs->upload($vectorsFile, 'chatbot/vectors.json');
        updateProgress($progressFile, $total, $total, "Complete! (Saved to GCS)", $startTime);
        echo "Successfully uploaded to GCS: " . $gcs->getBucketName() . "/chatbot/vectors.json";
    } else {
        updateProgress($progressFile, $total, $total, "Complete! (Local Only - No Bucket Config)", $startTime);
    }
} catch (Exception $e) {
    echo "GCS Upload Error: " . $e->getMessage();
    updateProgress($progressFile, $total, $total, "Complete (Local Saved, GCS Error)", $startTime);
}
?>
|