diff --git a/app/Config/services.php b/app/Config/services.php index d7345823150..aafe0bacc99 100644 --- a/app/Config/services.php +++ b/app/Config/services.php @@ -22,6 +22,18 @@ // Callback URL for social authentication methods 'callback_url' => env('APP_URL', false), + // LLM Service + // Options: openai + 'llm' => env('LLM_SERVICE', ''), + + // OpenAI API-compatible service details + 'openai' => [ + 'endpoint' => env('OPENAI_ENDPOINT', 'https://api.openai.com'), + 'key' => env('OPENAI_KEY', ''), + 'embedding_model' => env('OPENAI_EMBEDDING_MODEL', 'text-embedding-3-small'), + 'query_model' => env('OPENAI_QUERY_MODEL', 'gpt-4o'), + ], + 'github' => [ 'client_id' => env('GITHUB_APP_ID', false), 'client_secret' => env('GITHUB_APP_SECRET', false), diff --git a/app/Console/Commands/RegenerateVectorsCommand.php b/app/Console/Commands/RegenerateVectorsCommand.php new file mode 100644 index 00000000000..26259e94345 --- /dev/null +++ b/app/Console/Commands/RegenerateVectorsCommand.php @@ -0,0 +1,46 @@ +delete(); + + $types = $entityProvider->all(); + foreach ($types as $type => $typeInstance) { + $this->info("Creating jobs to store vectors for {$type} data..."); + /** @var Entity[] $entities */ + $typeInstance->newQuery()->chunkById(100, function ($entities) { + foreach ($entities as $entity) { + dispatch(new StoreEntityVectorsJob($entity)); + } + }); + } + } +} diff --git a/app/Search/Queries/EntityVectorGenerator.php b/app/Search/Queries/EntityVectorGenerator.php new file mode 100644 index 00000000000..741f1827af4 --- /dev/null +++ b/app/Search/Queries/EntityVectorGenerator.php @@ -0,0 +1,89 @@ +vectorQueryServiceProvider->get(); + + $text = $this->entityToPlainText($entity); + $chunks = $this->chunkText($text); + $embeddings = $this->chunksToEmbeddings($chunks, $vectorService); + + $this->deleteExistingEmbeddingsForEntity($entity); + $this->storeEmbeddings($embeddings, $chunks, $entity); + } + + protected function deleteExistingEmbeddingsForEntity(Entity $entity): 
void + { + SearchVector::query() + ->where('entity_type', '=', $entity->getMorphClass()) + ->where('entity_id', '=', $entity->id) + ->delete(); + } + + protected function storeEmbeddings(array $embeddings, array $textChunks, Entity $entity): void + { + $toInsert = []; + + foreach ($embeddings as $index => $embedding) { + $text = $textChunks[$index]; + $toInsert[] = [ + 'entity_id' => $entity->id, + 'entity_type' => $entity->getMorphClass(), + 'embedding' => DB::raw('VEC_FROMTEXT("[' . implode(',', $embedding) . ']")'), + 'text' => $text, + ]; + } + + $chunks = array_chunk($toInsert, 500); + foreach ($chunks as $chunk) { + SearchVector::query()->insert($chunk); + } + } + + /** + * @param string[] $chunks + * @return float[][] + */ + protected function chunksToEmbeddings(array $chunks, LlmQueryService $vectorQueryService): array + { + $embeddings = []; + foreach ($chunks as $index => $chunk) { + $embeddings[$index] = $vectorQueryService->generateEmbeddings($chunk); + } + return $embeddings; + } + + /** + * @return string[] + */ + protected function chunkText(string $text): array + { + return (new TextChunker(500, ["\n", '.', ' ', '']))->chunk($text); + } + + protected function entityToPlainText(Entity $entity): string + { + $tags = $entity->tags()->get(); + $tagText = $tags->map(function (Tag $tag) { + return $tag->name . ': ' . $tag->value; + })->join("\n"); + + return $entity->name . "\n{$tagText}\n" . $entity->{$entity->textField}; + } +} diff --git a/app/Search/Queries/LlmQueryRunner.php b/app/Search/Queries/LlmQueryRunner.php new file mode 100644 index 00000000000..598d6183b1b --- /dev/null +++ b/app/Search/Queries/LlmQueryRunner.php @@ -0,0 +1,40 @@ +vectorQueryServiceProvider->get(); + + return $queryService->queryToSearchTerms($query); + } + + /** + * Run a query against the configured LLM to produce a text response. 
+ * @param Entity[] $searchResults + * @throws Exception + */ + public function run(string $query, array $searchResults): string + { + $queryService = $this->vectorQueryServiceProvider->get(); + return $queryService->query($query, $searchResults); + } +} diff --git a/app/Search/Queries/LlmQueryServiceProvider.php b/app/Search/Queries/LlmQueryServiceProvider.php new file mode 100644 index 00000000000..99b88e7592f --- /dev/null +++ b/app/Search/Queries/LlmQueryServiceProvider.php @@ -0,0 +1,38 @@ +getServiceName(); + + if ($service === 'openai') { + return new OpenAiLlmQueryService(config('services.openai'), $this->http); + } + + throw new \Exception("No '{$service}' LLM service found"); + } + + protected static function getServiceName(): string + { + return strtolower(config('services.llm')); + } + + public static function isEnabled(): bool + { + return !empty(static::getServiceName()); + } +} diff --git a/app/Search/Queries/QueryController.php b/app/Search/Queries/QueryController.php new file mode 100644 index 00000000000..e072a2ace5e --- /dev/null +++ b/app/Search/Queries/QueryController.php @@ -0,0 +1,65 @@ +middleware(function ($request, $next) { + if (!LlmQueryServiceProvider::isEnabled()) { + $this->showPermissionError('/'); + } + return $next($request); + }); + } + + /** + * Show the view to start a vector/LLM-based query search. + */ + public function show(Request $request) + { + $query = $request->get('ask', ''); + + // TODO - Set page title + + return view('search.query', [ + 'query' => $query, + ]); + } + + /** + * Perform an LLM-based query search. 
+ */ + public function run(Request $request, LlmQueryRunner $llmRunner) + { + // TODO - Rate limiting + $query = $request->get('query', ''); + + return response()->eventStream(function () use ($query, $llmRunner) { + + $searchTerms = $llmRunner->queryToSearchTerms($query); + $searchOptions = SearchOptions::fromTermArray($searchTerms); + $searchResults = $this->searchRunner->searchEntities($searchOptions, count: 10)['results']; + + $entities = []; + foreach ($searchResults as $entity) { + $entityKey = $entity->getMorphClass() . ':' . $entity->id; + if (!isset($entities[$entityKey])) { + $entities[$entityKey] = $entity; + } + } + + yield ['view' => view('entities.list', ['entities' => $entities])->render()]; + + yield ['result' => $llmRunner->run($query, array_values($entities))]; + }); + } +} diff --git a/app/Search/Queries/Services/LlmQueryService.php b/app/Search/Queries/Services/LlmQueryService.php new file mode 100644 index 00000000000..ec1cf04a0e0 --- /dev/null +++ b/app/Search/Queries/Services/LlmQueryService.php @@ -0,0 +1,25 @@ +key = $this->options['key'] ?? ''; + $this->endpoint = $this->options['endpoint'] ?? ''; + $this->embeddingModel = $this->options['embedding_model'] ?? ''; + $this->queryModel = $this->options['query_model'] ?? ''; + } + + protected function jsonRequest(string $method, string $uri, array $data): array + { + $fullUrl = rtrim($this->endpoint, '/') . '/' . ltrim($uri, '/'); + $client = $this->http->buildClient(60); + $request = $this->http->jsonRequest($method, $fullUrl, $data) + ->withHeader('Authorization', 'Bearer ' . 
$this->key); + + $response = $client->sendRequest($request); + return json_decode($response->getBody()->getContents(), true); + } + + public function generateEmbeddings(string $text): array + { + $response = $this->jsonRequest('POST', 'v1/embeddings', [ + 'input' => $text, + 'model' => $this->embeddingModel, + ]); + + return $response['data'][0]['embedding']; + } + + public function queryToSearchTerms(string $text): array + { + $response = $this->jsonRequest('POST', 'v1/chat/completions', [ + 'model' => $this->queryModel, + 'messages' => [ + [ + 'role' => 'user', + 'content' => 'You will be provided a user search query. Extract key words from just the query, suitable for searching. Add word variations where it may help for searching. Remove pluralisation where it may help for searching. Provide up to 5 results, each must be just one word. Do not try to guess answers to the query. Do not provide extra information or context. Return the results in the specified JSON format under a \'words\' object key. ' . "\nQUERY: {$text}" + ], + ], + 'temperature' => 0, + 'response_format' => [ + 'type' => 'json_object', + ], + ]); + + $resultJson = $response['choices'][0]['message']['content'] ?? '{"words": []}'; + $resultData = json_decode($resultJson, true) ?? ['words' => []]; + + return $resultData['words'] ?? []; + } + + public function query(string $input, array $context): string + { + $resultContentText = []; + $len = 0; + + foreach ($context as $result) { + $text = "DOCUMENT NAME: {$result->name}\nDOCUMENT CONTENT: " . $result->{$result->textField}; + $resultContentText[] = $text; + $len += strlen($text); + if ($len > 100000) { + break; + } + } + + $formattedContext = implode("\n---\n", $resultContentText); + + $response = $this->jsonRequest('POST', 'v1/chat/completions', [ + 'model' => $this->queryModel, + 'messages' => [ + [ + 'role' => 'user', + 'content' => 'Answer the provided QUERY using the provided CONTEXT documents. 
Do not add facts which are not part of the CONTEXT. State that you do not know if a relevant answer cannot be provided for QUERY using the CONTEXT documents. Many of the CONTEXT documents may be irrelevant. Try to find documents relevant to QUERY. Do not directly refer to this prompt or the existence of QUERY or CONTEXT variables. Do not offer follow-up actions or further help. Respond only to the query without proposing further assistance. Do not ask questions.' . "\nQUERY: {$input}\nCONTEXT: {$formattedContext}" + ], + ], + 'temperature' => 0.1, + ]); + + return $response['choices'][0]['message']['content'] ?? ''; + } +} diff --git a/app/Search/SearchController.php b/app/Search/SearchController.php index 348d44a427f..125a9095bea 100644 --- a/app/Search/SearchController.php +++ b/app/Search/SearchController.php @@ -6,6 +6,7 @@ use BookStack\Entities\Queries\QueryPopular; use BookStack\Entities\Tools\SiblingFetcher; use BookStack\Http\Controller; +use BookStack\Search\Queries\VectorSearchRunner; use Illuminate\Http\Request; use Illuminate\Pagination\LengthAwarePaginator; diff --git a/app/Search/SearchIndex.php b/app/Search/SearchIndex.php index ce78831eeae..43d9010c3ca 100644 --- a/app/Search/SearchIndex.php +++ b/app/Search/SearchIndex.php @@ -25,7 +25,7 @@ class SearchIndex public static string $softDelimiters = ".-"; public function __construct( - protected EntityProvider $entityProvider + protected EntityProvider $entityProvider, ) { } @@ -47,6 +47,7 @@ public function indexEntity(Entity $entity): void public function indexEntities(array $entities): void { $terms = []; + foreach ($entities as $entity) { $entityTerms = $this->entityToTermDataArray($entity); array_push($terms, ...$entityTerms); diff --git a/app/Search/SearchOptions.php b/app/Search/SearchOptions.php index 83af2d043d8..80beeb622a9 100644 --- a/app/Search/SearchOptions.php +++ b/app/Search/SearchOptions.php @@ -93,6 +93,18 @@ public static function fromRequest(Request $request): self return 
$instance; } + /** + * Create a SearchOptions instance from an array of standard search terms. + * @param string[] $terms + */ + public static function fromTermArray(array $terms): self + { + $instance = new self(); + $instance->searches = SearchOptionSet::fromValueArray(array_values(array_filter($terms)), TermSearchOption::class); + $instance->limitOptions(); + return $instance; + } + /** * Decode a search string and add its contents to this instance. */ diff --git a/database/migrations/2025_03_24_155748_create_search_vectors_table.php b/database/migrations/2025_03_24_155748_create_search_vectors_table.php new file mode 100644 index 00000000000..0ae67c2256f --- /dev/null +++ b/database/migrations/2025_03_24_155748_create_search_vectors_table.php @@ -0,0 +1,37 @@ +string('entity_type', 100); + $table->integer('entity_id'); + $table->text('text'); + + $table->index(['entity_type', 'entity_id']); + }); + + $table = DB::getTablePrefix() . 'search_vectors'; + + // TODO - Vector size might need to be dynamic + DB::statement("ALTER TABLE {$table} ADD COLUMN (embedding VECTOR(1536) NOT NULL)"); + DB::statement("ALTER TABLE {$table} ADD VECTOR INDEX (embedding) DISTANCE=cosine"); + } + + /** + * Reverse the migrations. 
+ */ + public function down(): void + { + Schema::dropIfExists('search_vectors'); + } +}; diff --git a/package-lock.json b/package-lock.json index e8a1493d42f..a620e1bf9f8 100644 --- a/package-lock.json +++ b/package-lock.json @@ -23,6 +23,7 @@ "@ssddanbrown/codemirror-lang-twig": "^1.0.0", "@types/jest": "^30.0.0", "codemirror": "^6.0.2", + "eventsource-client": "^1.1.4", "idb-keyval": "^6.2.2", "markdown-it": "^14.1.0", "markdown-it-task-lists": "^2.1.1", @@ -4797,6 +4798,27 @@ "node": ">=0.10.0" } }, + "node_modules/eventsource-client": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/eventsource-client/-/eventsource-client-1.2.0.tgz", + "integrity": "sha512-kDI75RSzO3TwyG/K9w1ap8XwqSPcwi6jaMkNulfVeZmSeUM49U8kUzk1s+vKNt0tGrXgK47i+620Yasn1ccFiw==", + "license": "MIT", + "dependencies": { + "eventsource-parser": "^3.0.0" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/eventsource-parser": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.6.tgz", + "integrity": "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==", + "license": "MIT", + "engines": { + "node": ">=18.0.0" + } + }, "node_modules/execa": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/execa/-/execa-5.1.1.tgz", diff --git a/package.json b/package.json index 624ff876a3a..cd5bc8d2bd0 100644 --- a/package.json +++ b/package.json @@ -53,6 +53,7 @@ "@ssddanbrown/codemirror-lang-twig": "^1.0.0", "@types/jest": "^30.0.0", "codemirror": "^6.0.2", + "eventsource-client": "^1.1.4", "idb-keyval": "^6.2.2", "markdown-it": "^14.1.0", "markdown-it-task-lists": "^2.1.1", diff --git a/resources/js/components/index.ts b/resources/js/components/index.ts index 736d93f0595..11a012e85fd 100644 --- a/resources/js/components/index.ts +++ b/resources/js/components/index.ts @@ -45,6 +45,7 @@ export {PagePicker} from './page-picker'; export {PermissionsTable} from 
'./permissions-table'; export {Pointer} from './pointer'; export {Popup} from './popup'; +export {QueryManager} from './query-manager'; export {SettingAppColorScheme} from './setting-app-color-scheme'; export {SettingColorPicker} from './setting-color-picker'; export {SettingHomepageControl} from './setting-homepage-control'; diff --git a/resources/js/components/query-manager.ts b/resources/js/components/query-manager.ts new file mode 100644 index 00000000000..91bd63a2293 --- /dev/null +++ b/resources/js/components/query-manager.ts @@ -0,0 +1,77 @@ +import {Component} from "./component"; + +export class QueryManager extends Component { + protected input!: HTMLTextAreaElement; + protected generatedLoading!: HTMLElement; + protected generatedDisplay!: HTMLElement; + protected contentLoading!: HTMLElement; + protected contentDisplay!: HTMLElement; + protected form!: HTMLFormElement; + protected fieldset!: HTMLFieldSetElement; + + setup() { + this.input = this.$refs.input as HTMLTextAreaElement; + this.form = this.$refs.form as HTMLFormElement; + this.fieldset = this.$refs.fieldset as HTMLFieldSetElement; + this.generatedLoading = this.$refs.generatedLoading; + this.generatedDisplay = this.$refs.generatedDisplay; + this.contentLoading = this.$refs.contentLoading; + this.contentDisplay = this.$refs.contentDisplay; + + this.setupListeners(); + + // Start lookup if a query is set + if (this.input.value.trim() !== '') { + this.runQuery(); + } + } + + protected setupListeners(): void { + // Handle form submission + this.form.addEventListener('submit', event => { + event.preventDefault(); + this.runQuery(); + }); + + // Allow Ctrl+Enter to run a query + this.input.addEventListener('keydown', event => { + if (event.key === 'Enter' && event.ctrlKey && this.input.value.trim() !== '') { + this.runQuery(); + } + }); + } + + protected async runQuery(): Promise { + this.contentLoading.hidden = false; + this.generatedLoading.hidden = false; + this.contentDisplay.innerHTML = ''; + 
this.generatedDisplay.innerHTML = ''; + this.fieldset.disabled = true; + + const query = this.input.value.trim(); + const url = new URL(window.location.href); + url.searchParams.set('ask', query); + window.history.pushState({}, '', url.toString()); + + const es = window.$http.eventSource('/query', 'POST', {query}); + + let messageCount = 0; + for await (const {data, event, id} of es) { + messageCount++; + if (messageCount === 1) { + // Entity results + this.contentDisplay.innerHTML = JSON.parse(data).view; + this.contentLoading.hidden = true; + } else if (messageCount === 2) { + // LLM Output + this.generatedDisplay.innerText = JSON.parse(data).result; + this.generatedLoading.hidden = true; + } else { + es.close(); + break; + } + } + + this.fieldset.disabled = false; + } +} \ No newline at end of file diff --git a/resources/js/services/http.ts b/resources/js/services/http.ts index f9eaafc3912..663071577c6 100644 --- a/resources/js/services/http.ts +++ b/resources/js/services/http.ts @@ -1,3 +1,5 @@ +import {createEventSource, EventSourceClient} from "eventsource-client"; + type ResponseData = Record|string; type RequestOptions = { @@ -59,7 +61,6 @@ export class HttpManager { } createXMLHttpRequest(method: string, url: string, events: Record void> = {}): XMLHttpRequest { - const csrfToken = document.querySelector('meta[name=token]')?.getAttribute('content'); const req = new XMLHttpRequest(); for (const [eventName, callback] of Object.entries(events)) { @@ -68,7 +69,7 @@ export class HttpManager { req.open(method, url); req.withCredentials = true; - req.setRequestHeader('X-CSRF-TOKEN', csrfToken || ''); + req.setRequestHeader('X-CSRF-TOKEN', this.getCSRFToken()); return req; } @@ -95,12 +96,11 @@ export class HttpManager { requestUrl = urlObj.toString(); } - const csrfToken = document.querySelector('meta[name=token]')?.getAttribute('content') || ''; const requestOptions: RequestInit = {...options, credentials: 'same-origin'}; requestOptions.headers = { 
...requestOptions.headers || {}, baseURL: window.baseUrl(''), - 'X-CSRF-TOKEN': csrfToken, + 'X-CSRF-TOKEN': this.getCSRFToken(), }; const response = await fetch(requestUrl, requestOptions); @@ -191,6 +191,32 @@ export class HttpManager { return this.dataRequest('DELETE', url, data); } + eventSource(url: string, method: string = 'GET', body: object = {}): EventSourceClient { + if (!url.startsWith('http')) { + url = window.baseUrl(url); + } + + const es = createEventSource({ + url, + method, + body: JSON.stringify(body), + credentials: 'same-origin', + headers: { + 'Content-Type': 'application/json', + 'X-CSRF-TOKEN': this.getCSRFToken(), + }, + onDisconnect: () => { + es.close(); + } + }); + + return es; + } + + protected getCSRFToken(): string { + return document.querySelector('meta[name=token]')?.getAttribute('content') || ''; + } + /** * Parse the response text for an error response to a user * presentable string. Handles a range of errors responses including diff --git a/resources/sass/_forms.scss b/resources/sass/_forms.scss index 13a4232fc7e..ea453b96ed7 100644 --- a/resources/sass/_forms.scss +++ b/resources/sass/_forms.scss @@ -601,3 +601,29 @@ input.shortcut-input { max-width: 120px; height: auto; } + +.query-form { + display: flex; + flex-direction: row; + gap: vars.$m; + textarea { + font-size: 1.4rem; + height: 100px; + box-shadow: vars.$bs-card; + border-radius: 8px; + color: #444; + } + button { + align-self: start; + margin: 0; + font-size: 1.6rem; + } + button:disabled { + opacity: 0.5; + cursor: not-allowed; + } + textarea:disabled { + opacity: 0.5; + cursor: not-allowed; + } +} \ No newline at end of file diff --git a/resources/views/search/query.blade.php b/resources/views/search/query.blade.php new file mode 100644 index 00000000000..3293c0ddc9b --- /dev/null +++ b/resources/views/search/query.blade.php @@ -0,0 +1,52 @@ +@extends('layouts.simple') + +@section('body') +
+ +
+

Start a Query

+
+
+ + +
+
+
+ +
+

Generated Response

+ +

+ + When you run a query, the relevant content found & shown below will be used to help generate a smart machine generated response. + +

+
+ + +
+

Relevant Content

+ +
+
+

+ Start a query to find relevant matching content. + The items shown here reflect those used to help provide the above response. +

+
+
+
+
+@stop diff --git a/routes/web.php b/routes/web.php index a20c0a3d3d0..ea89fc76e22 100644 --- a/routes/web.php +++ b/routes/web.php @@ -11,6 +11,7 @@ use BookStack\Http\Middleware\VerifyCsrfToken; use BookStack\Permissions\PermissionsController; use BookStack\References\ReferenceController; +use BookStack\Search\Queries\QueryController; use BookStack\Search\SearchController; use BookStack\Settings as SettingControllers; use BookStack\Sorting as SortingControllers; @@ -196,6 +197,11 @@ Route::get('/search/entity-selector-templates', [SearchController::class, 'templatesForSelector']); Route::get('/search/suggest', [SearchController::class, 'searchSuggestions']); + // Queries + Route::get('/query', [QueryController::class, 'show']); + Route::get('/query/run', [QueryController::class, 'run']); // TODO - Development only, remove + Route::post('/query', [QueryController::class, 'run']); + // User Search Route::get('/search/users/select', [UserControllers\UserSearchController::class, 'forSelect']); Route::get('/search/users/mention', [UserControllers\UserSearchController::class, 'forMentions']); diff --git a/tests/Search/TextChunkerTest.php b/tests/Search/TextChunkerTest.php new file mode 100644 index 00000000000..c742c4a6402 --- /dev/null +++ b/tests/Search/TextChunkerTest.php @@ -0,0 +1,47 @@ +chunk('123456789'); + + $this->assertEquals(['123', '456', '789'], $chunks); + } + + public function test_chunk_size_must_be_greater_than_zero() + { + $this->expectException(\InvalidArgumentException::class); + $chunker = new TextChunker(-5, []); + } + + public function test_it_works_through_given_delimiters() + { + $chunker = new TextChunker(5, ['-', '.', '']); + $chunks = $chunker->chunk('12-3456.789abcdefg'); + + $this->assertEquals(['12', '3456', '789ab', 'cdefg'], $chunks); + } + + public function test_it_attempts_to_pack_chunks() + { + $chunker = new TextChunker(8, [' ', '']); + $chunks = $chunker->chunk('123 456 789 abc def'); + + $this->assertEquals(['123 456', '789 abc', 
'def'], $chunks); + } + + public function test_it_attempts_to_pack_using_subchunks() + { + $chunker = new TextChunker(8, [' ', '-', '']); + $chunks = $chunker->chunk('123 456-789abc'); + + $this->assertEquals(['123 456', '789abc'], $chunks); + } +}