From be8de118db913711eb72ae5187d26e54a0055727 Mon Sep 17 00:00:00 2001 From: 简律纯 Date: Fri, 15 Dec 2023 09:11:47 +0800 Subject: refactor(docs): optmst `docs` dir & `deps` --- .../integrations/search/internal/.eslintrc | 6 + .../integrations/search/internal/_/index.ts | 74 ++++++++++ .../integrations/search/internal/extract/index.ts | 107 ++++++++++++++ .../search/internal/highlight/index.ts | 162 +++++++++++++++++++++ .../integrations/search/internal/index.ts | 26 ++++ .../integrations/search/internal/tokenize/index.ts | 136 +++++++++++++++++ 6 files changed, 511 insertions(+) create mode 100644 docs/src/templates/assets/javascripts/integrations/search/internal/.eslintrc create mode 100644 docs/src/templates/assets/javascripts/integrations/search/internal/_/index.ts create mode 100644 docs/src/templates/assets/javascripts/integrations/search/internal/extract/index.ts create mode 100644 docs/src/templates/assets/javascripts/integrations/search/internal/highlight/index.ts create mode 100644 docs/src/templates/assets/javascripts/integrations/search/internal/index.ts create mode 100644 docs/src/templates/assets/javascripts/integrations/search/internal/tokenize/index.ts (limited to 'docs/src/templates/assets/javascripts/integrations/search/internal') diff --git a/docs/src/templates/assets/javascripts/integrations/search/internal/.eslintrc b/docs/src/templates/assets/javascripts/integrations/search/internal/.eslintrc new file mode 100644 index 00000000..9368ceb6 --- /dev/null +++ b/docs/src/templates/assets/javascripts/integrations/search/internal/.eslintrc @@ -0,0 +1,6 @@ +{ + "rules": { + "no-fallthrough": "off", + "no-underscore-dangle": "off" + } +} diff --git a/docs/src/templates/assets/javascripts/integrations/search/internal/_/index.ts b/docs/src/templates/assets/javascripts/integrations/search/internal/_/index.ts new file mode 100644 index 00000000..ae8f6104 --- /dev/null +++ b/docs/src/templates/assets/javascripts/integrations/search/internal/_/index.ts @@ -0,0 
+1,74 @@ +/* + * Copyright (c) 2016-2023 Martin Donath + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +/* ---------------------------------------------------------------------------- + * Helper types + * ------------------------------------------------------------------------- */ + +/** + * Visitor function + * + * @param start - Start offset + * @param end - End offset (exclusive) + */ +type VisitorFn = ( + start: number, end: number +) => void + +/* ---------------------------------------------------------------------------- + * Functions + * ------------------------------------------------------------------------- */ + +/** + * Split a string at separator matches and visit each non-empty range + * + * @param input - Input value + * @param separator - Separator (re-created with only the `g` flag set) + * @param fn - Visitor function, called with the offsets of each range + */ +export function split( + input: string, separator: RegExp, fn: VisitorFn +): void { + separator = new RegExp(separator, "g") + + /* Split string using separator */ + let match: RegExpExecArray | null + let index = 0 + do { + match = separator.exec(input) + + /* Emit non-empty range up to the match or the end of the input */ + const until = match?.index ?? input.length + if (index < until) + fn(index, until) + + /* Continue after the matched separator */ + if (match) { + const [term] = match + index = match.index + term.length + + /* Support zero-length lookaheads by stepping past the match */ + if (term.length === 0) + separator.lastIndex = match.index + 1 + } + } while (match) +} diff --git a/docs/src/templates/assets/javascripts/integrations/search/internal/extract/index.ts b/docs/src/templates/assets/javascripts/integrations/search/internal/extract/index.ts new file mode 100644 index 00000000..2a98b9e1 --- /dev/null +++ b/docs/src/templates/assets/javascripts/integrations/search/internal/extract/index.ts @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2016-2023 Martin Donath + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, 
and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* ---------------------------------------------------------------------------- + * Types + * ------------------------------------------------------------------------- */ + +/** + * Extraction type + * + * This type defines the possible values that are encoded into the first two + * bits of a section that is part of the blocks of a tokenization table. There + * are three types of interest: HTML opening and closing tags, as well as the + * actual text content we need to extract for indexing. 
+ */ +export const enum Extract { + TAG_OPEN = 0, /* HTML opening tag */ + TEXT = 1, /* Text content */ + TAG_CLOSE = 2 /* HTML closing tag */ +} + +/* ---------------------------------------------------------------------------- + * Helper types + * ------------------------------------------------------------------------- */ + +/** + * Visitor function + * + * @param block - Block index + * @param type - Extraction type + * @param start - Start offset + * @param end - End offset (exclusive) + */ +type VisitorFn = ( + block: number, type: Extract, start: number, end: number +) => void + +/* ---------------------------------------------------------------------------- + * Functions + * ------------------------------------------------------------------------- */ + +/** + * Split a string into markup and text sections + * + * This function scans a string and divides it up into sections of markup and + * text. For each section, it invokes the given visitor function with the block + * index, extraction type, as well as start and end offsets. Using a visitor + * function (= streaming data) is ideal for minimizing pressure on the GC. 
+ * + * @param input - Input value + * @param fn - Visitor function + */ +export function extract( + input: string, fn: VisitorFn +): void { + + let block = 0 /* Current block */ + let start = 0 /* Current start offset */ + let end = 0 /* Current end offset */ + + /* Split string into sections, tracking tag nesting depth in `stack` */ + for (let stack = 0; end < input.length; end++) { + + /* Opening tag after non-empty section: emit the preceding text */ + if (input.charAt(end) === "<" && end > start) { + fn(block, Extract.TEXT, start, start = end) + + /* End of tag - closing tags are emitted when nesting drops to zero */ + } else if (input.charAt(end) === ">") { + if (input.charAt(start + 1) === "/") { + if (--stack === 0) + fn(block++, Extract.TAG_CLOSE, start, end + 1) + + /* Opening tag (not self-closing), emitted only at the outermost level */ + } else if (input.charAt(end - 1) !== "/") { + if (stack++ === 0) + fn(block, Extract.TAG_OPEN, start, end + 1) + } + + /* New section starts after the tag */ + start = end + 1 + } + } + + /* Add trailing text section */ + if (end > start) + fn(block, Extract.TEXT, start, end) +} diff --git a/docs/src/templates/assets/javascripts/integrations/search/internal/highlight/index.ts b/docs/src/templates/assets/javascripts/integrations/search/internal/highlight/index.ts new file mode 100644 index 00000000..7cc3bf1a --- /dev/null +++ b/docs/src/templates/assets/javascripts/integrations/search/internal/highlight/index.ts @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2016-2023 Martin Donath + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* ---------------------------------------------------------------------------- + * Types + * ------------------------------------------------------------------------- */ + +/** + * Position table (per block, entries of `offset << 12 | length << 2 | type`) + */ +export type PositionTable = number[][] + +/** + * Position (`block << 20 | index` into the position table) + */ +export type Position = number + +/* ---------------------------------------------------------------------------- + * Functions + * ------------------------------------------------------------------------- */ + +/** + * Highlight all occurrences in a string + * + * This function receives a field's value (e.g. like `title` or `text`), its + * position table that was generated during indexing, and the positions found + * when executing the query. It then highlights all occurrences, and returns + * their concatenation. In case of multiple blocks, two are returned. + * + * @param input - Input value + * @param table - Table for indexing + * @param positions - Occurrences + * @param full - Full results (keep the text around the highlighted blocks) + * + * @returns Highlighted string value (occurrences wrapped in `<mark>`) + */ +export function highlight( + input: string, table: PositionTable, positions: Position[], full = false +): string { + return highlightAll([input], table, positions, full).pop()! 
+} + +/** + * Highlight all occurrences in a set of strings + * + * @param inputs - Input values + * @param table - Table for indexing + * @param positions - Occurrences + * @param full - Full results (keep the text around the highlighted blocks) + * + * @returns Highlighted string values (occurrences wrapped in `<mark>`) + */ +export function highlightAll( + inputs: string[], table: PositionTable, positions: Position[], full = false +): string[] { + + /* Map blocks to input values */ + const mapping = [0] + for (let t = 1; t < table.length; t++) { + const prev = table[t - 1] + const next = table[t] + + /* Check if table points to new block */ + const p = prev[prev.length - 1] >>> 2 & 0x3FF + const q = next[0] >>> 12 + + /* Add block to mapping */ + mapping.push(+(p > q) + mapping[mapping.length - 1]) + } + + /* Highlight strings one after another */ + return inputs.map((input, i) => { + let cursor = 0 + + /* Map occurrences to blocks (note: sorts `positions` in place) */ + const blocks = new Map<number, number[]>() + for (const p of positions.sort((a, b) => a - b)) { + const index = p & 0xFFFFF + const block = p >>> 20 + if (mapping[block] !== i) + continue + + /* Ensure presence of block group */ + let group = blocks.get(block) + if (typeof group === "undefined") + blocks.set(block, group = []) + + /* Add index to group */ + group.push(index) + } + + /* Just return string, if no occurrences */ + if (blocks.size === 0) + return input + + /* Compute slices */ + const slices: string[] = [] + for (const [block, indexes] of blocks) { + const t = table[block] + + /* Extract positions and length */ + const start = t[0] >>> 12 + const end = t[t.length - 1] >>> 12 + const length = t[t.length - 1] >>> 2 & 0x3FF + + /* Add prefix, if full results are desired */ + if (full && start > cursor) + slices.push(input.slice(cursor, start)) + + /* Extract and highlight slice (back to front, so offsets stay valid) */ + let slice = input.slice(start, end + length) + for (const j of indexes.sort((a, b) => b - a)) { + + /* Retrieve offset and length of match */ + const p = (t[j] >>> 12) - start + const q = (t[j] >>> 2 & 0x3FF) + p + + /* Wrap occurrence in a `mark` element 
*/ + slice = [ + slice.slice(0, p), + "<mark>", + slice.slice(p, q), + "</mark>", + slice.slice(q) + ].join("") + } + + /* Update cursor */ + cursor = end + length + + /* Append slice and abort if we have two */ + if (slices.push(slice) === 2) + break + } + + /* Add suffix, if full results are desired */ + if (full && cursor < input.length) + slices.push(input.slice(cursor)) + + /* Return highlighted slices */ + return slices.join("") + }) +} diff --git a/docs/src/templates/assets/javascripts/integrations/search/internal/index.ts b/docs/src/templates/assets/javascripts/integrations/search/internal/index.ts new file mode 100644 index 00000000..c752329e --- /dev/null +++ b/docs/src/templates/assets/javascripts/integrations/search/internal/index.ts @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2016-2023 Martin Donath + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +export * from "./_" +export * from "./extract" +export * from "./highlight" +export * from "./tokenize" diff --git a/docs/src/templates/assets/javascripts/integrations/search/internal/tokenize/index.ts b/docs/src/templates/assets/javascripts/integrations/search/internal/tokenize/index.ts new file mode 100644 index 00000000..f5089bc9 --- /dev/null +++ b/docs/src/templates/assets/javascripts/integrations/search/internal/tokenize/index.ts @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2016-2023 Martin Donath + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +import { split } from "../_" +import { + Extract, + extract +} from "../extract" + +/* ---------------------------------------------------------------------------- + * Functions + * ------------------------------------------------------------------------- */ + +/** + * Split a string or set of strings into tokens + * + * This tokenizer supersedes the default tokenizer that is provided by Lunr.js, + * as it is aware of HTML tags and allows for multi-character splitting. + * + * It takes the given inputs, splits each of them into markup and text sections, + * tokenizes and segments (if necessary) each of them, and then indexes them in + * a table by using a compact bit representation. Bitwise techniques are used + * to write and read from the table during indexing and querying. + * + * @see https://bit.ly/3W3Xw4J - Search: better, faster, smaller + * + * @param input - Input value(s); `undefined` yields no tokens + * + * @returns Tokens, each carrying a `position` of `block << 20 | index` + */ +export function tokenize( + input?: string | string[] +): lunr.Token[] { + const tokens: lunr.Token[] = [] + if (typeof input === "undefined") + return tokens + + /* Tokenize strings one after another */ + const inputs = Array.isArray(input) ? input : [input] + for (let i = 0; i < inputs.length; i++) { + const table = lunr.tokenizer.table + const total = table.length + + /* Split string into sections and tokenize content blocks */ + extract(inputs[i], (block, type, start, end) => { + table[block += total] ||= [] + switch (type) { + + /* Handle markup: record offset, length and type, but emit no token */ + case Extract.TAG_OPEN: + case Extract.TAG_CLOSE: + table[block].push( + start << 12 | + end - start << 2 | + type + ) + break + + /* Handle text content */ + case Extract.TEXT: + const section = inputs[i].slice(start, end) + split(section, lunr.tokenizer.separator, (index, until) => { + + /** + * Apply segmenter after tokenization. Note that the segmenter will + * also split words at word boundaries, which is not what we want, + * so we need to check if we can somehow mitigate this behavior. 
+ */ + if (typeof lunr.segmenter !== "undefined") { + const subsection = section.slice(index, until) + if (/^[MHIK]$/.test(lunr.segmenter.ctype_(subsection))) { + const segments = lunr.segmenter.segment(subsection) + for (let s = 0, l = 0; s < segments.length; s++) { + + /* Add section to block */ + table[block] ||= [] + table[block].push( + start + index + l << 12 | + segments[s].length << 2 | + type + ) + + /* Add token with position */ + tokens.push(new lunr.Token( + segments[s].toLowerCase(), { + position: block << 20 | table[block].length - 1 + } + )) + + /* Keep track of consumed length to offset the next segment */ + l += segments[s].length + } + return + } + } + + /* Add section to block */ + table[block].push( + start + index << 12 | + until - index << 2 | + type + ) + + /* Add token with position */ + tokens.push(new lunr.Token( + section.slice(index, until).toLowerCase(), { + position: block << 20 | table[block].length - 1 + } + )) + }) + } + }) + } + + /* Return tokens */ + return tokens +} -- cgit v1.2.3-70-g09d2