diff options
| author | 2023-10-07 06:48:07 +0800 | |
|---|---|---|
| committer | 2023-10-07 06:48:07 +0800 | |
| commit | 991fd7a6d67ee017c57beaaa21fc31c4bee7944d (patch) | |
| tree | e895202203fcaa50b0052f60ef6fc7d6d2928cf9 /src/templates/assets/javascripts/integrations/search/query | |
| parent | d62900046bb6f754a8e6e7e670a66a90134055d9 (diff) | |
| download | infini-991fd7a6d67ee017c57beaaa21fc31c4bee7944d.tar.gz infini-991fd7a6d67ee017c57beaaa21fc31c4bee7944d.zip | |
feat(version): versions
Diffstat (limited to 'src/templates/assets/javascripts/integrations/search/query')
5 files changed, 383 insertions, 0 deletions
diff --git a/src/templates/assets/javascripts/integrations/search/query/.eslintrc b/src/templates/assets/javascripts/integrations/search/query/.eslintrc new file mode 100644 index 00000000..3031c7e3 --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/query/.eslintrc @@ -0,0 +1,6 @@ +{ + "rules": { + "no-control-regex": "off", + "@typescript-eslint/no-explicit-any": "off" + } +} diff --git a/src/templates/assets/javascripts/integrations/search/query/_/index.ts b/src/templates/assets/javascripts/integrations/search/query/_/index.ts new file mode 100644 index 00000000..14482e43 --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/query/_/index.ts @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +import { split } from "../../internal" +import { transform } from "../transform" + +/* ---------------------------------------------------------------------------- + * Types + * ------------------------------------------------------------------------- */ + +/** + * Search query clause + */ +export interface SearchQueryClause { + presence: lunr.Query.presence /* Clause presence */ + term: string /* Clause term */ +} + +/* ------------------------------------------------------------------------- */ + +/** + * Search query terms + */ +export type SearchQueryTerms = Record<string, boolean> + +/* ---------------------------------------------------------------------------- + * Functions + * ------------------------------------------------------------------------- */ + +/** + * Transform search query + * + * This function lexes the given search query and applies the transformation + * function to each term, preserving markup like `+` and `-` modifiers. + * + * @param query - Search query + * + * @returns Search query + */ +export function transformSearchQuery( + query: string +): string { + + /* Split query terms with tokenizer */ + return transform(query, part => { + const terms: string[] = [] + + /* Initialize lexer and analyze part */ + const lexer = new lunr.QueryLexer(part) + lexer.run() + + /* Extract and tokenize term from lexeme */ + for (const { type, str: term, start, end } of lexer.lexemes) + switch (type) { + + /* Hack: remove colon - see https://bit.ly/3wD3T3I */ + case "FIELD": + if (!["title", "text", "tags"].includes(term)) + part = [ + part.slice(0, end), + " ", + part.slice(end + 1) + ].join("") + break + + /* Tokenize term */ + case "TERM": + split(term, lunr.tokenizer.separator, (...range) => { + terms.push([ + part.slice(0, start), + term.slice(...range), + part.slice(end) + ].join("")) + }) + } + + /* Return terms */ + return terms + }) +} + +/* ------------------------------------------------------------------------- */ + +/** + * Parse a search query for analysis + * + * Lunr.js itself has a bug where it doesn't detect or remove wildcards for + * query clauses, so we must do this here. + * + * @see https://bit.ly/3DpTGtz - GitHub issue + * + * @param value - Query value + * + * @returns Search query clauses + */ +export function parseSearchQuery( + value: string +): SearchQueryClause[] { + const query = new lunr.Query(["title", "text", "tags"]) + const parser = new lunr.QueryParser(value, query) + + /* Parse Search query */ + parser.parse() + for (const clause of query.clauses) { + clause.usePipeline = true + + /* Handle leading wildcard */ + if (clause.term.startsWith("*")) { + clause.wildcard = lunr.Query.wildcard.LEADING + clause.term = clause.term.slice(1) + } + + /* Handle trailing wildcard */ + if (clause.term.endsWith("*")) { + clause.wildcard = lunr.Query.wildcard.TRAILING + clause.term = clause.term.slice(0, -1) + } + } + + /* Return query clauses */ + return query.clauses +} + +/** + * Analyze the search query clauses in regard to the search terms found + * + * @param query - Search query clauses + * @param terms - Search terms + * + * @returns Search query terms + */ +export function getSearchQueryTerms( + query: SearchQueryClause[], terms: string[] +): SearchQueryTerms { + const clauses = new Set<SearchQueryClause>(query) + + /* Match query clauses against terms */ + const result: SearchQueryTerms = {} + for (let t = 0; t < terms.length; t++) + for (const clause of clauses) + if (terms[t].startsWith(clause.term)) { + result[clause.term] = true + clauses.delete(clause) + } + + /* Annotate unmatched non-stopword query clauses */ + for (const clause of clauses) + if (lunr.stopWordFilter?.(clause.term)) + result[clause.term] = false + + /* Return query terms */ + return result +} diff --git a/src/templates/assets/javascripts/integrations/search/query/index.ts b/src/templates/assets/javascripts/integrations/search/query/index.ts new file mode 100644 index 00000000..763e2fd4 --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/query/index.ts @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +export * from "./_" +export * from "./segment" +export * from "./transform" diff --git a/src/templates/assets/javascripts/integrations/search/query/segment/index.ts b/src/templates/assets/javascripts/integrations/search/query/segment/index.ts new file mode 100644 index 00000000..b96796f4 --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/query/segment/index.ts @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* ---------------------------------------------------------------------------- + * Functions + * ------------------------------------------------------------------------- */ + +/** + * Segment a search query using the inverted index + * + * This function implements a clever approach to text segmentation for Asian + * languages, as it used the information already available in the search index. + * The idea is to greedily segment the search query based on the tokens that are + * already part of the index, as described in the linked issue. + * + * @see https://bit.ly/3lwjrk7 - GitHub issue + * + * @param query - Query value + * @param index - Inverted index + * + * @returns Segmented query value + */ +export function segment( + query: string, index: object +): Iterable<string> { + const segments = new Set<string>() + + /* Segment search query */ + const wordcuts = new Uint16Array(query.length) + for (let i = 0; i < query.length; i++) + for (let j = i + 1; j < query.length; j++) { + const value = query.slice(i, j) + if (value in index) + wordcuts[i] = j - i + } + + /* Compute longest matches with minimum overlap */ + const stack = [0] + for (let s = stack.length; s > 0;) { + const p = stack[--s] + for (let q = 1; q < wordcuts[p]; q++) + if (wordcuts[p + q] > wordcuts[p] - q) { + segments.add(query.slice(p, p + q)) + stack[s++] = p + q + } + + /* Continue at end of query string */ + const q = p + wordcuts[p] + if (wordcuts[q] && q < query.length - 1) + stack[s++] = q + + /* Add current segment */ + segments.add(query.slice(p, q)) + } + + // @todo fix this case in the code block above, this is a hotfix + if (segments.has("")) + return new Set([query]) + + /* Return segmented query value */ + return segments +} diff --git a/src/templates/assets/javascripts/integrations/search/query/transform/index.ts b/src/templates/assets/javascripts/integrations/search/query/transform/index.ts new file mode 100644 index 00000000..41497786 --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/query/transform/index.ts @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* ---------------------------------------------------------------------------- + * Helper types + * ------------------------------------------------------------------------- */ + +/** + * Visitor function + * + * @param value - String value + * + * @returns String term(s) + */ +type VisitorFn = ( + value: string +) => string | string[] + +/* ---------------------------------------------------------------------------- + * Functions + * ------------------------------------------------------------------------- */ + +/** + * Default transformation function + * + * 1. Trim excess whitespace from left and right. + * + * 2. Search for parts in quotation marks and prepend a `+` modifier to denote + * that the resulting document must contain all parts, converting the query + * to an `AND` query (as opposed to the default `OR` behavior). While users + * may expect parts enclosed in quotation marks to map to span queries, i.e. + * for which order is important, Lunr.js doesn't support them, so the best + * we can do is to convert the parts to an `AND` query. + * + * 3. Replace control characters which are not located at the beginning of the + * query or preceded by white space, or are not followed by a non-whitespace + * character or are at the end of the query string. Furthermore, filter + * unmatched quotation marks. + * + * 4. Split the query string at whitespace, then pass each part to the visitor + * function for tokenization, and append a wildcard to every resulting term + * that is not explicitly marked with a `+`, `-`, `~` or `^` modifier, since + * it ensures consistent and stable ranking when multiple terms are entered. + * Also, if a fuzzy or boost modifier are given, but no numeric value has + * been entered, default to 1 to not induce a query error. + * + * @param query - Query value + * @param fn - Visitor function + * + * @returns Transformed query value + */ +export function transform( + query: string, fn: VisitorFn = term => term +): string { + return query + + /* => 1 */ + .trim() + + /* => 2 */ + .split(/"([^"]+)"/g) + .map((parts, index) => index & 1 + ? parts.replace(/^\b|^(?![^\x00-\x7F]|$)|\s+/g, " +") + : parts + ) + .join("") + + /* => 3 */ + .replace(/"|(?:^|\s+)[*+\-:^~]+(?=\s+|$)/g, "") + + /* => 4 */ + .split(/\s+/g) + .reduce((prev, term) => { + const next = fn(term) + return [...prev, ...Array.isArray(next) ? next : [next]] + }, [] as string[]) + .map(term => /([~^]$)/.test(term) ? `${term}1` : term) + .map(term => /(^[+-]|[~^]\d+$)/.test(term) ? term : `${term}*`) + .join(" ") +} |
