diff options
| author | 2023-10-07 06:48:07 +0800 | |
|---|---|---|
| committer | 2023-10-07 06:48:07 +0800 | |
| commit | 991fd7a6d67ee017c57beaaa21fc31c4bee7944d (patch) | |
| tree | e895202203fcaa50b0052f60ef6fc7d6d2928cf9 /src/templates/assets/javascripts/integrations/search | |
| parent | d62900046bb6f754a8e6e7e670a66a90134055d9 (diff) | |
| download | infini-991fd7a6d67ee017c57beaaa21fc31c4bee7944d.tar.gz infini-991fd7a6d67ee017c57beaaa21fc31c4bee7944d.zip | |
feat(version): versions
Diffstat (limited to 'src/templates/assets/javascripts/integrations/search')
20 files changed, 1890 insertions, 0 deletions
diff --git a/src/templates/assets/javascripts/integrations/search/_/index.ts b/src/templates/assets/javascripts/integrations/search/_/index.ts new file mode 100644 index 00000000..0e217fa4 --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/_/index.ts @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +import { + SearchDocument, + SearchIndex, + SearchOptions, + setupSearchDocumentMap +} from "../config" +import { + Position, + PositionTable, + highlight, + highlightAll, + tokenize +} from "../internal" +import { + SearchQueryTerms, + getSearchQueryTerms, + parseSearchQuery, + segment, + transformSearchQuery +} from "../query" + +/* ---------------------------------------------------------------------------- + * Types + * ------------------------------------------------------------------------- */ + +/** + * Search item + */ +export interface SearchItem + extends SearchDocument +{ + score: number /* Score (relevance) */ + terms: SearchQueryTerms /* Search query terms */ +} + +/** + * Search result + */ +export interface SearchResult { + items: SearchItem[][] /* Search items */ + suggest?: string[] /* Search suggestions */ +} + +/* ---------------------------------------------------------------------------- + * Functions + * ------------------------------------------------------------------------- */ + +/** + * Create field extractor factory + * + * @param table - Position table map + * + * @returns Extractor factory + */ +function extractor(table: Map<string, PositionTable>) { + return (name: keyof SearchDocument) => { + return (doc: SearchDocument) => { + if (typeof doc[name] === "undefined") + return undefined + + /* Compute identifier and initialize table */ + const id = [doc.location, name].join(":") + table.set(id, lunr.tokenizer.table = []) + + /* Return field value */ + return doc[name] + } + } +} + +/** + * Compute the difference of two lists of strings + * + * @param a - 1st list of strings + * @param b - 2nd list of strings + * + * @returns Difference + */ +function difference(a: string[], b: string[]): string[] { + const [x, y] = [new Set(a), new Set(b)] + return [ + ...new Set([...x].filter(value => !y.has(value))) + ] +} + +/* ---------------------------------------------------------------------------- + * Class + * 
------------------------------------------------------------------------- */ + +/** + * Search index + */ +export class Search { + + /** + * Search document map + */ + protected map: Map<string, SearchDocument> + + /** + * Search options + */ + protected options: SearchOptions + + /** + * The underlying Lunr.js search index + */ + protected index: lunr.Index + + /** + * Internal position table map + */ + protected table: Map<string, PositionTable> + + /** + * Create the search integration + * + * @param data - Search index + */ + public constructor({ config, docs, options }: SearchIndex) { + const field = extractor(this.table = new Map()) + + /* Set up document map and options */ + this.map = setupSearchDocumentMap(docs) + this.options = options + + /* Set up document index */ + this.index = lunr(function () { + this.metadataWhitelist = ["position"] + this.b(0) + + /* Set up (multi-)language support */ + if (config.lang.length === 1 && config.lang[0] !== "en") { + // @ts-expect-error - namespace indexing not supported + this.use(lunr[config.lang[0]]) + } else if (config.lang.length > 1) { + this.use(lunr.multiLanguage(...config.lang)) + } + + /* Set up custom tokenizer (must be after language setup) */ + this.tokenizer = tokenize as typeof lunr.tokenizer + lunr.tokenizer.separator = new RegExp(config.separator) + + /* Set up custom segmenter, if loaded */ + lunr.segmenter = "TinySegmenter" in lunr + ? new lunr.TinySegmenter() + : undefined + + /* Compute functions to be removed from the pipeline */ + const fns = difference([ + "trimmer", "stopWordFilter", "stemmer" + ], config.pipeline) + + /* Remove functions from the pipeline for registered languages */ + for (const lang of config.lang.map(language => ( + // @ts-expect-error - namespace indexing not supported + language === "en" ? 
lunr : lunr[language] + ))) + for (const fn of fns) { + this.pipeline.remove(lang[fn]) + this.searchPipeline.remove(lang[fn]) + } + + /* Set up index reference */ + this.ref("location") + + /* Set up index fields */ + this.field("title", { boost: 1e3, extractor: field("title") }) + this.field("text", { boost: 1e0, extractor: field("text") }) + this.field("tags", { boost: 1e6, extractor: field("tags") }) + + /* Add documents to index */ + for (const doc of docs) + this.add(doc, { boost: doc.boost }) + }) + } + + /** + * Search for matching documents + * + * @param query - Search query + * + * @returns Search result + */ + public search(query: string): SearchResult { + + // Experimental Chinese segmentation + query = query.replace(/\p{sc=Han}+/gu, value => { + return [...segment(value, this.index.invertedIndex)] + .join("* ") + }) + + // @todo: move segmenter (above) into transformSearchQuery + query = transformSearchQuery(query) + if (!query) + return { items: [] } + + /* Parse query to extract clauses for analysis */ + const clauses = parseSearchQuery(query) + .filter(clause => ( + clause.presence !== lunr.Query.presence.PROHIBITED + )) + + /* Perform search and post-process results */ + const groups = this.index.search(query) + + /* Apply post-query boosts based on title and search query terms */ + .reduce<SearchItem[]>((item, { ref, score, matchData }) => { + let doc = this.map.get(ref) + if (typeof doc !== "undefined") { + + /* Shallow copy document */ + doc = { ...doc } + if (doc.tags) + doc.tags = [...doc.tags] + + /* Compute and analyze search query terms */ + const terms = getSearchQueryTerms( + clauses, + Object.keys(matchData.metadata) + ) + + /* Highlight matches in fields */ + for (const field of this.index.fields) { + if (typeof doc[field] === "undefined") + continue + + /* Collect positions from matches */ + const positions: Position[] = [] + for (const match of Object.values(matchData.metadata)) + if (typeof match[field] !== "undefined") + 
positions.push(...match[field].position) + + /* Skip highlighting, if no positions were collected */ + if (!positions.length) + continue + + /* Load table and determine highlighting method */ + const table = this.table.get([doc.location, field].join(":"))! + const fn = Array.isArray(doc[field]) + ? highlightAll + : highlight + + // @ts-expect-error - stop moaning, TypeScript! + doc[field] = fn(doc[field], table, positions, field !== "text") + } + + /* Highlight title and text and apply post-query boosts */ + const boost = +!doc.parent + + Object.values(terms) + .filter(t => t).length / + Object.keys(terms).length + + /* Append item */ + item.push({ + ...doc, + score: score * (1 + boost ** 2), + terms + }) + } + return item + }, []) + + /* Sort search results again after applying boosts */ + .sort((a, b) => b.score - a.score) + + /* Group search results by article */ + .reduce((items, result) => { + const doc = this.map.get(result.location) + if (typeof doc !== "undefined") { + const ref = doc.parent + ? doc.parent.location + : doc.location + items.set(ref, [...items.get(ref) || [], result]) + } + return items + }, new Map<string, SearchItem[]>()) + + /* Ensure that every item set has an article */ + for (const [ref, items] of groups) + if (!items.find(item => item.location === ref)) { + const doc = this.map.get(ref)! + items.push({ ...doc, score: 0, terms: {} }) + } + + /* Generate search suggestions, if desired */ + let suggest: string[] | undefined + if (this.options.suggest) { + const titles = this.index.query(builder => { + for (const clause of clauses) + builder.term(clause.term, { + fields: ["title"], + presence: lunr.Query.presence.REQUIRED, + wildcard: lunr.Query.wildcard.TRAILING + }) + }) + + /* Retrieve suggestions for best match */ + suggest = titles.length + ? 
Object.keys(titles[0].matchData.metadata) + : [] + } + + /* Return search result */ + return { + items: [...groups.values()], + ...typeof suggest !== "undefined" && { suggest } + } + } +} diff --git a/src/templates/assets/javascripts/integrations/search/config/index.ts b/src/templates/assets/javascripts/integrations/search/config/index.ts new file mode 100644 index 00000000..3d88d1c6 --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/config/index.ts @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +/* ---------------------------------------------------------------------------- + * Types + * ------------------------------------------------------------------------- */ + +/** + * Search configuration + */ +export interface SearchConfig { + lang: string[] /* Search languages */ + separator: string /* Search separator */ + pipeline: SearchPipelineFn[] /* Search pipeline */ +} + +/** + * Search document + */ +export interface SearchDocument { + location: string /* Document location */ + title: string /* Document title */ + text: string /* Document text */ + tags?: string[] /* Document tags */ + boost?: number /* Document boost */ + parent?: SearchDocument /* Document parent */ +} + +/** + * Search options + */ +export interface SearchOptions { + suggest: boolean /* Search suggestions */ +} + +/* ------------------------------------------------------------------------- */ + +/** + * Search index + */ +export interface SearchIndex { + config: SearchConfig /* Search configuration */ + docs: SearchDocument[] /* Search documents */ + options: SearchOptions /* Search options */ +} + +/* ---------------------------------------------------------------------------- + * Helper types + * ------------------------------------------------------------------------- */ + +/** + * Search pipeline function + */ +type SearchPipelineFn = + | "trimmer" /* Trimmer */ + | "stopWordFilter" /* Stop word filter */ + | "stemmer" /* Stemmer */ + +/* ---------------------------------------------------------------------------- + * Functions + * ------------------------------------------------------------------------- */ + +/** + * Create a search document map + * + * This function creates a mapping of URLs (including anchors) to the actual + * articles and sections. It relies on the invariant that the search index is + * ordered with the main article appearing before all sections with anchors. + * If this is not the case, the logic music be changed. 
+ * + * @param docs - Search documents + * + * @returns Search document map + */ +export function setupSearchDocumentMap( + docs: SearchDocument[] +): Map<string, SearchDocument> { + const map = new Map<string, SearchDocument>() + for (const doc of docs) { + const [path] = doc.location.split("#") + + /* Add document article */ + const article = map.get(path) + if (typeof article === "undefined") { + map.set(path, doc) + + /* Add document section */ + } else { + map.set(doc.location, doc) + doc.parent = article + } + } + + /* Return search document map */ + return map +} diff --git a/src/templates/assets/javascripts/integrations/search/highlighter/index.ts b/src/templates/assets/javascripts/integrations/search/highlighter/index.ts new file mode 100644 index 00000000..0fcbb19e --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/highlighter/index.ts @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +import escapeHTML from "escape-html" + +import { SearchConfig } from "../config" + +/* ---------------------------------------------------------------------------- + * Types + * ------------------------------------------------------------------------- */ + +/** + * Search highlight function + * + * @param value - Value + * + * @returns Highlighted value + */ +export type SearchHighlightFn = (value: string) => string + +/** + * Search highlight factory function + * + * @param query - Query value + * + * @returns Search highlight function + */ +export type SearchHighlightFactoryFn = (query: string) => SearchHighlightFn + +/* ---------------------------------------------------------------------------- + * Functions + * ------------------------------------------------------------------------- */ + +/** + * Create a search highlighter + * + * @param config - Search configuration + * + * @returns Search highlight factory function + */ +export function setupSearchHighlighter( + config: SearchConfig +): SearchHighlightFactoryFn { + // Hack: temporarily remove pure lookaheads and lookbehinds + const regex = config.separator.split("|").map(term => { + const temp = term.replace(/(\(\?[!=<][^)]+\))/g, "") + return temp.length === 0 ? 
"�" : term + }) + .join("|") + + const separator = new RegExp(regex, "img") + const highlight = (_: unknown, data: string, term: string) => { + return `${data}<mark data-md-highlight>${term}</mark>` + } + + /* Return factory function */ + return (query: string) => { + query = query + .replace(/[\s*+\-:~^]+/g, " ") + .trim() + + /* Create search term match expression */ + const match = new RegExp(`(^|${config.separator}|)(${ + query + .replace(/[|\\{}()[\]^$+*?.-]/g, "\\$&") + .replace(separator, "|") + })`, "img") + + /* Highlight string value */ + return value => escapeHTML(value) + .replace(match, highlight) + .replace(/<\/mark>(\s+)<mark[^>]*>/img, "$1") + } +} diff --git a/src/templates/assets/javascripts/integrations/search/index.ts b/src/templates/assets/javascripts/integrations/search/index.ts new file mode 100644 index 00000000..94c010bb --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/index.ts @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +export * from "./_" +export * from "./config" +export * from "./highlighter" +export * from "./query" +export * from "./worker" diff --git a/src/templates/assets/javascripts/integrations/search/internal/.eslintrc b/src/templates/assets/javascripts/integrations/search/internal/.eslintrc new file mode 100644 index 00000000..9368ceb6 --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/internal/.eslintrc @@ -0,0 +1,6 @@ +{ + "rules": { + "no-fallthrough": "off", + "no-underscore-dangle": "off" + } +} diff --git a/src/templates/assets/javascripts/integrations/search/internal/_/index.ts b/src/templates/assets/javascripts/integrations/search/internal/_/index.ts new file mode 100644 index 00000000..ae8f6104 --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/internal/_/index.ts @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* ---------------------------------------------------------------------------- + * Helper types + * ------------------------------------------------------------------------- */ + +/** + * Visitor function + * + * @param start - Start offset + * @param end - End offset + */ +type VisitorFn = ( + start: number, end: number +) => void + +/* ---------------------------------------------------------------------------- + * Functions + * ------------------------------------------------------------------------- */ + +/** + * Split a string using the given separator + * + * @param input - Input value + * @param separator - Separator + * @param fn - Visitor function + */ +export function split( + input: string, separator: RegExp, fn: VisitorFn +): void { + separator = new RegExp(separator, "g") + + /* Split string using separator */ + let match: RegExpExecArray | null + let index = 0 + do { + match = separator.exec(input) + + /* Emit non-empty range */ + const until = match?.index ?? 
input.length + if (index < until) + fn(index, until) + + /* Update last index */ + if (match) { + const [term] = match + index = match.index + term.length + + /* Support zero-length lookaheads */ + if (term.length === 0) + separator.lastIndex = match.index + 1 + } + } while (match) +} diff --git a/src/templates/assets/javascripts/integrations/search/internal/extract/index.ts b/src/templates/assets/javascripts/integrations/search/internal/extract/index.ts new file mode 100644 index 00000000..2a98b9e1 --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/internal/extract/index.ts @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +/* ---------------------------------------------------------------------------- + * Types + * ------------------------------------------------------------------------- */ + +/** + * Extraction type + * + * This type defines the possible values that are encoded into the first two + * bits of a section that is part of the blocks of a tokenization table. There + * are three types of interest: HTML opening and closing tags, as well as the + * actual text content we need to extract for indexing. + */ +export const enum Extract { + TAG_OPEN = 0, /* HTML opening tag */ + TEXT = 1, /* Text content */ + TAG_CLOSE = 2 /* HTML closing tag */ +} + +/* ---------------------------------------------------------------------------- + * Helper types + * ------------------------------------------------------------------------- */ + +/** + * Visitor function + * + * @param block - Block index + * @param type - Extraction type + * @param start - Start offset + * @param end - End offset + */ +type VisitorFn = ( + block: number, type: Extract, start: number, end: number +) => void + +/* ---------------------------------------------------------------------------- + * Functions + * ------------------------------------------------------------------------- */ + +/** + * Split a string into markup and text sections + * + * This function scans a string and divides it up into sections of markup and + * text. For each section, it invokes the given visitor function with the block + * index, extraction type, as well as start and end offsets. Using a visitor + * function (= streaming data) is ideal for minimizing pressure on the GC. 
+ * + * @param input - Input value + * @param fn - Visitor function + */ +export function extract( + input: string, fn: VisitorFn +): void { + + let block = 0 /* Current block */ + let start = 0 /* Current start offset */ + let end = 0 /* Current end offset */ + + /* Split string into sections */ + for (let stack = 0; end < input.length; end++) { + + /* Opening tag after non-empty section */ + if (input.charAt(end) === "<" && end > start) { + fn(block, Extract.TEXT, start, start = end) + + /* Closing tag */ + } else if (input.charAt(end) === ">") { + if (input.charAt(start + 1) === "/") { + if (--stack === 0) + fn(block++, Extract.TAG_CLOSE, start, end + 1) + + /* Tag is not self-closing */ + } else if (input.charAt(end - 1) !== "/") { + if (stack++ === 0) + fn(block, Extract.TAG_OPEN, start, end + 1) + } + + /* New section */ + start = end + 1 + } + } + + /* Add trailing section */ + if (end > start) + fn(block, Extract.TEXT, start, end) +} diff --git a/src/templates/assets/javascripts/integrations/search/internal/highlight/index.ts b/src/templates/assets/javascripts/integrations/search/internal/highlight/index.ts new file mode 100644 index 00000000..7cc3bf1a --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/internal/highlight/index.ts @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* ---------------------------------------------------------------------------- + * Types + * ------------------------------------------------------------------------- */ + +/** + * Position table + */ +export type PositionTable = number[][] + +/** + * Position + */ +export type Position = number + +/* ---------------------------------------------------------------------------- + * Functions + * ------------------------------------------------------------------------- */ + +/** + * Highlight all occurrences in a string + * + * This function receives a field's value (e.g. like `title` or `text`), it's + * position table that was generated during indexing, and the positions found + * when executing the query. It then highlights all occurrences, and returns + * their concatenation. In case of multiple blocks, two are returned. + * + * @param input - Input value + * @param table - Table for indexing + * @param positions - Occurrences + * @param full - Full results + * + * @returns Highlighted string value + */ +export function highlight( + input: string, table: PositionTable, positions: Position[], full = false +): string { + return highlightAll([input], table, positions, full).pop()! 
+} + +/** + * Highlight all occurrences in a set of strings + * + * @param inputs - Input values + * @param table - Table for indexing + * @param positions - Occurrences + * @param full - Full results + * + * @returns Highlighted string values + */ +export function highlightAll( + inputs: string[], table: PositionTable, positions: Position[], full = false +): string[] { + + /* Map blocks to input values */ + const mapping = [0] + for (let t = 1; t < table.length; t++) { + const prev = table[t - 1] + const next = table[t] + + /* Check if table points to new block */ + const p = prev[prev.length - 1] >>> 2 & 0x3FF + const q = next[0] >>> 12 + + /* Add block to mapping */ + mapping.push(+(p > q) + mapping[mapping.length - 1]) + } + + /* Highlight strings one after another */ + return inputs.map((input, i) => { + let cursor = 0 + + /* Map occurrences to blocks */ + const blocks = new Map<number, number[]>() + for (const p of positions.sort((a, b) => a - b)) { + const index = p & 0xFFFFF + const block = p >>> 20 + if (mapping[block] !== i) + continue + + /* Ensure presence of block group */ + let group = blocks.get(block) + if (typeof group === "undefined") + blocks.set(block, group = []) + + /* Add index to group */ + group.push(index) + } + + /* Just return string, if no occurrences */ + if (blocks.size === 0) + return input + + /* Compute slices */ + const slices: string[] = [] + for (const [block, indexes] of blocks) { + const t = table[block] + + /* Extract positions and length */ + const start = t[0] >>> 12 + const end = t[t.length - 1] >>> 12 + const length = t[t.length - 1] >>> 2 & 0x3FF + + /* Add prefix, if full results are desired */ + if (full && start > cursor) + slices.push(input.slice(cursor, start)) + + /* Extract and highlight slice */ + let slice = input.slice(start, end + length) + for (const j of indexes.sort((a, b) => b - a)) { + + /* Retrieve offset and length of match */ + const p = (t[j] >>> 12) - start + const q = (t[j] >>> 2 & 0x3FF) + p + + /* 
Wrap occurrence */ + slice = [ + slice.slice(0, p), + "<mark>", + slice.slice(p, q), + "</mark>", + slice.slice(q) + ].join("") + } + + /* Update cursor */ + cursor = end + length + + /* Append slice and abort if we have two */ + if (slices.push(slice) === 2) + break + } + + /* Add suffix, if full results are desired */ + if (full && cursor < input.length) + slices.push(input.slice(cursor)) + + /* Return highlighted slices */ + return slices.join("") + }) +} diff --git a/src/templates/assets/javascripts/integrations/search/internal/index.ts b/src/templates/assets/javascripts/integrations/search/internal/index.ts new file mode 100644 index 00000000..c752329e --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/internal/index.ts @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +export * from "./_" +export * from "./extract" +export * from "./highlight" +export * from "./tokenize" diff --git a/src/templates/assets/javascripts/integrations/search/internal/tokenize/index.ts b/src/templates/assets/javascripts/integrations/search/internal/tokenize/index.ts new file mode 100644 index 00000000..f5089bc9 --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/internal/tokenize/index.ts @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
import { split } from "../_"
import {
  Extract,
  extract
} from "../extract"

/* ----------------------------------------------------------------------------
 * Functions
 * ------------------------------------------------------------------------- */

/**
 * Tokenize a string or a list of strings
 *
 * Replacement for the default Lunr.js tokenizer that understands HTML tags
 * and supports separators spanning several characters.
 *
 * Each input is cut into markup and text sections by `extract`. Every section
 * is recorded in `lunr.tokenizer.table` as a single number that packs the
 * start offset (shifted left by 12), the length (shifted left by 2) and the
 * section type. Text sections are further split with the configured separator
 * and, if a segmenter is installed and reports one of the character types
 * `M`, `H`, `I` or `K` for a word, segmented into smaller pieces. Each word
 * or piece yields a lowercased token whose `position` metadata packs the
 * block number (shifted left by 20) and the index of its table entry.
 *
 * @see https://bit.ly/3W3Xw4J - Search: better, faster, smaller
 *
 * @param input - Input value(s)
 *
 * @returns Tokens
 */
export function tokenize(
  input?: string | string[]
): lunr.Token[] {
  const tokens: lunr.Token[] = []
  if (typeof input === "undefined")
    return tokens

  /* Process one input value at a time */
  const inputs = Array.isArray(input) ? input : [input]
  for (const value of inputs) {
    const table = lunr.tokenizer.table

    /* Blocks of this input are appended after the blocks already present */
    const base = table.length
    extract(value, (block, type, start, end) => {
      const id = block + base
      table[id] ||= []

      /* Markup is only recorded in the table and never produces a token */
      if (type === Extract.TAG_OPEN || type === Extract.TAG_CLOSE) {
        table[id].push(
          start << 12 |
          end - start << 2 |
          type
        )
        return
      }

      /* Everything else that is not text content is ignored */
      if (type !== Extract.TEXT)
        return

      /* Record a piece of text in the table and emit the token for it */
      const record = (text: string, offset: number, length: number) => {
        table[id].push(
          offset << 12 |
          length << 2 |
          type
        )
        tokens.push(new lunr.Token(
          text.toLowerCase(), {
            position: id << 20 | table[id].length - 1
          }
        ))
      }

      /* Split text content at separators */
      const section = value.slice(start, end)
      split(section, lunr.tokenizer.separator, (index, until) => {
        const word = section.slice(index, until)

        /*
         * The segmenter runs after tokenization. It also splits words at
         * word boundaries, which is not desired, so it is only applied if
         * the word is reported with one of the character types below.
         */
        if (
          typeof lunr.segmenter !== "undefined" &&
          /^[MHIK]$/.test(lunr.segmenter.ctype_(word))
        ) {
          let consumed = 0
          for (const piece of lunr.segmenter.segment(word)) {
            record(piece, start + index + consumed, piece.length)
            consumed += piece.length
          }
          return
        }

        /* Otherwise the word is a token of its own */
        record(word, start + index, until - index)
      })
    })
  }

  /* Return tokens */
  return tokens
}
/*
 * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

import { split } from "../../internal"
import { transform } from "../transform"

/* ----------------------------------------------------------------------------
 * Types
 * ------------------------------------------------------------------------- */

/**
 * Search query clause
 */
export interface SearchQueryClause {
  presence: lunr.Query.presence        /* Clause presence */
  term: string                         /* Clause term */
}

/* ------------------------------------------------------------------------- */

/**
 * Search query terms
 */
export type SearchQueryTerms = Record<string, boolean>

/* ----------------------------------------------------------------------------
 * Functions
 * ------------------------------------------------------------------------- */

/**
 * Transform search query
 *
 * This function lexes each part of the given search query with the Lunr.js
 * query lexer and tokenizes every term it finds, preserving the text around
 * the term (e.g. `+` and `-` modifiers) for each resulting token.
 *
 * @param query - Search query
 *
 * @returns Search query
 */
export function transformSearchQuery(
  query: string
): string {

  /* Split query terms with tokenizer */
  return transform(query, part => {
    const terms: string[] = []

    /* Initialize lexer and analyze part */
    const lexer = new lunr.QueryLexer(part)
    lexer.run()

    /* Extract and tokenize term from lexeme */
    for (const { type, str: term, start, end } of lexer.lexemes)
      switch (type) {

        /* Hack: remove colon - see https://bit.ly/3wD3T3I */
        case "FIELD":
          if (!["title", "text", "tags"].includes(term))
            part = [
              part.slice(0, end),
              " ",
              part.slice(end + 1)
            ].join("")
          break

        /* Tokenize term, re-attaching what surrounds it in the part */
        case "TERM":
          split(term, lunr.tokenizer.separator, (...range) => {
            terms.push([
              part.slice(0, start),
              term.slice(...range),
              part.slice(end)
            ].join(""))
          })
      }

    /* Return terms */
    return terms
  })
}

/* ------------------------------------------------------------------------- */

/**
 * Parse a search query for analysis
 *
 * Lunr.js itself has a bug where it doesn't detect or remove wildcards for
 * query clauses, so we must do this here. The wildcard flags are combined,
 * so a term with a wildcard on both ends carries both the leading and the
 * trailing flag instead of only the one that was checked last.
 *
 * @see https://bit.ly/3DpTGtz - GitHub issue
 *
 * @param value - Query value
 *
 * @returns Search query clauses
 */
export function parseSearchQuery(
  value: string
): SearchQueryClause[] {
  const query  = new lunr.Query(["title", "text", "tags"])
  const parser = new lunr.QueryParser(value, query)

  /* Parse Search query */
  parser.parse()
  for (const clause of query.clauses) {
    clause.usePipeline = true

    /* Start from the flags already present on the clause, if any */
    let wildcard = clause.wildcard ?? lunr.Query.wildcard.NONE

    /* Handle leading wildcard */
    if (clause.term.startsWith("*")) {
      wildcard |= lunr.Query.wildcard.LEADING
      clause.term = clause.term.slice(1)
      clause.wildcard = wildcard
    }

    /* Handle trailing wildcard */
    if (clause.term.endsWith("*")) {
      wildcard |= lunr.Query.wildcard.TRAILING
      clause.term = clause.term.slice(0, -1)
      clause.wildcard = wildcard
    }
  }

  /* Return query clauses */
  return query.clauses
}

/**
 * Analyze the search query clauses in regard to the search terms found
 *
 * A clause is reported as `true` as soon as one of the given terms starts
 * with the term of the clause. Of the clauses that remain, those for which
 * `lunr.stopWordFilter` returns a truthy value are reported as `false`, and
 * all others are omitted from the result.
 *
 * @param query - Search query clauses
 * @param terms - Search terms
 *
 * @returns Search query terms
 */
export function getSearchQueryTerms(
  query: SearchQueryClause[], terms: string[]
): SearchQueryTerms {
  const clauses = new Set<SearchQueryClause>(query)

  /* Match query clauses against terms, dropping clauses once matched */
  const result: SearchQueryTerms = {}
  for (const term of terms)
    for (const clause of clauses)
      if (term.startsWith(clause.term)) {
        result[clause.term] = true
        clauses.delete(clause)
      }

  /* Annotate unmatched non-stopword query clauses */
  for (const clause of clauses)
    if (lunr.stopWordFilter?.(clause.term))
      result[clause.term] = false

  /* Return query terms */
  return result
}
<martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
/*
 * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* ----------------------------------------------------------------------------
 * Functions
 * ------------------------------------------------------------------------- */

/**
 * Segment a search query using the inverted index
 *
 * Text segmentation for Asian languages that reuses the information already
 * available in the search index: the query is greedily cut into the tokens
 * that are keys of the index, as described in the linked issue.
 *
 * For every offset, the length of the longest candidate starting there that
 * is a key of the index is computed first. Candidates end before the last
 * character of the query. The offsets are then walked with a stack, starting
 * at offset 0, collecting the segments. If an empty segment was collected,
 * the whole query is returned as the only segment.
 *
 * @see https://bit.ly/3lwjrk7 - GitHub issue
 *
 * @param query - Query value
 * @param index - Inverted index
 *
 * @returns Segmented query value
 */
export function segment(
  query: string, index: object
): Iterable<string> {
  const size = query.length

  /* Determine the longest indexed candidate for every offset */
  const lengths = new Uint16Array(size)
  for (let from = 0; from < size; from++) {
    for (let to = from + 1; to < size; to++) {
      if (query.slice(from, to) in index)
        lengths[from] = to - from
    }
  }

  /* Compute longest matches with minimum overlap */
  const found = new Set<string>()
  const pending: number[] = [0]
  while (pending.length > 0) {
    const offset = pending.pop() as number
    const span = lengths[offset]

    /* Branch off where a candidate inside reaches beyond the current one */
    for (let shift = 1; shift < span; shift++) {
      if (lengths[offset + shift] > span - shift) {
        found.add(query.slice(offset, offset + shift))
        pending.push(offset + shift)
      }
    }

    /* Continue right after the current candidate, if another one starts */
    const next = offset + span
    if (lengths[next] && next < size - 1)
      pending.push(next)

    /* Add current segment */
    found.add(query.slice(offset, next))
  }

  // @todo fix this case in the code block above, this is a hotfix
  return found.has("")
    ? new Set([query])
    : found
}
/*
 * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* ----------------------------------------------------------------------------
 * Helper types
 * ------------------------------------------------------------------------- */

/**
 * Visitor function
 *
 * @param value - String value
 *
 * @returns String term(s)
 */
type VisitorFn = (
  value: string
) => string | string[]

/* ----------------------------------------------------------------------------
 * Functions
 * ------------------------------------------------------------------------- */

/**
 * Default transformation function
 *
 * 1. Trim excess whitespace from left and right.
 *
 * 2. Search for parts in quotation marks and prepend a `+` modifier to denote
 *    that the resulting document must contain all parts, converting the query
 *    to an `AND` query (as opposed to the default `OR` behavior). While users
 *    may expect parts enclosed in quotation marks to map to span queries, i.e.
 *    for which order is important, Lunr.js doesn't support them, so the best
 *    we can do is to convert the parts to an `AND` query. The modifier is
 *    inserted at the start of a quoted part, unless the part starts with a
 *    non-ASCII character, and in place of every run of whitespace inside it.
 *
 * 3. Remove all remaining quotation marks, as well as every run of control
 *    characters (`*`, `+`, `-`, `:`, `^`, `~`) that stands on its own, i.e.,
 *    that begins at the start of the query or after whitespace, and is
 *    followed by whitespace or the end of the query.
 *
 * 4. Split the query string at whitespace, then pass each part to the visitor
 *    function for tokenization. If a term ends with a fuzzy (`~`) or boost
 *    (`^`) modifier without a numeric value, default the value to 1 to not
 *    induce a query error. Finally, append a wildcard to every term that
 *    neither starts with `+` or `-` nor ends with a fuzzy or boost modifier
 *    and its value, since it ensures consistent and stable ranking when
 *    multiple terms are entered.
 *
 * @param query - Query value
 * @param fn - Visitor function
 *
 * @returns Transformed query value
 */
export function transform(
  query: string, fn: VisitorFn = term => term
): string {
  return query

    /* => 1 */
    .trim()

    /* => 2 */
    .split(/"([^"]+)"/g)
      .map((parts, index) => index & 1
        ? parts.replace(/^\b|^(?![^\x00-\x7F]|$)|\s+/g, " +")
        : parts
      )
      .join("")

    /* => 3 */
    .replace(/"|(?:^|\s+)[*+\-:^~]+(?=\s+|$)/g, "")

    /* => 4 - flatten in one pass instead of re-copying the list per term */
    .split(/\s+/g)
      .flatMap<string>(term => fn(term))
      .map(term => /([~^]$)/.test(term) ? `${term}1` : term)
      .map(term => /(^[+-]|[~^]\d+$)/.test(term) ? term : `${term}*`)
      .join(" ")
}
/*
 * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

import {
  ObservableInput,
  Subject,
  first,
  merge,
  of,
  switchMap
} from "rxjs"

import { feature } from "~/_"
import { watchToggle, watchWorker } from "~/browser"

import { SearchIndex } from "../../config"
import {
  SearchMessage,
  SearchMessageType
} from "../message"

/* ----------------------------------------------------------------------------
 * Functions
 * ------------------------------------------------------------------------- */

/**
 * Set up search worker
 *
 * Search runs inside of a web worker, so that the user interface doesn't
 * freeze. The application does not care how search is implemented, as long as
 * the worker speaks the format defined in `SearchMessage`, which allows the
 * author to plug in custom search functionality by providing a custom web
 * worker via configuration.
 *
 * Material for MkDocs' built-in search implementation makes use of Lunr.js, an
 * efficient and fast implementation for client-side search. Leveraging a tiny
 * iframe-based web worker shim, search is even supported for the `file://`
 * protocol, enabling search for local non-hosted builds.
 *
 * The setup message is sent once the first truthy value arrives from either
 * of two sources: a check whether the protocol is something other than
 * `file://`, and the `search` toggle. For `file://`, initialization is thus
 * deferred until the toggle reports a truthy value, which mitigates freezing,
 * as search is synchronous by design in that case - see
 * https://bit.ly/3C521EO
 *
 * @see https://bit.ly/3igvtQv - How to implement custom search
 *
 * @param url - Worker URL
 * @param index$ - Search index observable input
 *
 * @returns Search worker
 */
export function setupSearchWorker(
  url: string, index$: ObservableInput<SearchIndex>
): Subject<SearchMessage> {
  const worker$ = watchWorker<SearchMessage>(url)

  /* Wait for the first activation, then switch to the search index */
  const setup$ = merge(
    of(location.protocol !== "file:"),
    watchToggle("search")
  )
    .pipe(
      first(active => active),
      switchMap(() => index$)
    )

  /* Hand configuration and documents over to the worker */
  setup$.subscribe(({ config, docs }) => {
    worker$.next({
      type: SearchMessageType.SETUP,
      data: {
        config,
        docs,
        options: {
          suggest: feature("search.suggest")
        }
      }
    })
  })

  /* Return search worker */
  return worker$
}
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +export * from "./_" +export * from "./message" diff --git a/src/templates/assets/javascripts/integrations/search/worker/main/.eslintrc b/src/templates/assets/javascripts/integrations/search/worker/main/.eslintrc new file mode 100644 index 00000000..3df9d551 --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/worker/main/.eslintrc @@ -0,0 +1,6 @@ +{ + "rules": { + "no-console": "off", + "@typescript-eslint/no-misused-promises": "off" + } +} diff --git a/src/templates/assets/javascripts/integrations/search/worker/main/index.ts b/src/templates/assets/javascripts/integrations/search/worker/main/index.ts new file mode 100644 index 00000000..2df38080 --- /dev/null +++ b/src/templates/assets/javascripts/integrations/search/worker/main/index.ts @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
/*
 * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

import lunr from "lunr"

import { getElement } from "~/browser/element/_"
import "~/polyfills"

import { Search } from "../../_"
import { SearchConfig } from "../../config"
import {
  SearchMessage,
  SearchMessageType
} from "../message"

/* ----------------------------------------------------------------------------
 * Types
 * ------------------------------------------------------------------------- */

/**
 * Add support for `iframe-worker` shim
 *
 * While `importScripts` is synchronous when executed inside of a web worker,
 * it's not possible to provide a synchronous shim implementation. The cool
 * thing is that awaiting a non-Promise will convert it into a Promise, so
 * extending the type definition to return a `Promise` shouldn't break anything.
 *
 * @see https://bit.ly/2PjDnXi - GitHub comment
 *
 * @param urls - Scripts to load
 *
 * @returns Promise resolving with no result
 */
declare global {
  function importScripts(...urls: string[]): Promise<void> | void
}

/* ----------------------------------------------------------------------------
 * Data
 * ------------------------------------------------------------------------- */

/**
 * Search index, assigned when a setup message is handled
 */
let index: Search

/* ----------------------------------------------------------------------------
 * Helper functions
 * ------------------------------------------------------------------------- */

/**
 * Fetch (= import) multi-language support through `lunr-languages`
 *
 * This function automatically imports the stemmers necessary to process the
 * languages which are defined as part of the search configuration. Every
 * script is requested only once, even if several configured languages need
 * the same segmenter.
 *
 * If the worker runs inside of an `iframe` (when using `iframe-worker` as
 * a shim), the base URL for the stemmers to be loaded must be determined by
 * searching for the first `script` element with a `src` attribute, which will
 * contain the contents of this script.
 *
 * @param config - Search configuration
 *
 * @returns Promise resolving with no result
 */
async function setupSearchLanguages(
  config: SearchConfig
): Promise<void> {
  let base = "../lunr"

  /* Detect `iframe-worker` and fix base URL */
  if (typeof parent !== "undefined" && "IFrameWorker" in parent) {
    const worker = getElement<HTMLScriptElement>("script[src]")!
    const [path] = worker.src.split("/worker")

    /* Prefix base with path */
    base = base.replace("..", path)
  }

  /* Add scripts for languages - a set keeps order and drops duplicates */
  const scripts = new Set<string>()
  for (const lang of config.lang) {
    switch (lang) {

      /* Add segmenter for Japanese */
      case "ja":
        scripts.add(`${base}/tinyseg.js`)
        break

      /* Add segmenter for Hindi and Thai */
      case "hi":
      case "th":
        scripts.add(`${base}/wordcut.js`)
        break
    }

    /* Add language support */
    if (lang !== "en")
      scripts.add(`${base}/min/lunr.${lang}.min.js`)
  }

  /* Add multi-language support */
  if (config.lang.length > 1)
    scripts.add(`${base}/min/lunr.multi.min.js`)

  /* Load scripts synchronously, stemmer support first */
  if (scripts.size)
    await importScripts(
      `${base}/min/lunr.stemmer.support.min.js`,
      ...scripts
    )
}

/* ----------------------------------------------------------------------------
 * Functions
 * ------------------------------------------------------------------------- */

/**
 * Message handler
 *
 * A setup message loads the language support, creates the search index and
 * is answered with a ready message. A query message is answered with a result
 * message. If searching throws, the error is logged as a warning and an empty
 * result is returned. All other messages are rejected with a `TypeError`.
 *
 * @param message - Source message
 *
 * @returns Target message
 */
export async function handler(
  message: SearchMessage
): Promise<SearchMessage> {
  switch (message.type) {

    /* Search setup message */
    case SearchMessageType.SETUP:
      await setupSearchLanguages(message.data.config)
      index = new Search(message.data)
      return {
        type: SearchMessageType.READY
      }

    /* Search query message */
    case SearchMessageType.QUERY: {
      const query = message.data
      try {
        return {
          type: SearchMessageType.RESULT,
          data: index.search(query)
        }

      /* Return empty result in case of error */
      } catch (err) {
        console.warn(`Invalid query: ${query} – see https://bit.ly/2s3ChXG`)
        console.warn(err)
        return {
          type: SearchMessageType.RESULT,
          data: { items: [] }
        }
      }
    }

    /* All other messages */
    default:
      throw new TypeError("Invalid message type")
  }
}

/* ----------------------------------------------------------------------------
 * Worker
 * ------------------------------------------------------------------------- */

/* Expose Lunr.js in global scope, or stemmers won't work */
self.lunr = lunr

/* Handle messages */
addEventListener("message", async ev => {
  postMessage(await handler(ev.data))
})
+ */ + +import { SearchResult } from "../../_" +import { SearchIndex } from "../../config" + +/* ---------------------------------------------------------------------------- + * Types + * ------------------------------------------------------------------------- */ + +/** + * Search message type + * + * Used as the value of the type field of every search worker message, which + * is what the type guards in this file compare against. + */ +export const enum SearchMessageType { + SETUP, /* Search index setup */ + READY, /* Search index ready */ + QUERY, /* Search query */ + RESULT /* Search results */ +} + +/* ------------------------------------------------------------------------- */ + +/** + * Message containing the data necessary to set up the search index + */ +export interface SearchSetupMessage { + type: SearchMessageType.SETUP /* Message type */ + data: SearchIndex /* Message data */ +} + +/** + * Message indicating the search index is ready + */ +export interface SearchReadyMessage { + type: SearchMessageType.READY /* Message type */ +} + +/** + * Message containing a search query + */ +export interface SearchQueryMessage { + type: SearchMessageType.QUERY /* Message type */ + data: string /* Message data */ +} + +/** + * Message containing results for a search query + */ +export interface SearchResultMessage { + type: SearchMessageType.RESULT /* Message type */ + data: SearchResult /* Message data */ +} + +/* ------------------------------------------------------------------------- */ + +/** + * Message exchanged with the search worker + * + * Union of the setup, ready, query and result messages, told apart by their + * type field. + */ +export type SearchMessage = + | SearchSetupMessage + | SearchReadyMessage + | SearchQueryMessage + | SearchResultMessage + +/* ---------------------------------------------------------------------------- + * Functions + * ------------------------------------------------------------------------- */ + +/** + * Type guard for search ready messages + * + * Narrows the message to a search ready message when its type field equals + * SearchMessageType.READY. + * + * @param message - Search worker message + * + * @returns Whether the message is a search ready message + */ +export function isSearchReadyMessage( + message: SearchMessage +): message is SearchReadyMessage { + return message.type === 
SearchMessageType.READY +} + +/** + * Type guard for search result messages + * + * Narrows the message to a search result message when its type field equals + * SearchMessageType.RESULT. + * + * @param message - Search worker message + * + * @returns Whether the message is a search result message + */ +export function isSearchResultMessage( + message: SearchMessage +): message is SearchResultMessage { + return message.type === SearchMessageType.RESULT +} |
