aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/docs/src/templates/assets/javascripts/integrations/search/internal/extract
diff options
context:
space:
mode:
author简律纯 <i@jyunko.cn>2023-12-15 09:11:47 +0800
committer简律纯 <i@jyunko.cn>2023-12-15 09:11:47 +0800
commitbe8de118db913711eb72ae5187d26e54a0055727 (patch)
tree96cd6c012dafa3f4015e54edef90df5eaaab0ddb /docs/src/templates/assets/javascripts/integrations/search/internal/extract
parent9b2d27ba1d91a0d5531bc9c0d52c3887a2dfb2aa (diff)
downloadinfini-be8de118db913711eb72ae5187d26e54a0055727.tar.gz
infini-be8de118db913711eb72ae5187d26e54a0055727.zip
refactor(docs): optmst `docs` dir & `deps`
Diffstat (limited to 'docs/src/templates/assets/javascripts/integrations/search/internal/extract')
-rw-r--r--docs/src/templates/assets/javascripts/integrations/search/internal/extract/index.ts107
1 files changed, 107 insertions, 0 deletions
diff --git a/docs/src/templates/assets/javascripts/integrations/search/internal/extract/index.ts b/docs/src/templates/assets/javascripts/integrations/search/internal/extract/index.ts
new file mode 100644
index 00000000..2a98b9e1
--- /dev/null
+++ b/docs/src/templates/assets/javascripts/integrations/search/internal/extract/index.ts
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* ----------------------------------------------------------------------------
+ * Types
+ * ------------------------------------------------------------------------- */
+
/**
 * Extraction type
 *
 * Numeric tag identifying the kind of a section. The value is what gets
 * encoded into the first two bits of a section that is part of the blocks of
 * a tokenization table, which is why all members are pinned to explicit
 * values that fit into two bits. Three kinds are of interest: HTML opening
 * tags, HTML closing tags, and the text content that is extracted for
 * indexing.
 */
export const enum Extract {

  /** HTML opening tag */
  TAG_OPEN = 0,

  /** Text content */
  TEXT = 1,

  /** HTML closing tag */
  TAG_CLOSE = 2
}
+
+/* ----------------------------------------------------------------------------
+ * Helper types
+ * ------------------------------------------------------------------------- */
+
/**
 * Visitor function
 *
 * Callback signature used to report a single section. The return value is
 * ignored.
 *
 * @param block - Block index
 * @param type - Extraction type
 * @param start - Start offset
 * @param end - End offset
 */
type VisitorFn = (
  block: number,
  type: Extract,
  start: number,
  end: number
) => void
+
+/* ----------------------------------------------------------------------------
+ * Functions
+ * ------------------------------------------------------------------------- */
+
/**
 * Split a string into markup and text sections
 *
 * This function scans a string and divides it up into sections of markup and
 * text. For each section, it invokes the given visitor function with the block
 * index, extraction type, as well as start and end offsets, where the end
 * offset is exclusive. Using a visitor function (= streaming data) is ideal
 * for minimizing pressure on the GC.
 *
 * Only the outermost opening and closing tags are reported; tags nested inside
 * of them are skipped, while text is reported at every nesting level. The
 * block index is incremented after each outermost closing tag. A tag is
 * considered self-closing, and thus skipped, if `/` immediately precedes `>`.
 *
 * A `>` only terminates a tag if the current section starts with `<`, so a
 * literal `>` inside of text content remains part of the text. Closing tags
 * without a matching opening tag are ignored, so they can't corrupt the
 * nesting depth of the elements that follow.
 *
 * @param input - Input value
 * @param fn - Visitor function
 */
export function extract(
  input: string, fn: VisitorFn
): void {

  let block = 0 /* Current block */
  let start = 0 /* Current start offset */
  let end = 0 /* Current end offset */

  /* Split string into sections */
  for (let stack = 0; end < input.length; end++) {
    const char = input.charAt(end)

    /* Opening bracket after non-empty section */
    if (char === "<" && end > start) {
      fn(block, Extract.TEXT, start, end)
      start = end

    /* Closing bracket of a tag, i.e., the section starts with "<" */
    } else if (char === ">" && input.charAt(start) === "<") {

      /* Closing tag - ignore if there's no matching opening tag */
      if (input.charAt(start + 1) === "/") {
        if (stack > 0 && --stack === 0)
          fn(block++, Extract.TAG_CLOSE, start, end + 1)

      /* Tag is not self-closing */
      } else if (input.charAt(end - 1) !== "/") {
        if (stack++ === 0)
          fn(block, Extract.TAG_OPEN, start, end + 1)
      }

      /* New section */
      start = end + 1
    }
  }

  /* Add trailing section */
  if (end > start)
    fn(block, Extract.TEXT, start, end)
}