src/templates/assets/javascripts/integrations/search/internal/extract/index.ts


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

/*
 * Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* ----------------------------------------------------------------------------
 * Types
 * ------------------------------------------------------------------------- */

/**
 * Extraction type
 *
 * This type defines the possible values that are encoded into the first two
 * bits of a section that is part of the blocks of a tokenization table. There
 * are three types of interest: HTML opening and closing tags, as well as the
 * actual text content we need to extract for indexing.
 */
export const enum Extract {
  TAG_OPEN  = 0,                       /* HTML opening tag */
  TEXT      = 1,                       /* Text content */
  TAG_CLOSE = 2                        /* HTML closing tag */
}

/* ----------------------------------------------------------------------------
 * Helper types
 * ------------------------------------------------------------------------- */

/**
 * Visitor function
 *
 * @param block - Block index
 * @param type - Extraction type
 * @param start - Start offset
 * @param end - End offset
 */
type VisitorFn = (
  block: number, type: Extract, start: number, end: number
) => void

/* ----------------------------------------------------------------------------
 * Functions
 * ------------------------------------------------------------------------- */

/**
 * Split a string into markup and text sections
 *
 * This function scans a string and divides it up into sections of markup and
 * text. For each section, it invokes the given visitor function with the block
 * index, extraction type, as well as start and end offsets. Using a visitor
 * function (= streaming data) is ideal for minimizing pressure on the GC.
 *
 * @param input - Input value
 * @param fn - Visitor function
 */
export function extract(
  input: string, fn: VisitorFn
): void {

  let block = 0                        /* Current block */
  let start = 0                        /* Current start offset */
  let end = 0                          /* Current end offset */

  /* Split string into sections */
  for (let stack = 0; end < input.length; end++) {

    /* Opening tag after non-empty section */
    if (input.charAt(end) === "<" && end > start) {
      fn(block, Extract.TEXT, start, start = end)

    /* Closing tag */
    } else if (input.charAt(end) === ">") {
      if (input.charAt(start + 1) === "/") {
        if (--stack === 0)
          fn(block++, Extract.TAG_CLOSE, start, end + 1)

      /* Tag is not self-closing */
      } else if (input.charAt(end - 1) !== "/") {
        if (stack++ === 0)
          fn(block, Extract.TAG_OPEN, start, end + 1)
      }

      /* New section */
      start = end + 1
    }
  }

  /* Add trailing section */
  if (end > start)
    fn(block, Extract.TEXT, start, end)
}