1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
|
/*
* Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/* ----------------------------------------------------------------------------
* Types
* ------------------------------------------------------------------------- */
/**
 * Extraction type
 *
 * Enumerates the values that are encoded into the first two bits of a section
 * belonging to a block of a tokenization table. Three kinds of sections are
 * distinguished: HTML opening tags, HTML closing tags, and the text content
 * that needs to be extracted for indexing.
 */
export const enum Extract {

  /** HTML opening tag */
  TAG_OPEN = 0,

  /** Text content */
  TEXT = 1,

  /** HTML closing tag */
  TAG_CLOSE = 2
}
/* ----------------------------------------------------------------------------
* Helper types
* ------------------------------------------------------------------------- */
/**
 * Visitor function
 *
 * Signature of the callback accepted by `extract`, which invokes it once for
 * every section it reports. Offsets refer to the scanned input string and
 * form a half-open range, i.e. `end` points one past the last character.
 *
 * @param block - Block index
 * @param type - Extraction type
 * @param start - Start offset (inclusive)
 * @param end - End offset (exclusive)
 */
type VisitorFn = (
  block: number,
  type: Extract,
  start: number,
  end: number
) => void
/* ----------------------------------------------------------------------------
* Functions
* ------------------------------------------------------------------------- */
/**
 * Split a string into markup and text sections
 *
 * This function scans a string and divides it up into sections of markup and
 * text. For each reported section, it invokes the given visitor function with
 * the block index, extraction type, as well as start (inclusive) and end
 * (exclusive) offsets. Using a visitor function (= streaming data) is ideal
 * for minimizing pressure on the GC.
 *
 * Only the outermost tags are reported: an opening tag is passed to the
 * visitor when the nesting depth is zero, and a closing tag when the depth
 * returns to zero, after which the block index is incremented. Text sections
 * are always reported, regardless of depth. Self-closing tags (`<.../>`) are
 * skipped and do not affect the depth. Tags without a trailing slash always
 * count as opening tags, so void elements like `<br>` increase the depth.
 *
 * Malformed input is handled leniently: a `>` that does not terminate a
 * section starting with `<` is kept as part of the surrounding text, and a
 * closing tag without a matching opening tag is ignored, so that the nesting
 * depth can never become negative.
 *
 * @param input - Input value
 * @param fn - Visitor function
 */
export function extract(
  input: string, fn: VisitorFn
): void {
  let block = 0                        /* Current block */
  let start = 0                        /* Current start offset */
  let end   = 0                        /* Current end offset */
  let depth = 0                        /* Current tag nesting depth */

  /* Split string into sections */
  for (; end < input.length; end++) {
    const char = input.charAt(end)

    /* Opening bracket after non-empty section - flush preceding section */
    if (char === "<" && end > start) {
      fn(block, Extract.TEXT, start, end)
      start = end

    /* Closing bracket - only terminates a section that started with "<" */
    } else if (char === ">" && input.charAt(start) === "<") {

      /* Closing tag - report when leaving the outermost tag */
      if (input.charAt(start + 1) === "/") {

        /* Ignore unmatched closing tags, or depth would become negative */
        if (depth > 0 && --depth === 0) {
          fn(block, Extract.TAG_CLOSE, start, end + 1)
          block++
        }

      /* Tag is not self-closing - report when entering the outermost tag */
      } else if (input.charAt(end - 1) !== "/") {
        if (depth++ === 0)
          fn(block, Extract.TAG_OPEN, start, end + 1)
      }

      /* New section */
      start = end + 1
    }
  }

  /* Add trailing section */
  if (end > start)
    fn(block, Extract.TEXT, start, end)
}
|