diff options
Diffstat (limited to 'docs/src/plugins/search/plugin.py')
| -rw-r--r-- | docs/src/plugins/search/plugin.py | 580 |
1 file changed, 580 insertions, 0 deletions
diff --git a/docs/src/plugins/search/plugin.py b/docs/src/plugins/search/plugin.py new file mode 100644 index 00000000..5c254e3f --- /dev/null +++ b/docs/src/plugins/search/plugin.py @@ -0,0 +1,580 @@ +# Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com> + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
import json
import logging
import os
import regex as re

from html import escape
from html.parser import HTMLParser
from mkdocs import utils
from mkdocs.plugins import BasePlugin

from .config import SearchConfig

# Optional dependency - jieba is only needed for segmenting Chinese text,
# so fall back to None when it's not installed
try:
    import jieba
except ImportError:
    jieba = None

# -----------------------------------------------------------------------------
# Classes
# -----------------------------------------------------------------------------

# Search plugin
class SearchPlugin(BasePlugin[SearchConfig]):

    # Initialize plugin
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Initialize incremental builds
        self.is_dirtyreload = False

        # Initialize search index cache
        self.search_index_prev = None

    # Determine whether we're serving the site
    def on_startup(self, *, command, dirty):
        self.is_dirty = dirty

    # Initialize plugin
    def on_config(self, config):
        if not self.config.enabled:
            return

        # Retrieve default value for language
        if not self.config.lang:
            self.config.lang = [self._translate(
                config, "search.config.lang"
            )]

        # Retrieve default value for separator
        if not self.config.separator:
            self.config.separator = self._translate(
                config, "search.config.separator"
            )

        # Retrieve default value for pipeline
        if not self.config.pipeline:
            self.config.pipeline = list(filter(len, re.split(
                r"\s*,\s*", self._translate(config, "search.config.pipeline")
            )))

        # Initialize search index
        self.search_index = SearchIndex(**self.config)

        # Set jieba dictionary, if given - jieba is an optional dependency
        # (see import above), so warn instead of crashing when it's missing
        if self.config.jieba_dict:
            if jieba is None:
                log.warning(
                    "Configuration error for 'search.jieba_dict': "
                    "'jieba' is not installed"
                )
            else:
                path = os.path.normpath(self.config.jieba_dict)
                if os.path.isfile(path):
                    jieba.set_dictionary(path)
                    log.debug(f"Loading jieba dictionary: {path}")
                else:
                    log.warning(
                        f"Configuration error for 'search.jieba_dict': "
                        f"'{self.config.jieba_dict}' does not exist."
                    )

        # Set jieba user dictionary, if given - same guard as above
        if self.config.jieba_dict_user:
            if jieba is None:
                log.warning(
                    "Configuration error for 'search.jieba_dict_user': "
                    "'jieba' is not installed"
                )
            else:
                path = os.path.normpath(self.config.jieba_dict_user)
                if os.path.isfile(path):
                    jieba.load_userdict(path)
                    log.debug(f"Loading jieba user dictionary: {path}")
                else:
                    log.warning(
                        f"Configuration error for 'search.jieba_dict_user': "
                        f"'{self.config.jieba_dict_user}' does not exist."
                    )

    # Add page to search index
    def on_page_context(self, context, *, page, config, nav):
        if not self.config.enabled:
            return

        # Index page, then strip the search-related data attributes from the
        # rendered output, as they're only needed at indexing time
        self.search_index.add_entry_from_context(page)
        page.content = re.sub(
            r"\s?data-search-\w+=\"[^\"]+\"",
            "",
            page.content
        )

    # Generate search index
    def on_post_build(self, *, config):
        if not self.config.enabled:
            return

        # Write search index
        base = os.path.join(config.site_dir, "search")
        path = os.path.join(base, "search_index.json")

        # Generate and write search index to file
        data = self.search_index.generate_search_index(self.search_index_prev)
        utils.write_file(data.encode("utf-8"), path)

        # Persist search index for repeated invocation
        if self.is_dirty:
            self.search_index_prev = self.search_index

    # Determine whether we're running under dirty reload
    def on_serve(self, server, *, config, builder):
        self.is_dirtyreload = self.is_dirty

    # -------------------------------------------------------------------------

    # Translate the given placeholder value
    def _translate(self, config, value):
        env = config.theme.get_env()

        # Load language template and return translation for placeholder
        language = "partials/language.html"
        template = env.get_template(language, None, { "config": config })
        return template.module.t(value)

# -----------------------------------------------------------------------------

# Search index with support for additional fields
class SearchIndex:

    # Initialize search index
    def __init__(self, **config):
        self.config = config
        self.entries = []

    # Add page to search index
    def add_entry_from_context(self, page):
        search = page.meta.get("search", {})
        if search.get("exclude"):
            return

        # Divide page content into sections
        parser = Parser()
        parser.feed(page.content)
        parser.close()

        # Add sections to index
        for section in parser.data:
            if not section.is_excluded():
                self.create_entry_for_section(section, page.toc, page.url, page)

    # Override: graceful indexing and additional fields
    def create_entry_for_section(self, section, toc, url, page):
        item = self._find_toc_by_id(toc, section.id)
        if item:
            url = url + item.url
        elif section.id:
            url = url + "#" + section.id

        # Set page title as section title if none was given, which happens when
        # the first headline in a Markdown document is not a h1 headline. Also,
        # if a page title was set via front matter, use that even though a h1
        # might be given or the page name was specified in nav in mkdocs.yml
        if not section.title:
            section.title = [str(page.meta.get("title", page.title))]

        # Compute title and text
        title = "".join(section.title).strip()
        text = "".join(section.text).strip()

        # Segment Chinese characters if jieba is available
        if jieba:
            title = self._segment_chinese(title)
            text = self._segment_chinese(text)

        # Create entry for section
        entry = {
            "location": url,
            "title": title,
            "text": text
        }

        # Set document tags - only scalar tag values are indexed
        tags = page.meta.get("tags")
        if isinstance(tags, list):
            entry["tags"] = []
            for name in tags:
                if name and isinstance(name, (str, int, float, bool)):
                    entry["tags"].append(name)

        # Set document boost
        search = page.meta.get("search", {})
        if "boost" in search:
            entry["boost"] = search["boost"]

        # Add entry to index
        self.entries.append(entry)

    # Generate search index
    def generate_search_index(self, prev):
        config = {
            key: self.config[key]
            for key in ["lang", "separator", "pipeline"]
        }

        # Hack: if we're running under dirty reload, the search index will only
        # include the entries for the current page. However, MkDocs > 1.4 allows
        # us to persist plugin state across rebuilds, which is exactly what we
        # do by passing the previously built index to this method. Thus, we just
        # remove the previous entries for the current page, and append the new
        # entries to the end of the index, as order doesn't matter.
        if prev and self.entries:
            path = self.entries[0]["location"]

            # Since we're sure that we're running under dirty reload, the list
            # of entries will only contain sections for a single page. Thus, we
            # use the first entry to remove all entries from the previous run
            # that belong to the current page. The rationale behind this is that
            # authors might add or remove section headers, so we need to make
            # sure that sections are synchronized correctly.
            entries = [
                entry for entry in prev.entries
                if not entry["location"].startswith(path)
            ]

            # Merge previous with current entries
            self.entries = entries + self.entries

        # Otherwise just set previous entries
        if prev and not self.entries:
            self.entries = prev.entries

        # Return search index as JSON
        data = { "config": config, "docs": self.entries }
        return json.dumps(
            data,
            separators = (",", ":"),
            default = str
        )

    # -------------------------------------------------------------------------

    # Retrieve item for anchor
    def _find_toc_by_id(self, toc, id):
        for toc_item in toc:
            if toc_item.id == id:
                return toc_item

            # Recurse into children of item
            toc_item = self._find_toc_by_id(toc_item.children, id)
            if toc_item is not None:
                return toc_item

        # No item found
        return None

    # Find and segment Chinese characters in string
    def _segment_chinese(self, data):
        expr = re.compile(r"(\p{IsHan}+)", re.UNICODE)

        # Replace callback
        def replace(match):
            value = match.group(0)

            # Replace occurrence in original string with segmented version and
            # surround with zero-width whitespace for efficient indexing
            return "".join([
                "\u200b",
                "\u200b".join(jieba.cut(value.encode("utf-8"))),
                "\u200b",
            ])

        # Return string with segmented occurrences
        return expr.sub(replace, data).strip("\u200b")

# -----------------------------------------------------------------------------

# HTML element
class Element:
    """
    An element with attributes, essentially a small wrapper object for the
    parser to access attributes in other callbacks than handle_starttag.
    """

    # Initialize HTML element - note that a mutable default argument is
    # avoided on purpose, as it would be shared between all instances
    def __init__(self, tag, attrs = None):
        self.tag = tag
        self.attrs = {} if attrs is None else attrs

    # String representation
    def __repr__(self):
        return self.tag

    # Support comparison (compare by tag only) - elements compare equal to
    # other elements with the same tag, as well as to plain tag strings
    def __eq__(self, other):
        if isinstance(other, Element):
            return self.tag == other.tag
        else:
            return self.tag == other

    # Support set operations
    def __hash__(self):
        return hash(self.tag)

    # Check whether the element should be excluded
    def is_excluded(self):
        return "data-search-exclude" in self.attrs

# -----------------------------------------------------------------------------

# HTML section
class Section:
    """
    A block of text with markup, preceded by a title (with markup), i.e., a
    headline with a certain level (h1-h6). Internally used by the parser.
    """

    # Initialize HTML section
    def __init__(self, el, depth = 0):
        self.el = el
        self.depth = depth

        # Initialize section data
        self.text = []
        self.title = []
        self.id = None

    # String representation
    def __repr__(self):
        if self.id:
            return "#".join([self.el.tag, self.id])
        else:
            return self.el.tag

    # Check whether the section should be excluded
    def is_excluded(self):
        return self.el.is_excluded()

# -----------------------------------------------------------------------------

# HTML parser
class Parser(HTMLParser):
    """
    This parser divides the given string of HTML into a list of sections, each
    of which are preceded by a h1-h6 level heading. A white- and blacklist of
    tags dictates which tags should be preserved as part of the index, and
    which should be ignored in their entirety.
    """

    # Initialize HTML parser
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Tags to skip
        self.skip = set([
            "object",                  # Objects
            "script",                  # Scripts
            "style"                    # Styles
        ])

        # Tags to keep
        self.keep = set([
            "p",                       # Paragraphs
            "code", "pre",             # Code blocks
            "li", "ol", "ul",          # Lists
            "sub", "sup"               # Sub- and superscripts
        ])

        # Current context and section
        self.context = []
        self.section = None

        # All parsed sections
        self.data = []

    # Called at the start of every HTML tag
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)

        # Ignore self-closing tags
        el = Element(tag, attrs)
        if tag in void:
            return
        self.context.append(el)

        # Handle heading
        if tag in ([f"h{x}" for x in range(1, 7)]):
            depth = len(self.context)
            if "id" in attrs:

                # Ensure top-level section
                if tag != "h1" and not self.data:
                    self.section = Section(Element("hx"), depth)
                    self.data.append(self.section)

                # Set identifier, if not first section
                self.section = Section(el, depth)
                if self.data:
                    self.section.id = attrs["id"]

                # Append section to list
                self.data.append(self.section)

        # Handle preface - ensure top-level section
        if not self.section:
            self.section = Section(Element("hx"))
            self.data.append(self.section)

        # Handle special cases to skip
        for key, value in attrs.items():

            # Skip block if explicitly excluded from search
            if key == "data-search-exclude":
                self.skip.add(el)
                return

            # Skip line numbers - see https://bit.ly/3GvubZx
            if key == "class" and value == "linenodiv":
                self.skip.add(el)
                return

        # Render opening tag if kept
        if not self.skip.intersection(self.context):
            if tag in self.keep:

                # Check whether we're inside the section title
                data = self.section.text
                if self.section.el in self.context:
                    data = self.section.title

                # Append to section title or text
                data.append(f"<{tag}>")

    # Called at the end of every HTML tag
    def handle_endtag(self, tag):
        if not self.context or self.context[-1] != tag:
            return

        # Check whether we're exiting the current context, which happens when
        # a headline is nested in another element. In that case, we close the
        # current section, continuing to append data to the previous section,
        # which could also be a nested section – see https://bit.ly/3IxxIJZ
        if self.section.depth > len(self.context):
            for section in reversed(self.data):
                if section.depth <= len(self.context):

                    # Set depth to infinity in order to denote that the current
                    # section is exited and must never be considered again.
                    self.section.depth = float("inf")
                    self.section = section
                    break

        # Remove element from skip list
        el = self.context.pop()
        if el in self.skip:
            if el.tag not in ["script", "style", "object"]:
                self.skip.remove(el)
            return

        # Render closing tag if kept
        if not self.skip.intersection(self.context):
            if tag in self.keep:

                # Check whether we're inside the section title
                data = self.section.text
                if self.section.el in self.context:
                    data = self.section.title

                # Search for corresponding opening tag
                index = data.index(f"<{tag}>")
                for i in range(index + 1, len(data)):
                    if not data[i].isspace():
                        index = len(data)
                        break

                # Remove element if empty (or only whitespace)
                if len(data) > index:
                    while len(data) > index:
                        data.pop()

                # Append to section title or text
                else:
                    data.append(f"</{tag}>")

    # Called for the text contents of each tag
    def handle_data(self, data):
        if self.skip.intersection(self.context):
            return

        # Collapse whitespace in non-pre contexts
        if not "pre" in self.context:
            if not data.isspace():
                data = data.replace("\n", " ")
            else:
                data = " "

        # Handle preface - ensure top-level section
        if not self.section:
            self.section = Section(Element("hx"))
            self.data.append(self.section)

        # Handle section headline
        if self.section.el in self.context:
            permalink = False
            for el in self.context:
                if el.tag == "a" and el.attrs.get("class") == "headerlink":
                    permalink = True

            # Ignore permalinks
            if not permalink:
                self.section.title.append(
                    escape(data, quote = False)
                )

        # Collapse adjacent whitespace
        elif data.isspace():
            if not self.section.text or not self.section.text[-1].isspace():
                self.section.text.append(data)
            elif "pre" in self.context:
                self.section.text.append(data)

        # Handle everything else
        else:
            self.section.text.append(
                escape(data, quote = False)
            )

# -----------------------------------------------------------------------------
# Data
# -----------------------------------------------------------------------------

# Set up logging
log = logging.getLogger("mkdocs.material.search")

# Tags that are self-closing
void = set([
    "area",                    # Image map areas
    "base",                    # Document base
    "br",                      # Line breaks
    "col",                     # Table columns
    "embed",                   # External content
    "hr",                      # Horizontal rules
    "img",                     # Images
    "input",                   # Input fields
    "link",                    # Links
    "meta",                    # Metadata
    "param",                   # External parameters
    "source",                  # Image source sets
    "track",                   # Text track
    "wbr"                      # Line break opportunities
])
