From be8de118db913711eb72ae5187d26e54a0055727 Mon Sep 17 00:00:00 2001 From: 简律纯 Date: Fri, 15 Dec 2023 09:11:47 +0800 Subject: refactor(docs): optmst `docs` dir & `deps` --- src/plugins/search/__init__.py | 19 -- src/plugins/search/config.py | 58 ----- src/plugins/search/plugin.py | 580 ----------------------------------------- 3 files changed, 657 deletions(-) delete mode 100644 src/plugins/search/__init__.py delete mode 100644 src/plugins/search/config.py delete mode 100644 src/plugins/search/plugin.py (limited to 'src/plugins/search') diff --git a/src/plugins/search/__init__.py b/src/plugins/search/__init__.py deleted file mode 100644 index d1899378..00000000 --- a/src/plugins/search/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2016-2023 Martin Donath - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. diff --git a/src/plugins/search/config.py b/src/plugins/search/config.py deleted file mode 100644 index e150fbb3..00000000 --- a/src/plugins/search/config.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2016-2023 Martin Donath - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. - -from mkdocs.config.config_options import ( - Choice, - Deprecated, - Optional, - ListOfItems, - Type -) -from mkdocs.config.base import Config -from mkdocs.contrib.search import LangOption - -# ----------------------------------------------------------------------------- -# Options -# ----------------------------------------------------------------------------- - -# Options for search pipeline -pipeline = ("stemmer", "stopWordFilter", "trimmer") - -# ----------------------------------------------------------------------------- -# Classes -# ----------------------------------------------------------------------------- - -# Search plugin configuration -class SearchConfig(Config): - enabled = Type(bool, default = True) - - # Settings for search - lang = Optional(LangOption()) - separator = Optional(Type(str)) - pipeline = ListOfItems(Choice(pipeline), default = []) - - # Settings for text segmentation (Chinese) - jieba_dict = Optional(Type(str)) - jieba_dict_user = Optional(Type(str)) - - # Unsupported settings, originally implemented in MkDocs - indexing = Deprecated(message = "Unsupported option") - prebuild_index = Deprecated(message = "Unsupported option") - min_search_length = Deprecated(message = "Unsupported option") diff --git a/src/plugins/search/plugin.py b/src/plugins/search/plugin.py deleted file mode 100644 index 5c254e3f..00000000 --- a/src/plugins/search/plugin.py +++ /dev/null @@ -1,580 +0,0 @@ -# Copyright (c) 2016-2023 Martin Donath - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. - -import json -import logging -import os -import regex as re - -from html import escape -from html.parser import HTMLParser -from mkdocs import utils -from mkdocs.plugins import BasePlugin - -from .config import SearchConfig - -try: - import jieba -except ImportError: - jieba = None - -# ----------------------------------------------------------------------------- -# Classes -# ----------------------------------------------------------------------------- - -# Search plugin -class SearchPlugin(BasePlugin[SearchConfig]): - - # Initialize plugin - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # Initialize incremental builds - self.is_dirtyreload = False - - # Initialize search index cache - self.search_index_prev = None - - # Determine whether we're serving the site - def on_startup(self, *, command, dirty): - self.is_dirty = dirty - - # Initialize plugin - def on_config(self, config): - if not self.config.enabled: - return - - # Retrieve default value for language - if not self.config.lang: - self.config.lang = [self._translate( - config, "search.config.lang" - )] - - # Retrieve default value for separator - if not self.config.separator: - self.config.separator = self._translate( - config, "search.config.separator" - ) - - # Retrieve default value for pipeline - if not self.config.pipeline: - self.config.pipeline = list(filter(len, re.split( - r"\s*,\s*", self._translate(config, "search.config.pipeline") - ))) - - # Initialize search index - self.search_index = SearchIndex(**self.config) - - # Set jieba dictionary, if given - if self.config.jieba_dict: - path = os.path.normpath(self.config.jieba_dict) - if os.path.isfile(path): - jieba.set_dictionary(path) - log.debug(f"Loading jieba dictionary: {path}") - else: - log.warning( - f"Configuration error for 'search.jieba_dict': " - f"'{self.config.jieba_dict}' does not exist." - ) - - # Set jieba user dictionary, if given - if self.config.jieba_dict_user: - path = os.path.normpath(self.config.jieba_dict_user) - if os.path.isfile(path): - jieba.load_userdict(path) - log.debug(f"Loading jieba user dictionary: {path}") - else: - log.warning( - f"Configuration error for 'search.jieba_dict_user': " - f"'{self.config.jieba_dict_user}' does not exist." - ) - - # Add page to search index - def on_page_context(self, context, *, page, config, nav): - if not self.config.enabled: - return - - # Index page - self.search_index.add_entry_from_context(page) - page.content = re.sub( - r"\s?data-search-\w+=\"[^\"]+\"", - "", - page.content - ) - - # Generate search index - def on_post_build(self, *, config): - if not self.config.enabled: - return - - # Write search index - base = os.path.join(config.site_dir, "search") - path = os.path.join(base, "search_index.json") - - # Generate and write search index to file - data = self.search_index.generate_search_index(self.search_index_prev) - utils.write_file(data.encode("utf-8"), path) - - # Persist search index for repeated invocation - if self.is_dirty: - self.search_index_prev = self.search_index - - # Determine whether we're running under dirty reload - def on_serve(self, server, *, config, builder): - self.is_dirtyreload = self.is_dirty - - # ------------------------------------------------------------------------- - - # Translate the given placeholder value - def _translate(self, config, value): - env = config.theme.get_env() - - # Load language template and return translation for placeholder - language = "partials/language.html" - template = env.get_template(language, None, { "config": config }) - return template.module.t(value) - -# ----------------------------------------------------------------------------- - -# Search index with support for additional fields -class SearchIndex: - - # Initialize search index - def __init__(self, **config): - self.config = config - self.entries = [] - - # Add page to search index - def add_entry_from_context(self, page): - search = page.meta.get("search", {}) - if search.get("exclude"): - return - - # Divide page content into sections - parser = Parser() - parser.feed(page.content) - parser.close() - - # Add sections to index - for section in parser.data: - if not section.is_excluded(): - self.create_entry_for_section(section, page.toc, page.url, page) - - # Override: graceful indexing and additional fields - def create_entry_for_section(self, section, toc, url, page): - item = self._find_toc_by_id(toc, section.id) - if item: - url = url + item.url - elif section.id: - url = url + "#" + section.id - - # Set page title as section title if none was given, which happens when - # the first headline in a Markdown document is not a h1 headline. Also, - # if a page title was set via front matter, use that even though a h1 - # might be given or the page name was specified in nav in mkdocs.yml - if not section.title: - section.title = [str(page.meta.get("title", page.title))] - - # Compute title and text - title = "".join(section.title).strip() - text = "".join(section.text).strip() - - # Segment Chinese characters if jieba is available - if jieba: - title = self._segment_chinese(title) - text = self._segment_chinese(text) - - # Create entry for section - entry = { - "location": url, - "title": title, - "text": text - } - - # Set document tags - tags = page.meta.get("tags") - if isinstance(tags, list): - entry["tags"] = [] - for name in tags: - if name and isinstance(name, (str, int, float, bool)): - entry["tags"].append(name) - - # Set document boost - search = page.meta.get("search", {}) - if "boost" in search: - entry["boost"] = search["boost"] - - # Add entry to index - self.entries.append(entry) - - # Generate search index - def generate_search_index(self, prev): - config = { - key: self.config[key] - for key in ["lang", "separator", "pipeline"] - } - - # Hack: if we're running under dirty reload, the search index will only - # include the entries for the current page. However, MkDocs > 1.4 allows - # us to persist plugin state across rebuilds, which is exactly what we - # do by passing the previously built index to this method. Thus, we just - # remove the previous entries for the current page, and append the new - # entries to the end of the index, as order doesn't matter. - if prev and self.entries: - path = self.entries[0]["location"] - - # Since we're sure that we're running under dirty reload, the list - # of entries will only contain sections for a single page. Thus, we - # use the first entry to remove all entries from the previous run - # that belong to the current page. The rationale behind this is that - # authors might add or remove section headers, so we need to make - # sure that sections are synchronized correctly. - entries = [ - entry for entry in prev.entries - if not entry["location"].startswith(path) - ] - - # Merge previous with current entries - self.entries = entries + self.entries - - # Otherwise just set previous entries - if prev and not self.entries: - self.entries = prev.entries - - # Return search index as JSON - data = { "config": config, "docs": self.entries } - return json.dumps( - data, - separators = (",", ":"), - default = str - ) - - # ------------------------------------------------------------------------- - - # Retrieve item for anchor - def _find_toc_by_id(self, toc, id): - for toc_item in toc: - if toc_item.id == id: - return toc_item - - # Recurse into children of item - toc_item = self._find_toc_by_id(toc_item.children, id) - if toc_item is not None: - return toc_item - - # No item found - return None - - # Find and segment Chinese characters in string - def _segment_chinese(self, data): - expr = re.compile(r"(\p{IsHan}+)", re.UNICODE) - - # Replace callback - def replace(match): - value = match.group(0) - - # Replace occurrence in original string with segmented version and - # surround with zero-width whitespace for efficient indexing - return "".join([ - "\u200b", - "\u200b".join(jieba.cut(value.encode("utf-8"))), - "\u200b", - ]) - - # Return string with segmented occurrences - return expr.sub(replace, data).strip("\u200b") - -# ----------------------------------------------------------------------------- - -# HTML element -class Element: - """ - An element with attributes, essentially a small wrapper object for the - parser to access attributes in other callbacks than handle_starttag. - """ - - # Initialize HTML element - def __init__(self, tag, attrs = {}): - self.tag = tag - self.attrs = attrs - - # String representation - def __repr__(self): - return self.tag - - # Support comparison (compare by tag only) - def __eq__(self, other): - if other is Element: - return self.tag == other.tag - else: - return self.tag == other - - # Support set operations - def __hash__(self): - return hash(self.tag) - - # Check whether the element should be excluded - def is_excluded(self): - return "data-search-exclude" in self.attrs - -# ----------------------------------------------------------------------------- - -# HTML section -class Section: - """ - A block of text with markup, preceded by a title (with markup), i.e., a - headline with a certain level (h1-h6). Internally used by the parser. - """ - - # Initialize HTML section - def __init__(self, el, depth = 0): - self.el = el - self.depth = depth - - # Initialize section data - self.text = [] - self.title = [] - self.id = None - - # String representation - def __repr__(self): - if self.id: - return "#".join([self.el.tag, self.id]) - else: - return self.el.tag - - # Check whether the section should be excluded - def is_excluded(self): - return self.el.is_excluded() - -# ----------------------------------------------------------------------------- - -# HTML parser -class Parser(HTMLParser): - """ - This parser divides the given string of HTML into a list of sections, each - of which are preceded by a h1-h6 level heading. A white- and blacklist of - tags dictates which tags should be preserved as part of the index, and - which should be ignored in their entirety. - """ - - # Initialize HTML parser - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # Tags to skip - self.skip = set([ - "object", # Objects - "script", # Scripts - "style" # Styles - ]) - - # Tags to keep - self.keep = set([ - "p", # Paragraphs - "code", "pre", # Code blocks - "li", "ol", "ul", # Lists - "sub", "sup" # Sub- and superscripts - ]) - - # Current context and section - self.context = [] - self.section = None - - # All parsed sections - self.data = [] - - # Called at the start of every HTML tag - def handle_starttag(self, tag, attrs): - attrs = dict(attrs) - - # Ignore self-closing tags - el = Element(tag, attrs) - if not tag in void: - self.context.append(el) - else: - return - - # Handle heading - if tag in ([f"h{x}" for x in range(1, 7)]): - depth = len(self.context) - if "id" in attrs: - - # Ensure top-level section - if tag != "h1" and not self.data: - self.section = Section(Element("hx"), depth) - self.data.append(self.section) - - # Set identifier, if not first section - self.section = Section(el, depth) - if self.data: - self.section.id = attrs["id"] - - # Append section to list - self.data.append(self.section) - - # Handle preface - ensure top-level section - if not self.section: - self.section = Section(Element("hx")) - self.data.append(self.section) - - # Handle special cases to skip - for key, value in attrs.items(): - - # Skip block if explicitly excluded from search - if key == "data-search-exclude": - self.skip.add(el) - return - - # Skip line numbers - see https://bit.ly/3GvubZx - if key == "class" and value == "linenodiv": - self.skip.add(el) - return - - # Render opening tag if kept - if not self.skip.intersection(self.context): - if tag in self.keep: - - # Check whether we're inside the section title - data = self.section.text - if self.section.el in self.context: - data = self.section.title - - # Append to section title or text - data.append(f"<{tag}>") - - # Called at the end of every HTML tag - def handle_endtag(self, tag): - if not self.context or self.context[-1] != tag: - return - - # Check whether we're exiting the current context, which happens when - # a headline is nested in another element. In that case, we close the - # current section, continuing to append data to the previous section, - # which could also be a nested section – see https://bit.ly/3IxxIJZ - if self.section.depth > len(self.context): - for section in reversed(self.data): - if section.depth <= len(self.context): - - # Set depth to infinity in order to denote that the current - # section is exited and must never be considered again. - self.section.depth = float("inf") - self.section = section - break - - # Remove element from skip list - el = self.context.pop() - if el in self.skip: - if el.tag not in ["script", "style", "object"]: - self.skip.remove(el) - return - - # Render closing tag if kept - if not self.skip.intersection(self.context): - if tag in self.keep: - - # Check whether we're inside the section title - data = self.section.text - if self.section.el in self.context: - data = self.section.title - - # Search for corresponding opening tag - index = data.index(f"<{tag}>") - for i in range(index + 1, len(data)): - if not data[i].isspace(): - index = len(data) - break - - # Remove element if empty (or only whitespace) - if len(data) > index: - while len(data) > index: - data.pop() - - # Append to section title or text - else: - data.append(f"") - - # Called for the text contents of each tag - def handle_data(self, data): - if self.skip.intersection(self.context): - return - - # Collapse whitespace in non-pre contexts - if not "pre" in self.context: - if not data.isspace(): - data = data.replace("\n", " ") - else: - data = " " - - # Handle preface - ensure top-level section - if not self.section: - self.section = Section(Element("hx")) - self.data.append(self.section) - - # Handle section headline - if self.section.el in self.context: - permalink = False - for el in self.context: - if el.tag == "a" and el.attrs.get("class") == "headerlink": - permalink = True - - # Ignore permalinks - if not permalink: - self.section.title.append( - escape(data, quote = False) - ) - - # Collapse adjacent whitespace - elif data.isspace(): - if not self.section.text or not self.section.text[-1].isspace(): - self.section.text.append(data) - elif "pre" in self.context: - self.section.text.append(data) - - # Handle everything else - else: - self.section.text.append( - escape(data, quote = False) - ) - -# ----------------------------------------------------------------------------- -# Data -# ----------------------------------------------------------------------------- - -# Set up logging -log = logging.getLogger("mkdocs.material.search") - -# Tags that are self-closing -void = set([ - "area", # Image map areas - "base", # Document base - "br", # Line breaks - "col", # Table columns - "embed", # External content - "hr", # Horizontal rules - "img", # Images - "input", # Input fields - "link", # Links - "meta", # Metadata - "param", # External parameters - "source", # Image source sets - "track", # Text track - "wbr" # Line break opportunities -]) -- cgit v1.2.3-70-g09d2