From a2ee372ed0dbb686ce5d252f945d4831b68b4bd5 Mon Sep 17 00:00:00 2001 From: Sunshine Date: Wed, 5 Jun 2024 07:31:36 +0800 Subject: [PATCH] Implement "fallback" searching strategy and remove dedicated Chinese search support. --- Cargo.lock | 40 - Cargo.toml | 5 +- guide/src/format/configuration/general.md | 3 +- guide/src/format/configuration/renderers.md | 3 +- src/renderer/html_handlebars/hbs_renderer.rs | 8 +- src/renderer/html_handlebars/search.rs | 173 ++-- src/renderer/html_handlebars/search/lang.rs | 48 + src/theme/css/chrome.css | 4 + src/theme/searcher/languages/lunr.zh.js | 145 --- src/theme/searcher/mod.rs | 2 +- src/theme/searcher/searcher.fallback.js | 934 +++++++++++++++++++ src/utils/mod.rs | 13 +- 12 files changed, 1126 insertions(+), 252 deletions(-) create mode 100644 src/renderer/html_handlebars/search/lang.rs delete mode 100644 src/theme/searcher/languages/lunr.zh.js create mode 100644 src/theme/searcher/searcher.fallback.js diff --git a/Cargo.lock b/Cargo.lock index 1986f598c0..7859cf3dec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -248,15 +248,6 @@ version = "1.0.97" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "099a5357d84c4c61eb35fc8eafa9a79a902c2f76911e5747ced4e032edd8d9b4" -[[package]] -name = "cedarwood" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" -dependencies = [ - "smallvec", -] - [[package]] name = "cfg-if" version = "1.0.0" @@ -506,7 +497,6 @@ version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41e83863a500656dfa214fee6682de9c5b9f03de6860fec531235ed2ae9f6571" dependencies = [ - "jieba-rs", "lindera", "lindera-core", "regex", @@ -757,15 +747,6 @@ dependencies = [ "slab", ] -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - [[package]] name = "generic-array" version = "0.14.7" @@ -1106,21 +1087,6 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" -[[package]] -name = "jieba-rs" -version = "0.6.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f0c1347cd3ac8d7c6e3a2dc33ac496d365cf09fc0831aa61111e1a6738983e" -dependencies = [ - "cedarwood", - "fxhash", - "hashbrown 0.14.5", - "lazy_static", - "phf 0.11.2", - "phf_codegen 0.11.2", - "regex", -] - [[package]] name = "js-sys" version = "0.3.69" @@ -1150,12 +1116,6 @@ dependencies = [ "libc", ] -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - [[package]] name = "libc" version = "0.2.154" diff --git a/Cargo.toml b/Cargo.toml index 7c866ab23f..1aced5975b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,7 +65,10 @@ default = ["watch", "serve", "search"] watch = ["dep:notify", "dep:notify-debouncer-mini", "dep:ignore", "dep:pathdiff", "dep:walkdir"] serve = ["dep:futures-util", "dep:tokio", "dep:warp"] search = ["dep:elasticlunr-rs", "dep:ammonia"] -search-non-english = ["search", "elasticlunr-rs/languages"] +search-non-english = ["search", "elasticlunr-rs/ar", "elasticlunr-rs/da", "elasticlunr-rs/de", "elasticlunr-rs/du", + "elasticlunr-rs/es", "elasticlunr-rs/fi", "elasticlunr-rs/fr", "elasticlunr-rs/hu", "elasticlunr-rs/it", + "elasticlunr-rs/ja", "elasticlunr-rs/ko", "elasticlunr-rs/no", "elasticlunr-rs/pt", "elasticlunr-rs/ro", + "elasticlunr-rs/ru", "elasticlunr-rs/sv", "elasticlunr-rs/tr"] [[bin]] doc = false diff --git a/guide/src/format/configuration/general.md b/guide/src/format/configuration/general.md index 3a10898c1f..3930219cf3 100644 
--- a/guide/src/format/configuration/general.md +++ b/guide/src/format/configuration/general.md @@ -47,7 +47,8 @@ This is general information about your book. key in the configuration file. - **language:** The main language of the book, which is used as a language attribute `<html lang="en">` for example. This is also used to derive the direction of text (RTL, LTR) within the book. - When `search-non-english` feature is enabled, this may change the behavior of the search functionality provided by the HTML renderer. + When it is set to a non-English language, an alternative indexing / searching strategy is applied to the search functionality provided by the HTML renderer. + When the `search-non-english` feature is enabled, additional language-specific search support may be used. - **text-direction**: The direction of text in the book: Left-to-right (LTR) or Right-to-left (RTL). Possible values: `ltr`, `rtl`. When not specified, the text direction is derived from the book's `language` attribute. diff --git a/guide/src/format/configuration/renderers.md b/guide/src/format/configuration/renderers.md index 04d9912b95..2d6f000d56 100644 --- a/guide/src/format/configuration/renderers.md +++ b/guide/src/format/configuration/renderers.md @@ -263,7 +263,8 @@ copy-js = true # include Javascript code for search - **enable:** Enables the search feature. Defaults to `true`. - **limit-results:** The maximum number of search results. Defaults to `30`. - **teaser-word-count:** The number of words used for a search result teaser. - Defaults to `30`. + When `book.language` is set to a non-English language, this limit may + be exceeded when many keywords are matched. Defaults to `30`. - **use-boolean-and:** Define the logical link between multiple search words. If true, all search words must appear in each result. Defaults to `false`. 
- **boost-title:** Boost factor for the search result score if a search word diff --git a/src/renderer/html_handlebars/hbs_renderer.rs b/src/renderer/html_handlebars/hbs_renderer.rs index 606e87c65b..6d8573722c 100644 --- a/src/renderer/html_handlebars/hbs_renderer.rs +++ b/src/renderer/html_handlebars/hbs_renderer.rs @@ -544,10 +544,10 @@ impl Renderer for HtmlHandlebars { { let search = html_config.search.clone().unwrap_or_default(); if search.enable { - let language = book_config - .language - .as_deref() - .and_then(|lang| lang.parse().ok()); + let language = match book_config.language.as_deref() { + None => Err("en".to_string()), + Some(language) => language.parse(), + }; #[allow(unused_variables)] let extra_language_subtag = super::search::create_files(&search, language, destination, book)?; diff --git a/src/renderer/html_handlebars/search.rs b/src/renderer/html_handlebars/search.rs index 66a733a0db..8fcb501476 100644 --- a/src/renderer/html_handlebars/search.rs +++ b/src/renderer/html_handlebars/search.rs @@ -1,3 +1,5 @@ +mod lang; + use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::fmt::Display; @@ -32,9 +34,8 @@ fn tokenize(text: &str) -> Vec { /// Languages that wouldn't work with the current feature flag config are included. #[derive(Debug, Copy, Clone)] #[non_exhaustive] -pub enum SupportedNonEnglishLanguage { +pub(crate) enum ExtraSupportedLanguage { Arabic, - Chinese, Danish, Dutch, Finnish, @@ -53,22 +54,21 @@ pub enum SupportedNonEnglishLanguage { Turkish, } -impl FromStr for SupportedNonEnglishLanguage { - type Err = (); +impl FromStr for ExtraSupportedLanguage { + type Err = String; /// A language tag can be like "zh" / "zh-CN" / "zh-Hans" / "zh-Hans-CN") /// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/lang#language_tag_syntax + /// if the language doesn't have extra support, `Err` is returned with the language subtag. 
fn from_str(language_tag: &str) -> Result { - use SupportedNonEnglishLanguage::*; - match language_tag + use ExtraSupportedLanguage::*; + let language_subtag = language_tag .split('-') .next() .expect("splitting a string always returns at least 1 fragment") - .to_ascii_lowercase() - .as_str() - { + .to_ascii_lowercase(); + match language_subtag.as_str() { "ar" => Ok(Arabic), - "zh" => Ok(Chinese), "da" => Ok(Danish), "nl" => Ok(Dutch), "fi" => Ok(Finnish), @@ -85,22 +85,21 @@ impl FromStr for SupportedNonEnglishLanguage { "es" => Ok(Spanish), "sv" => Ok(Swedish), "tr" => Ok(Turkish), - _ => Err(()), + _ => Err(language_subtag), } } } -impl TryFrom for Box { +impl TryFrom for Box { type Error = (); #[cfg(feature = "search-non-english")] /// Returns `Ok` if and only if `language.lunr_js_content()` returns `Some`. - fn try_from(language: SupportedNonEnglishLanguage) -> std::result::Result { + fn try_from(language: ExtraSupportedLanguage) -> std::result::Result { use elasticlunr::lang as el; - use SupportedNonEnglishLanguage::*; + use ExtraSupportedLanguage::*; match language { Arabic => Ok(Box::new(el::Arabic::new())), - Chinese => Ok(Box::new(el::Chinese::new())), Danish => Ok(Box::new(el::Danish::new())), Dutch => Ok(Box::new(el::Dutch::new())), Finnish => Ok(Box::new(el::Finnish::new())), @@ -121,18 +120,17 @@ impl TryFrom for Box { } #[cfg(not(feature = "search-non-english"))] - fn try_from(_: SupportedNonEnglishLanguage) -> std::result::Result { + fn try_from(_: ExtraSupportedLanguage) -> std::result::Result { Err(()) } } -impl Display for SupportedNonEnglishLanguage { - /// Displays as language subtag (e.g. "zh" for Chinese). +impl Display for ExtraSupportedLanguage { + /// Displays as language subtag (e.g. "de" for German). 
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - use SupportedNonEnglishLanguage::*; + use ExtraSupportedLanguage::*; f.write_str(match self { Arabic => "ar", - Chinese => "zh", Danish => "da", Dutch => "nl", Finnish => "fi", @@ -154,14 +152,13 @@ impl Display for SupportedNonEnglishLanguage { } #[cfg(feature = "search-non-english")] -impl SupportedNonEnglishLanguage { +impl ExtraSupportedLanguage { /// Returns `Some` if and only if `self.try_into::>()` returns `Ok`. - pub(crate) fn lunr_js_content(self) -> Option<&'static [u8]> { + pub fn lunr_js_content(self) -> Option<&'static [u8]> { use searcher::lang::*; - use SupportedNonEnglishLanguage::*; + use ExtraSupportedLanguage::*; match self { Arabic => Some(ARABIC_JS), - Chinese => Some(CHINESE_JS), Danish => Some(DANISH_JS), Dutch => Some(DUTCH_JS), Finnish => Some(FINNISH_JS), @@ -185,40 +182,53 @@ impl SupportedNonEnglishLanguage { /// Creates all files required for search. /// Returns the language subtag if extra `lunr.stemmer.support.js` & /// `lunr.*.js` files should be imported. -/// E.g., returns "zh" when `lunr.stemmer.support.js` & `lunr.zh.js` should be imported. +/// E.g., returns "ja" when `lunr.stemmer.support.js` & `lunr.ja.js` should be imported. 
pub fn create_files( search_config: &Search, - language: Option, + language: Result, destination: &Path, book: &Book, ) -> Result> { - #[allow(unused_variables)] - let (mut index, extra_language_subtag) = match language.and_then(|l| l.try_into().ok()) { - None => { - if let Some(non_english_language) = language { + let potentially_supported_language = language.as_ref().ok().copied(); + let (mut index, extra_language_subtag, use_fallback); + + match language.and_then(|l| l.try_into().map_err(|_| l.to_string())) { + Err(subtag) => { + if let Some(language) = potentially_supported_language { warn!( - "mdBook compiled without {non_english_language:?}(`{non_english_language}`) \ - search support though it's available" + "mdBook compiled without {language:?}(`{language}`) \ + search support though it's available" ); warn!( "please reinstall with `cargo install mdbook --force --features \ - search-non-english`" + search-non-english`" ); - warn!("to enable {non_english_language:?} search support") + warn!("to enable {language:?} search support") } - ( - IndexBuilder::new() - .add_field_with_tokenizer("title", Box::new(&tokenize)) - .add_field_with_tokenizer("body", Box::new(&tokenize)) - .add_field_with_tokenizer("breadcrumbs", Box::new(&tokenize)) - .build(), - None, - ) + match subtag.as_str() { + "en" => { + index = IndexBuilder::new() + .add_field_with_tokenizer("title", Box::new(&tokenize)) + .add_field_with_tokenizer("body", Box::new(&tokenize)) + .add_field_with_tokenizer("breadcrumbs", Box::new(&tokenize)) + .build(); + use_fallback = false; + } + _ => { + index = Index::with_language( + Box::new(lang::Fallback::new()), + &["title", "body", "breadcrumbs"], + ); + use_fallback = true; + } + }; + extra_language_subtag = None; + } + Ok(elasticlunr_language) => { + index = Index::with_language(elasticlunr_language, &["title", "body", "breadcrumbs"]); + extra_language_subtag = potentially_supported_language.map(|l| l.to_string()); + use_fallback = false; } - 
Some(elasticlunr_language) => ( - Index::with_language(elasticlunr_language, &["title", "body", "breadcrumbs"]), - language.map(|l| l.to_string()), - ), }; let mut doc_urls = Vec::with_capacity(book.sections.len()); @@ -240,7 +250,15 @@ pub fn create_files( "searchindex.js", format!("Object.assign(window.search, {});", index).as_bytes(), )?; - utils::fs::write_file(destination, "searcher.js", searcher::JS)?; + utils::fs::write_file( + destination, + "searcher.js", + if use_fallback { + searcher::FALLBACK_JS + } else { + searcher::JS + }, + )?; utils::fs::write_file(destination, "mark.min.js", searcher::MARK_JS)?; utils::fs::write_file(destination, "elasticlunr.min.js", searcher::ELASTICLUNR_JS)?; #[cfg(feature = "search-non-english")] @@ -363,8 +381,9 @@ fn render_item( breadcrumbs.push(heading.clone()); } Event::Start(Tag::FootnoteDefinition(name)) => { - let number = footnote_numbers.len() + 1; - footnote_numbers.entry(name).or_insert(number); + let len = footnote_numbers.len() + 1; + let number = footnote_numbers.entry(name).or_insert(len); + body.push_str(&format!("[^{}]: ", number)) } Event::Html(html) => { let mut html_block = html.into_string(); @@ -389,15 +408,57 @@ fn render_item( // blocks, and worse case you have some noise in the index. body.push_str(&clean_html(&html)); } - Event::Start(_) | Event::End(_) | Event::Rule | Event::SoftBreak | Event::HardBreak => { - // Insert spaces where HTML output would usually separate text - // to ensure words don't get merged together - if in_heading { - heading.push(' '); - } else { - body.push(' '); + // Insert spaces where HTML output would usually separate text + // to ensure words don't get merged together + Event::Start(tag) => { + let target = if in_heading { &mut heading } else { &mut body }; + match tag { + Tag::Paragraph + | Tag::Heading { .. 
} + | Tag::BlockQuote + | Tag::CodeBlock(_) + | Tag::HtmlBlock + | Tag::List(_) + | Tag::Table(_) + | Tag::TableHead + | Tag::TableRow + | Tag::TableCell + | Tag::Emphasis + | Tag::Strong + | Tag::Link { .. } + | Tag::MetadataBlock(_) => {} + Tag::Item => target.push_str("* "), + Tag::Strikethrough => target.push_str("~~"), + Tag::Image { .. } => target.push_str("[image: "), + Tag::FootnoteDefinition(_) => unreachable!(), + } + } + Event::End(tag_end) => { + let target = if in_heading { &mut heading } else { &mut body }; + match tag_end { + TagEnd::Paragraph + | TagEnd::Heading(_) + | TagEnd::BlockQuote + | TagEnd::CodeBlock + | TagEnd::Item + | TagEnd::TableHead + | TagEnd::TableRow => target.push_str("\n"), + TagEnd::HtmlBlock + | TagEnd::List(_) + | TagEnd::FootnoteDefinition + | TagEnd::Table + | TagEnd::Emphasis + | TagEnd::Strong + | TagEnd::Link + | TagEnd::MetadataBlock(_) => {} + TagEnd::TableCell => target.push('\t'), + TagEnd::Strikethrough => target.push_str("~~"), + TagEnd::Image => target.push(']'), } } + Event::Rule => {} + Event::SoftBreak => body.push(' '), + Event::HardBreak => body.push('\n'), Event::Text(text) | Event::Code(text) => { if in_heading { heading.push_str(&text); diff --git a/src/renderer/html_handlebars/search/lang.rs b/src/renderer/html_handlebars/search/lang.rs new file mode 100644 index 0000000000..33f877729a --- /dev/null +++ b/src/renderer/html_handlebars/search/lang.rs @@ -0,0 +1,48 @@ +use crate::renderer::html_handlebars::search::MAX_WORD_LENGTH_TO_INDEX; +use elasticlunr::lang::English; +use elasticlunr::Pipeline; +use once_cell::sync::OnceCell; +use regex::Regex; + +pub struct Fallback { + tokenize_regex: &'static Regex, + english: &'static English, +} + +impl Fallback { + pub fn new() -> Self { + static TOKENIZE_REGEX: OnceCell = OnceCell::new(); + static ENGLISH: OnceCell = OnceCell::new(); + Self { + tokenize_regex: TOKENIZE_REGEX.get_or_init(|| Regex::new( + 
r"[\p{Unified_Ideograph}\p{Hangul}]|[^\p{White_Space}\p{P}\p{Sm}\p{CurrencySymbol}\p{So}\p{Unified_Ideograph}\p{Hangul}\p{Z}\p{C}]+|\p{So}\p{Sk}?(\u200D\p{So}\p{Sk}?)*" + ).unwrap()), + english: ENGLISH.get_or_init(English::new), + } + } +} + +impl elasticlunr::Language for Fallback { + fn name(&self) -> String { + "English, Chinese, Japanese, Korean, Vietnamese".into() + } + + fn code(&self) -> String { + "en".into() + } + + fn tokenize(&self, text: &str) -> Vec { + self.tokenize_regex + .find_iter(text) + .map(|s| s.as_str()) + .filter(|s| s.len() <= MAX_WORD_LENGTH_TO_INDEX) + .map(|s| s.to_lowercase()) + .collect() + } + + fn make_pipeline(&self) -> Pipeline { + let mut pipeline = self.english.make_pipeline(); + pipeline.queue.drain(0..2); + pipeline + } +} diff --git a/src/theme/css/chrome.css b/src/theme/css/chrome.css index 83b7969bce..0c23d1a9b4 100644 --- a/src/theme/css/chrome.css +++ b/src/theme/css/chrome.css @@ -370,6 +370,10 @@ ul#searchresults li { ul#searchresults li.focus { background-color: var(--searchresults-li-bg); } +ul#searchresults li a em { + font-weight: bold; + font-style: normal; +} ul#searchresults span.teaser { display: block; clear: both; diff --git a/src/theme/searcher/languages/lunr.zh.js b/src/theme/searcher/languages/lunr.zh.js deleted file mode 100644 index 48f5890d96..0000000000 --- a/src/theme/searcher/languages/lunr.zh.js +++ /dev/null @@ -1,145 +0,0 @@ -/*! - * Lunr languages, `Chinese` language - * https://github.com/MihaiValentin/lunr-languages - * - * Copyright 2019, Felix Lian (repairearth) - * http://www.mozilla.org/MPL/ - */ -/*! 
- * based on - * Snowball zhvaScript Library v0.3 - * http://code.google.com/p/urim/ - * http://snowball.tartarus.org/ - * - * Copyright 2010, Oleg Mazko - * http://www.mozilla.org/MPL/ - */ - -/** - * export the module via AMD, CommonJS or as a browser global - * Export code from https://github.com/umdjs/umd/blob/master/returnExports.js - */ -; -(function(root, factory) { - if (typeof define === 'function' && define.amd) { - // AMD. Register as an anonymous module. - define(factory) - } else if (typeof exports === 'object') { - /** - * Node. Does not work with strict CommonJS, but - * only CommonJS-like environments that support module.exports, - * like Node. - */ - module.exports = factory(require('@node-rs/jieba')) - } else { - // Browser globals (root is window) - factory()(root.lunr); - } -}(this, function(nodejieba) { - /** - * Just return a value to define the module export. - * This example returns an object, but the module - * can return a function as the exported value. - */ - return function(lunr, nodejiebaDictJson) { - /* throw error if lunr is not yet included */ - if ('undefined' === typeof lunr) { - throw new Error('Lunr is not present. Please include / require Lunr before this script.'); - } - - /* throw error if lunr stemmer support is not yet included */ - if ('undefined' === typeof lunr.stemmerSupport) { - throw new Error('Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.'); - } - - /* - Chinese tokenization is trickier, since it does not - take into account spaces. 
- Since the tokenization function is represented different - internally for each of the Lunr versions, this had to be done - in order to try to try to pick the best way of doing this based - on the Lunr version - */ - var isLunr2 = lunr.version[0] == "2"; - - /* register specific locale function */ - lunr.zh = function() { - this.pipeline.reset(); - this.pipeline.add( - lunr.zh.trimmer, - lunr.zh.stopWordFilter, - lunr.zh.stemmer - ); - - // change the tokenizer for Chinese one - if (isLunr2) { // for lunr version 2.0.0 - this.tokenizer = lunr.zh.tokenizer; - } else { - if (lunr.tokenizer) { // for lunr version 0.6.0 - lunr.tokenizer = lunr.zh.tokenizer; - } - if (this.tokenizerFn) { // for lunr version 0.7.0 -> 1.0.0 - this.tokenizerFn = lunr.zh.tokenizer; - } - } - }; - - lunr.zh.tokenizer = function(obj) { - if (!arguments.length || obj == null || obj == undefined) return [] - if (Array.isArray(obj)) return obj.map(function(t) { - return isLunr2 ? new lunr.Token(t.toLowerCase()) : t.toLowerCase() - }) - - nodejiebaDictJson && nodejieba.load(nodejiebaDictJson) - - var str = obj.toString().trim().toLowerCase(); - var tokens = []; - - nodejieba.cut(str, true).forEach(function(seg) { - tokens = tokens.concat(seg.split(' ')) - }) - - tokens = tokens.filter(function(token) { - return !!token; - }); - - var fromIndex = 0 - - return tokens.map(function(token, index) { - if (isLunr2) { - var start = str.indexOf(token, fromIndex) - - var tokenMetadata = {} - tokenMetadata["position"] = [start, token.length] - tokenMetadata["index"] = index - - fromIndex = start - - return new lunr.Token(token, tokenMetadata); - } else { - return token - } - }); - } - - /* lunr trimmer function */ - lunr.zh.wordCharacters = "\\w\u4e00-\u9fa5"; - lunr.zh.trimmer = lunr.trimmerSupport.generateTrimmer(lunr.zh.wordCharacters); - lunr.Pipeline.registerFunction(lunr.zh.trimmer, 'trimmer-zh'); - - /* lunr stemmer function */ - lunr.zh.stemmer = (function() { - - /* TODO Chinese stemmer */ - 
return function(word) { - return word; - } - })(); - lunr.Pipeline.registerFunction(lunr.zh.stemmer, 'stemmer-zh'); - - /* lunr stop word filter. see https://www.ranks.nl/stopwords/chinese-stopwords */ - lunr.zh.stopWordFilter = lunr.generateStopWordFilter( - '的 一 不 在 人 有 是 为 為 以 于 於 上 他 而 后 後 之 来 來 及 了 因 下 可 到 由 这 這 与 與 也 此 但 并 並 个 個 其 已 无 無 小 我 们 們 起 最 再 今 去 好 只 又 或 很 亦 某 把 那 你 乃 它 吧 被 比 别 趁 当 當 从 從 得 打 凡 儿 兒 尔 爾 该 該 各 给 給 跟 和 何 还 還 即 几 幾 既 看 据 據 距 靠 啦 另 么 麽 每 嘛 拿 哪 您 凭 憑 且 却 卻 让 讓 仍 啥 如 若 使 谁 誰 虽 雖 随 隨 同 所 她 哇 嗡 往 些 向 沿 哟 喲 用 咱 则 則 怎 曾 至 致 着 著 诸 諸 自'.split(' ')); - lunr.Pipeline.registerFunction(lunr.zh.stopWordFilter, 'stopWordFilter-zh'); - }; -})) \ No newline at end of file diff --git a/src/theme/searcher/mod.rs b/src/theme/searcher/mod.rs index 62b8214f5a..860528b59d 100644 --- a/src/theme/searcher/mod.rs +++ b/src/theme/searcher/mod.rs @@ -2,6 +2,7 @@ //! the "search" cargo feature is disabled. pub static JS: &[u8] = include_bytes!("searcher.js"); +pub static FALLBACK_JS: &[u8] = include_bytes!("searcher.fallback.js"); pub static MARK_JS: &[u8] = include_bytes!("mark.min.js"); pub static ELASTICLUNR_JS: &[u8] = include_bytes!("elasticlunr.min.js"); @@ -9,7 +10,6 @@ pub static ELASTICLUNR_JS: &[u8] = include_bytes!("elasticlunr.min.js"); pub mod lang { pub static STEMMER_SUPPORT_JS: &[u8] = include_bytes!("lunr.stemmer.support.js"); pub static ARABIC_JS: &[u8] = include_bytes!("languages/lunr.ar.js"); - pub static CHINESE_JS: &[u8] = include_bytes!("languages/lunr.zh.js"); pub static DANISH_JS: &[u8] = include_bytes!("languages/lunr.da.js"); pub static DUTCH_JS: &[u8] = include_bytes!("languages/lunr.nl.js"); pub static FINNISH_JS: &[u8] = include_bytes!("languages/lunr.fi.js"); diff --git a/src/theme/searcher/searcher.fallback.js b/src/theme/searcher/searcher.fallback.js new file mode 100644 index 0000000000..716a07b769 --- /dev/null +++ b/src/theme/searcher/searcher.fallback.js @@ -0,0 +1,934 @@ +"use strict"; +window.search = window.search || {}; 
+(function search(search) { + // Search functionality + // + // You can use !hasFocus() to prevent keyhandling in your key + // event handlers while the user is typing their search. + + if (!Mark || !elasticlunr) { + return; + } + + //IE 11 Compatibility from https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/startsWith + if (!String.prototype.startsWith) { + String.prototype.startsWith = function(search, pos) { + return this.substr(!pos || pos < 0 ? 0 : +pos, search.length) === search; + }; + } + + var search_wrap = document.getElementById('search-wrapper'), + searchbar = document.getElementById('searchbar'), + searchbar_outer = document.getElementById('searchbar-outer'), + searchresults = document.getElementById('searchresults'), + searchresults_outer = document.getElementById('searchresults-outer'), + searchresults_header = document.getElementById('searchresults-header'), + searchicon = document.getElementById('search-toggle'), + content = document.getElementById('content'), + + searchindex = null, + doc_urls = [], + results_options = { + teaser_word_count: 30, + limit_results: 30, + }, + search_options = { + bool: "AND", + expand: true, + fields: { + title: {boost: 1}, + body: {boost: 1}, + breadcrumbs: {boost: 0} + } + }, + mark_exclude = [], + marker = new Mark(content), + current_searchterm = "", + URL_SEARCH_PARAM = 'search', + URL_MARK_PARAM = 'highlight', + teaser_count = 0, + + SEARCH_HOTKEY_KEYCODE = 83, + ESCAPE_KEYCODE = 27, + DOWN_KEYCODE = 40, + UP_KEYCODE = 38, + SELECT_KEYCODE = 13; + + const REGEX_WHITE_SPACE = /\p{White_Space}+/gu, + REGEX_SEARCH_SPLITTER = /(?:([\p{Unified_Ideograph}\uAC00-\uD7AF]|[^\p{White_Space}\p{P}\p{Sm}\p{Sc}\p{So}\p{Unified_Ideograph}\uAC00-\uD7AF\p{Z}\p{C}]+|\p{So}\p{Sk}?(?:\u200D\p{So}\p{Sk}?)*)|([\p{P}\p{Sm}\p{Sc}\p{Z}\p{C}]+))\p{White_Space}*/gu, + REGEX_STEM = /([a-zA-Z0-9]+)|[^a-zA-Z0-9]+/gu, + REGEX_ESCAPE = /[.*+?^${}()|[\]\\]/gu, + REGEX_DEFAULT_BEGIN = 
/^[^\p{White_Space}\p{P}\p{Sm}\p{Sc}\p{So}\p{Unified_Ideograph}\uAC00-\uD7AF\p{Z}\p{C}]/u, + REGEX_DEFAULT_END = /[^\p{White_Space}\p{P}\p{Sm}\p{Sc}\p{So}\p{Unified_Ideograph}\uAC00-\uD7AF\p{Z}\p{C}]$/u, + REGEX_SENTENCE = /.+?(?:[。?!.](?:(?![\r\n])[\p{White_Space}\p{Po}])*[\r\n]*|(?:[.?!](?:(?![\r\n])[\p{White_Space}\p{Po}])*?(?:(?![\r\n])\p{White_Space})+)+(?=[^\p{L}]*(?!\p{Ll})\p{L})|[\r\n]+)|.+?$/gu, + REGEX_CLAUSE = /.*?(?:(?:[,;]|……)[\p{White_Space}\p{Po}]*|[,;](?:\p{Po}*?\p{White_Space}+)+)|.+?$/gus, + REGEX_SEGMENT = /([\p{Unified_Ideograph}\uAC00-\uD7AF]+)|([^\p{White_Space}\p{P}\p{Sm}\p{Sc}\p{So}\p{Unified_Ideograph}\uAC00-\uD7AF\p{Z}\p{C}]+)|(\p{So}\p{Sk}?(?:\u200D\p{So}\p{Sk}?)*)|([\p{White_Space}\p{P}\p{Sm}\p{Sc}\p{Z}\p{C}]+)/gu; + + function hasFocus() { + return searchbar === document.activeElement; + } + + function removeChildren(elem) { + while (elem.firstChild) { + elem.removeChild(elem.firstChild); + } + } + + // Helper to parse a url into its building blocks. + function parseURL(url) { + var a = document.createElement('a'); + a.href = url; + return { + source: url, + protocol: a.protocol.replace(':',''), + host: a.hostname, + port: a.port, + params: (function(){ + var ret = {}; + var seg = a.search.replace(/^\?/,'').split('&'); + var len = seg.length, i = 0, s; + for (;i': '>', + '"': '"', + "'": ''' + }; + var repl = function (c, inMap) { return inMap ? MAP[c] : "
"; }; + return function (s) { + return s.replace(/([&<>'"])|[\r\n]+/g, repl); + }; + })(); + + function formatSearchMetric(count, searchterm) { + if (count == 1) { + return count + " search result for '" + searchterm + "':"; + } else if (count == 0) { + return "No search results for '" + searchterm + "'."; + } else { + return count + " search results for '" + searchterm + "':"; + } + } + + function formatSearchResult(result, searchTerms) { + var teaser = makeTeaser(result.doc, searchTerms); + if (!teaser) return; + + teaser_count++; + + // The ?URL_MARK_PARAM= parameter belongs inbetween the page and the #heading-anchor + var url = doc_urls[result.ref].split("#"); + if (url.length == 1) { // no anchor found + url.push(""); + } + + return '' + teaser.breadcrumbs + '' + + '' + + teaser.body + ''; + } + + // `targets` is an array of {begin: number, end: number} that has been sorted by begin + // in ascending order, and shouldn't overlap. + // `range` is {begin: number, end: number} + function highlightAndEscape(text, targets, range) { + const limit = range ? range.end : text.length; + var lastEnd = range ? 
range.begin : 0; + if (!targets.length) return escapeHTML(text.slice(lastEnd, limit)); + + for (var i = 0; targets[i].end <= lastEnd; i++) ; // skip targets before range + const parts = [], begin = targets[i].begin; + if (lastEnd > begin) lastEnd = begin; + + for (; i < targets.length; i++) { + const target = targets[i], begin = target.begin; + if (begin >= limit) break; // omit targets after range + const end = target.end; + parts.push(escapeHTML(text.slice(lastEnd, begin)), '', escapeHTML(text.slice(begin, end)), ''); + lastEnd = end; + } + parts.push(escapeHTML(text.slice(lastEnd, limit).trimEnd())); + + return "".concat(...parts); + } + + // Merge overlapping or contiguous ranges + function mergeRanges(ranges) { + if (!ranges.length) return []; + + var last = {begin: ranges[0].begin, end: ranges[0].end}; + const result = [last]; + for (const range of ranges.slice(1)) { + if (last.end < range.begin) { + last = {begin: range.begin, end: range.end}; + result.push(last); + } else if (last.end < range.end) { + last.end = range.end; + } + } + return result; + } + + class StructuredText { + constructor(text) { + this.original = text; + this.segments = new Map(); + this.pos = 0; + this.stemmedPos = 0; // `this` is passed to the constructors, and the `pos` fields will be updated there. + this.sentences = text.match(REGEX_SENTENCE).map(match => new Sentence(match, this)); + this.stemmed = "".concat(...this.segments.values().map(segment => segment.stemmed.text)); + delete this.pos; + delete this.stemmedPos; + } + + originalPos(stemmedPos) { + if (stemmedPos <= 0) return stemmedPos; + const offset = stemmedPos - this.stemmed.length; + if (offset >= 0) return this.original.length + offset; + const segment = this.segments.get(stemmedPos); + if (segment) return segment.lower.begin; + for (var pos = stemmedPos - 1; ; pos--) { + const segment = this.segments.get(pos); + if (segment) { + return segment.lower.begin + (segment instanceof DefaultSegment ? 
segment.lower.text.length : stemmedPos - pos); + } + } + } + + segmentAtStemmed(stemmedPos) { + if (stemmedPos < 0) return; + if (stemmedPos >= this.stemmed.length) return; + const segment = this.segments.get(stemmedPos); + if (segment) return segment; + for (var pos = stemmedPos - 1; ; pos--) { + const segment = this.segments.get(pos); + if (segment) return segment; + } + } + + // `begin` and `end`are indexed on stemmed text. + wordCount(begin, end) { + if (begin >= end) return 0; + const segment = this.segmentAtStemmed(begin), segmentEnd = segment.stemmed.end; + if (segment instanceof IdeographSegment) { + return [...this.stemmed.slice(begin, Math.min(end, segmentEnd))].length / 2 + this.wordCount(segmentEnd, end); + } else { + return segment.wordCount + this.wordCount(segmentEnd, end); + } + } + + // `targetsInStemmed` is an array of {begin: number, end: number} that has been sorted by begin in ascending order. + // `ranges` is an array of {begin: number, end: number} + highlightAndEscapeByStemmed(targetsInStemmed, ranges) { + targetsInStemmed = mergeRanges(targetsInStemmed); + if (!Array.isArray(ranges)) return this.highlightAndEscapeByStemmedInRange(targetsInStemmed, ranges); + ranges = mergeRanges(ranges); + if (!ranges.length) return ""; + const parts = ranges.map(range => this.highlightAndEscapeByStemmedInRange(targetsInStemmed, range)); + if (ranges[0].begin > 0) parts.unshift(""); + if (ranges[ranges.length - 1].end < this.stemmed.length) parts.push(""); + return parts.join("……"); + } + + highlightAndEscapeByStemmedInRange(targetsInStemmed, range) { + return highlightAndEscape(this.original, targetsInStemmed.map(target => { + return {begin: this.originalPos(target.begin), end: this.originalPos(target.end)}; + }), range ? {begin: this.originalPos(range.begin), end: this.originalPos(range.end)} : undefined); + } + + // Expands `range`'s end by `wordCount` words. + // `range` is like {begin: number, end: number} and is indexed on stemmed text. 
+ // The range is modified in-place. + // `limit` is where the range would stop expanding even the required `wordCount` isn't satisfied. + // In this case the remaining `wordCount` to be expanded is returned (otherwise undefined is returned) + // If `limit` is undefined, expanding would stop at the end of the text. + expandEnd(range, wordCount, limit) { + if (typeof limit !== "number" || limit > this.stemmed.length) limit = this.stemmed.length; + if (range.end < range.begin) range.end = range.begin; + if (range.end >= limit) { + if (wordCount < 1) return; + return wordCount; + } + const pos = range.end, segment = this.segmentAtStemmed(pos); + if (segment instanceof IdeographSegment) { + if (wordCount * 2 < 1) return; + const end = Math.min(segment.stemmed.end, limit); + const slice = [...this.stemmed.slice(pos, end)]; + const remainingWordCount = wordCount - slice.length / 2; + if (remainingWordCount < 0) { + range.end += "".concat(...slice.slice(0, wordCount * 2)).length; + return; + } + range.end = end; + return this.expandEnd(range, remainingWordCount, limit); + } else { + wordCount -= segment.wordCount; + if (wordCount < 0) return; + range.end = Math.min(segment.stemmed.end, limit); + return this.expandEnd(range, wordCount, limit); + } + } + + // Counterpart to expandEnd + expandBegin(range, wordCount, limit) { + if (wordCount < 1) return; + if (typeof limit !== "number" || limit < 0) limit = 0; + if (range.begin > range.end) range.begin = range.end; + if (range.begin <= limit) { + if (wordCount < 1) return; + return wordCount; + } + const pos = range.begin, segment = this.segmentAtStemmed(pos - 1); + if (segment instanceof IdeographSegment) { + if (wordCount * 2 < 1) return; + const begin = Math.max(segment.stemmed.begin, limit); + const slice = [...this.stemmed.slice(begin, pos)]; + const remainingWordCount = wordCount - slice.length / 2; + if (remainingWordCount < 0) { + range.begin -= "".concat(...slice.slice(-wordCount * 2)).length; + return; + } + 
range.begin = begin; + return this.expandBegin(range, remainingWordCount, limit); + } else { + wordCount -= segment.wordCount; + range.begin = Math.max(segment.stemmed.begin, limit); + return this.expandBegin(range, wordCount, limit); + } + } + + // Expands `range`'s end to `type`'s boundary. + // `range` is like {begin: number, end: number} and is indexed on stemmed text. + // The range is modified in-place. + // `limit` is where the range would stop expanding even the required `type`'s boundary isn't reached. + // In this case true is returned (otherwise false is returned) + // If `limit` is undefined, expanding would stop at the end of the text. + expandEndToBoundary(range, type, limit) { + if (typeof limit !== "number") limit = this.stemmed.length; + if (range.end < range.begin) range.end = range.begin; + if (range.end >= limit) return true; + var part = this.segmentAtStemmed(range.end); + while (!(part instanceof type)) part = part.parent; + const partEnd = part.stemmed.end; + if (partEnd <= limit) { + range.end = partEnd; + return false; + } + range.end = limit; + return true; + } + + // Counterpart to expandEndToBoundary + expandBeginToBoundary(range, type, limit) { + if (typeof limit !== "number") limit = 0; + if (range.begin > range.end) range.begin = range.end; + if (range.begin <= limit) return true; + var part = this.segmentAtStemmed(range.begin - 1); + while (!(part instanceof type)) part = part.parent; + const partBegin = part.stemmed.begin; + if (partBegin >= limit) { + range.begin = partBegin; + return false; + } + range.begin = limit; + return true; + } + + // Counterpart to expandEndToBoundary + shrinkEndToBoundary(range, type, limit) { + if (typeof limit !== "number") limit = range.begin; + if (range.end > this.stemmed.length) range.end = this.stemmed.length; + if (range.end <= limit) return true; + var part = this.segmentAtStemmed(range.end - 1); + while (!(part instanceof type)) part = part.parent; + const partBegin = part.stemmed.begin; + if 
(partBegin >= limit) { + range.end = partBegin; + return false; + } + range.end = limit; + return true; + } + + // Counterpart to expandBeginToBoundary + shrinkBeginToBoundary(range, type, limit) { + if (typeof limit !== "number") limit = range.end; + if (range.begin < 0) range.begin = 0; + if (range.begin >= limit) return true; + var part = this.segmentAtStemmed(range.begin); + while (!(part instanceof type)) part = part.parent; + const partEnd = part.stemmed.end; + if (partEnd <= limit) { + range.begin = partEnd; + return false; + } + range.begin = limit; + return true; + } + } + + class Sentence { + constructor(original, base) { + this.original = {text: original, begin: base.pos} + const begin = base.stemmedPos; + this.clauses = original.toLowerCase().match(REGEX_CLAUSE).map(match => new Clause(match, this, base)); + this.stemmed = {begin, end: base.stemmedPos} + } + } + + class Clause { + constructor(lower, parent, base) { + this.lower = {text: lower, begin: base.pos} + const begin = base.stemmedPos, segments = []; + for (const match of lower.matchAll(REGEX_SEGMENT)) { + if (match[1]) { + segments.push(new IdeographSegment(match[0], this, base)); + } else if (match[2]) { + segments.push(new DefaultSegment(match[0], this, base)); + } else if (match[3]) { + segments.push(new EmojiSegment(match[0], this, base)); + } else if (match[4]) { + segments.push(new NonWordSegment(match[0], this, base)); + } + } + this.segments = segments; + this.stemmed = {begin, end: base.stemmedPos}; + this.parent = parent; + } + } + + class Segment { + constructor(lower, stemmed, parent, base) { + this.lower = {text: lower, begin: base.pos} + const begin = base.stemmedPos; + base.pos += lower.length; + base.stemmedPos += stemmed.length; + base.segments.set(begin, this); + this.stemmed = {text: stemmed, begin, end: base.stemmedPos} + this.parent = parent; + } + } + + class IdeographSegment extends Segment { + constructor(lower, parent, base) { + super(lower, lower, parent, base); + 
this.wordCount = [...lower].length / 2; // 2 characters count as 1 word + } + } + + class EmojiSegment extends Segment { + constructor(lower, parent, base) { + super(lower, lower, parent, base); + } + + get wordCount() { + return 1; + } + } + + class NonWordSegment extends Segment { + constructor(lower, parent, base) { + super(lower, lower, parent, base); + } + + get wordCount() { + return 0; + } + } + + class DefaultSegment extends Segment { + constructor(lower, parent, base) { + super(lower, elasticlunr.stemmer(lower), parent, base); + } + + get wordCount() { + return 1; + } + } + + function makeTeaser(doc, searchTerms) { + const body = new StructuredText(doc.body), breadcrumbs = new StructuredText(doc.breadcrumbs), + requireMatchAll = search_options.bool === 'AND', matchesInBody = [], matchesInBreadcrumbs = []; + var termCountInBody = 0; + for (const [index, regex] of searchTerms.regex.entries()) { + const currentTermInBody = []; + for (const match of body.stemmed.matchAll(regex)) { + currentTermInBody.push({ + begin: match.index, end: match.index + match[0].length, index + }); + } + const currentTermInBreadcrumbs = []; + for (const match of breadcrumbs.stemmed.matchAll(regex)) { + currentTermInBreadcrumbs.push({ + begin: match.index, end: match.index + match[0].length + }); + } + if (currentTermInBody.length) { + termCountInBody++; + } else if (requireMatchAll && !currentTermInBreadcrumbs.length) { + return; + } + matchesInBody.push(...currentTermInBody); + matchesInBreadcrumbs.push(...currentTermInBreadcrumbs); + } + if (!termCountInBody && !matchesInBreadcrumbs.length) return; + matchesInBreadcrumbs.sort((a, b) => a.begin - b.begin); + + if (!matchesInBody.length) { + const range = {begin: 0, end: 0}; + body.expandEnd(range, results_options.teaser_word_count); + var highlightedBody = body.highlightAndEscapeByStemmed(matchesInBody, [range]); + return { + body: highlightedBody, + breadcrumbs: breadcrumbs.highlightAndEscapeByStemmed(matchesInBreadcrumbs) + }; + 
} + matchesInBody.sort((a, b) => a.begin - b.begin); + + // Find the minimum window that contains at least one occurrence of each search term. + // `matches` is an array of { begin: number, end: number, index: number } where index is the index of search term. + // `termCount` is the number of unique search terms occurred. + function minWindow(matches, termCount) { + var begin = 0, end = 0, termCountInRange = 0, result = {begin: 0, end: body.stemmed.length}; + const termCountTableInRange = []; + + // Contract window's begin until it no longer contains all keywords + function contractWindow() { + while (true) { + const index = matches[begin].index; + begin++; + termCountTableInRange[index]--; + if (!termCountTableInRange[index]) { + const currentWindow = { + begin: matches[begin - 1].begin, end: matches[end - 1].end + }; + if (currentWindow.end - currentWindow.begin < result.end - result.begin) result = currentWindow; + break; + } + } + } + + // Expand window's end until it contains all keywords + while (end < matches.length) { + const index = matches[end].index; + end++; + if (termCountTableInRange[index]) { + termCountTableInRange[index]++; + } else { + termCountTableInRange[index] = 1; + termCountInRange++; + if (termCountInRange >= termCount) { + contractWindow(); + break; + } + } + } + + // Expand window's end until it contains all keywords again + while (end < matches.length) { + const index = matches[end].index; + end++; + termCountTableInRange[index]++; + if (termCountTableInRange[index] === 1) contractWindow(); + } + return result; + } + + const range = minWindow(matchesInBody, termCountInBody); + const rawBegin = range.begin, rawEnd = range.end; + body.expandBeginToBoundary(range, Sentence); + body.expandEndToBoundary(range, Sentence); + + const wordCountLimit = results_options.teaser_word_count; + var ranges = [], wordCount = body.wordCount(range.begin, range.end); + if (wordCount < wordCountLimit) { + var oldBegin, oldWordCount; + do { + oldBegin = 
range.begin; + oldWordCount = wordCount; + const reachedLimit = body.expandBeginToBoundary(range, Sentence); + wordCount = body.wordCount(range.begin, range.end); + if (reachedLimit) break; + } while (wordCount < wordCountLimit); + if (wordCount > wordCountLimit) { + range.begin = oldBegin; + wordCount = oldWordCount; + } + if (wordCount < wordCountLimit) { + const remainingWordCount = body.expandEnd(range, wordCountLimit - wordCount); + if (remainingWordCount) body.expandBegin(range, remainingWordCount); + } + ranges.push(range); + } else if (wordCount === wordCountLimit) { + ranges.push(range); + } else { + // When `range` can't be shrunk to `wordCountLimit`, the actual wordCount is returned. + function tryShrink(range, wordCount, wordCountLimit, rawBegin, rawEnd) { + var oldEnd; + do { + oldEnd = range.end; + if (body.shrinkEndToBoundary(range, Clause, rawEnd)) { + range.end = oldEnd; + break; + } + wordCount = body.wordCount(range.begin, range.end); + } while (wordCount > wordCountLimit); + if (wordCount <= wordCountLimit) { + if (wordCount < wordCountLimit) body.expandEnd(range, wordCountLimit - wordCount); + ranges.push(range); + } else { + var oldBegin; + do { + oldBegin = range.begin; + if (body.shrinkBeginToBoundary(range, Clause, rawBegin)) { + range.begin = oldBegin; + break; + } + wordCount = body.wordCount(range.begin, range.end); + } while (wordCount > wordCountLimit); + if (wordCount > wordCountLimit) return wordCount; + if (wordCount < wordCountLimit) body.expandBegin(range, wordCountLimit - wordCount); + } + } + wordCount = tryShrink(range, wordCount, wordCountLimit, rawBegin, rawEnd); + if (!wordCount) { + ranges.push(range); + } else { + // split the result into pieces and shrink them individually, then join them with …… + var freshMatchesInBody = matchesInBody.filter(match => match.begin >= range.begin && match.end <= range.end); + for (const sentence of body.sentences) { + const sentenceEnd = sentence.stemmed.end; + if (sentenceEnd > 
freshMatchesInBody[0].begin) { + ranges.push({begin: sentence.stemmed.begin, end: sentenceEnd}); + const currentIndex = freshMatchesInBody[0].index; + freshMatchesInBody = freshMatchesInBody.filter(match => match.index !== currentIndex); + while (freshMatchesInBody.length && sentenceEnd > freshMatchesInBody[0].begin) { + const currentIndex = freshMatchesInBody[0].index; + freshMatchesInBody = freshMatchesInBody.filter(match => match.index !== currentIndex); + } + if (!freshMatchesInBody.length) break; + } + } + const wordCountList = ranges.map(range => body.wordCount(range.begin, range.end)); + wordCount = wordCountList.reduce((sum, wordCount) => sum + wordCount); + var exceedingWordCount = wordCount - wordCountLimit; + if (exceedingWordCount < 0) { + var remainingWordCount = wordCountLimit - wordCount; + for (var i = 0; i < ranges.length - 1; i++) { + remainingWordCount = body.expandEnd(ranges[i], remainingWordCount, ranges[i + 1].begin); + if (!remainingWordCount) break; + } + if (remainingWordCount) body.expandEnd(ranges[i], remainingWordCount); + } else if (exceedingWordCount > 0) { + const reversedMatchesInBody = [...matchesInBody]; + reversedMatchesInBody.sort((a, b) => b.end - a.end); + for (i = ranges.length - 1; i >= 0; i--) { + const range = ranges[i]; + const actualWordCount = tryShrink(range, wordCountList[i], wordCountList[i] - exceedingWordCount, + matchesInBody.find(match => match.begin >= range.begin).begin, + reversedMatchesInBody.find(match => match.end <= range.end).end); + if (!actualWordCount) break; + exceedingWordCount -= wordCountList[i] - actualWordCount; + } + } + } + } + return { + body: body.highlightAndEscapeByStemmed(matchesInBody, ranges), + breadcrumbs: breadcrumbs.highlightAndEscapeByStemmed(matchesInBreadcrumbs) + }; + } + + function init(config) { + results_options = config.results_options; + search_options = config.search_options; + searchbar_outer = config.searchbar_outer; + doc_urls = config.doc_urls; + searchindex = 
elasticlunr.Index.load(config.index); + + // Set up events + searchicon.addEventListener('click', function(e) { searchIconClickHandler(); }, false); + searchbar.addEventListener('keyup', function(e) { searchbarKeyUpHandler(); }, false); + document.addEventListener('keydown', function(e) { globalKeyHandler(e); }, false); + // If the user uses the browser buttons, do the same as if a reload happened + window.onpopstate = function(e) { doSearchOrMarkFromUrl(); }; + // Suppress "submit" events so the page doesn't reload when the user presses Enter + document.addEventListener('submit', function(e) { e.preventDefault(); }, false); + + // If reloaded, do the search or mark again, depending on the current url parameters + doSearchOrMarkFromUrl(); + } + + function unfocusSearchbar() { + // hacky, but just focusing a div only works once + var tmp = document.createElement('input'); + tmp.setAttribute('style', 'position: absolute; opacity: 0;'); + searchicon.appendChild(tmp); + tmp.focus(); + tmp.remove(); + } + + // On reload or browser history backwards/forwards events, parse the url and do search or mark + function doSearchOrMarkFromUrl() { + // Check current URL for search request + var url = parseURL(window.location.href); + if (url.params.hasOwnProperty(URL_SEARCH_PARAM) + && url.params[URL_SEARCH_PARAM] != "") { + showSearch(true); + searchbar.value = decodeURIComponent( + (url.params[URL_SEARCH_PARAM] + '').replace(/\+/g, '%20')); + searchbarKeyUpHandler(); // -> doSearch() + } else { + showSearch(false); + } + + if (url.params.hasOwnProperty(URL_MARK_PARAM)) { + var words = decodeURIComponent(url.params[URL_MARK_PARAM]).split(' '); + marker.mark(words, { + exclude: mark_exclude + }); + + var markers = document.querySelectorAll("mark"); + function hide() { + for (var i = 0; i < markers.length; i++) { + markers[i].classList.add("fade-out"); + window.setTimeout(function(e) { marker.unmark(); }, 300); + } + } + for (var i = 0; i < markers.length; i++) { + 
markers[i].addEventListener('click', hide); + } + } + } + + // Eventhandler for keyevents on `document` + function globalKeyHandler(e) { + if (e.altKey || e.ctrlKey || e.metaKey || e.shiftKey || e.target.type === 'textarea' || e.target.type === 'text' || !hasFocus() && /^(?:input|select|textarea)$/i.test(e.target.nodeName)) { return; } + + if (e.keyCode === ESCAPE_KEYCODE) { + e.preventDefault(); + searchbar.classList.remove("active"); + setSearchUrlParameters("", + (searchbar.value.trim() !== "") ? "push" : "replace"); + if (hasFocus()) { + unfocusSearchbar(); + } + showSearch(false); + marker.unmark(); + } else if (!hasFocus() && e.keyCode === SEARCH_HOTKEY_KEYCODE) { + e.preventDefault(); + showSearch(true); + window.scrollTo(0, 0); + searchbar.select(); + } else if (hasFocus() && e.keyCode === DOWN_KEYCODE) { + e.preventDefault(); + unfocusSearchbar(); + searchresults.firstElementChild.classList.add("focus"); + } else if (!hasFocus() && (e.keyCode === DOWN_KEYCODE + || e.keyCode === UP_KEYCODE + || e.keyCode === SELECT_KEYCODE)) { + // not `:focus` because browser does annoying scrolling + var focused = searchresults.querySelector("li.focus"); + if (!focused) return; + e.preventDefault(); + if (e.keyCode === DOWN_KEYCODE) { + var next = focused.nextElementSibling; + if (next) { + focused.classList.remove("focus"); + next.classList.add("focus"); + } + } else if (e.keyCode === UP_KEYCODE) { + focused.classList.remove("focus"); + var prev = focused.previousElementSibling; + if (prev) { + prev.classList.add("focus"); + } else { + searchbar.select(); + } + } else { // SELECT_KEYCODE + window.location.assign(focused.querySelector('a')); + } + } + } + + function showSearch(yes) { + if (yes) { + search_wrap.classList.remove('hidden'); + searchicon.setAttribute('aria-expanded', 'true'); + } else { + search_wrap.classList.add('hidden'); + searchicon.setAttribute('aria-expanded', 'false'); + var results = searchresults.children; + for (var i = 0; i < results.length; i++) 
{ + results[i].classList.remove("focus"); + } + } + } + + function showResults(yes) { + if (yes) { + searchresults_outer.classList.remove('hidden'); + } else { + searchresults_outer.classList.add('hidden'); + } + } + + // Eventhandler for search icon + function searchIconClickHandler() { + if (search_wrap.classList.contains('hidden')) { + showSearch(true); + window.scrollTo(0, 0); + searchbar.select(); + } else { + showSearch(false); + } + } + + // Eventhandler for keyevents while the searchbar is focused + function searchbarKeyUpHandler() { + var searchterm = searchbar.value.trim(); + if (searchterm != "") { + searchbar.classList.add("active"); + doSearch(searchterm); + } else { + searchbar.classList.remove("active"); + showResults(false); + removeChildren(searchresults); + } + + setSearchUrlParameters(searchterm, "push_if_new_search_else_replace"); + + // Remove marks + marker.unmark(); + } + + // Update current url with ?URL_SEARCH_PARAM= parameter, remove ?URL_MARK_PARAM and #heading-anchor . + // `action` can be one of "push", "replace", "push_if_new_search_else_replace" + // and replaces or pushes a new browser history item. + // "push_if_new_search_else_replace" pushes if there is no `?URL_SEARCH_PARAM=abc` yet. + function setSearchUrlParameters(searchterm, action) { + var url = parseURL(window.location.href); + var first_search = !url.params.hasOwnProperty(URL_SEARCH_PARAM); + if (searchterm != "" || action == "push_if_new_search_else_replace") { + url.params[URL_SEARCH_PARAM] = searchterm; + delete url.params[URL_MARK_PARAM]; + url.hash = ""; + } else { + delete url.params[URL_MARK_PARAM]; + delete url.params[URL_SEARCH_PARAM]; + } + // A new search will also add a new history item, so the user can go back + // to the page prior to searching. A updated search term will only replace + // the url. 
+ if (action == "push" || (action == "push_if_new_search_else_replace" && first_search)) { + history.pushState({}, document.title, renderURL(url)); + } else if (action == "replace" || (action == "push_if_new_search_else_replace" && !first_search)) { + history.replaceState({}, document.title, renderURL(url)); + } + } + + function preprocessSearchTerms(searchTerms) { + const original = searchTerms.split(REGEX_WHITE_SPACE); + const stemmed = original.map(term => term.toLowerCase().replace(REGEX_STEM, (match, english) => english ? elasticlunr.stemmer(match) : match)); + return { + original, + stemmed, + lunr: searchTerms.replace(REGEX_SEARCH_SPLITTER, (_, word) => word ? `${word} ` : ""), + regex: stemmed.map(term => { + var escaped = term.replace(REGEX_ESCAPE, '\\$&'); + if (REGEX_DEFAULT_BEGIN.test(term)) { + escaped = "(?= results_options.limit_results) break; + } + + // Display search metrics + searchresults_header.innerText = formatSearchMetric(resultCount, searchterm); + + // Display results + showResults(true); + } + + fetch(path_to_root + 'searchindex.json') + .then(response => response.json()) + .then(json => init(json)) + .catch(error => { // Try to load searchindex.js if fetch failed + var script = document.createElement('script'); + script.src = path_to_root + 'searchindex.js'; + script.onload = () => init(window.search); + document.head.appendChild(script); + }); + + // Exported functions + search.hasFocus = hasFocus; +})(window.search); diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 2b17cc7d84..f6e324062d 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -7,7 +7,7 @@ use crate::errors::Error; use log::error; use once_cell::sync::Lazy; use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, Options, Parser, Tag, TagEnd}; -use regex::Regex; +use regex::{Captures, Regex}; use std::borrow::Cow; use std::collections::HashMap; @@ -19,10 +19,17 @@ pub use self::string::{ take_rustdoc_include_lines, }; -/// Replaces multiple consecutive whitespace 
characters with a single space character. +/// Replaces multiple consecutive whitespace characters with a single space character +/// if there's no line break, otherwise replaces with a single "\n". pub fn collapse_whitespace(text: &str) -> Cow<'_, str> { static RE: Lazy = Lazy::new(|| Regex::new(r"\s\s+").unwrap()); - RE.replace_all(text, " ") + RE.replace_all(text, |caps: &Captures<'_>| { + if caps[0].contains(['\r', '\n']) { + "\n" + } else { + " " + } + }) } /// Convert the given string to a valid HTML element ID.