From c4e9a938ea53a965819d9ace9f0e65caad571019 Mon Sep 17 00:00:00 2001 From: Hollow Man Date: Fri, 15 Apr 2022 14:43:20 +0800 Subject: [PATCH] Make print page (print.html) links link to anchors on the print page Let all the anchors id on the print page to have a path id prefix to help locate. e.g. bar/foo.md#abc -> #bar-foo-abc Also append a dummy div to the start of the original page to make sure that original page links without an anchor can also be located. Fix to remove all the `./` in the normalized path id so that for "./foo/bar.html#abc" we still get "#foo-bar-abc" Add support for redirect link anchors in print page so that anchors can also be redirected, also handle URL redirect links on print page Handle all the elements id to add a path prefix, also make path id to all be the lower case Fix for print page footnote links by adding the path id prefix Signed-off-by: Hollow Man --- src/renderer/html_handlebars/hbs_renderer.rs | 92 +++++- src/utils/mod.rs | 297 +++++++++++++++---- tests/rendered_output.rs | 8 +- 3 files changed, 335 insertions(+), 62 deletions(-) diff --git a/src/renderer/html_handlebars/hbs_renderer.rs b/src/renderer/html_handlebars/hbs_renderer.rs index b706108e64..4498f97baf 100644 --- a/src/renderer/html_handlebars/hbs_renderer.rs +++ b/src/renderer/html_handlebars/hbs_renderer.rs @@ -56,8 +56,12 @@ impl HtmlHandlebars { let content = utils::render_markdown(&ch.content, ctx.html_config.curly_quotes); - let fixed_content = - utils::render_markdown_with_path(&ch.content, ctx.html_config.curly_quotes, Some(path)); + let printed_item = utils::render_markdown_with_path_and_redirects( + &ch.content, + ctx.html_config.curly_quotes, + Some(path), + &ctx.html_config.redirect, + ); if !ctx.is_index && ctx.html_config.print.page_break { // Add page break between chapters // See https://developer.mozilla.org/en-US/docs/Web/CSS/break-before and https://developer.mozilla.org/en-US/docs/Web/CSS/page-break-before @@ -65,7 +69,25 @@ impl HtmlHandlebars { 
print_content .push_str(r#"<div style="break-before: page; page-break-before: always;"></div>
"#); } - print_content.push_str(&fixed_content); + let print_page_id = { + let mut base = path.display().to_string(); + if base.ends_with(".md") { + base.truncate(base.len() - 3); + } + &base + .replace("/", "-") + .replace("\\", "-") + .to_ascii_lowercase() + }; + + // We have to build header links in advance so that we can know the ranges + // for the headers in one page. + // Insert a dummy div to make sure that we can locate the specific page. + print_content.push_str(&(format!(r#"
"#))); + print_content.push_str(&build_header_links( + &build_print_element_id(&printed_item, &print_page_id), + Some(print_page_id), + )); // Update the context with data for this file let ctx_path = path @@ -210,7 +232,23 @@ impl HtmlHandlebars { code_config: &Code, edition: Option, ) -> String { - let rendered = build_header_links(&rendered); + let rendered = build_header_links(&rendered, None); + let rendered = self.post_process_common(rendered, &playground_config, code_config, edition); + + rendered + } + + /// Applies some post-processing to the HTML to apply some adjustments. + /// + /// This common function is used for both normal chapters (via + /// `post_process`) and the combined print page. + fn post_process_common( + &self, + rendered: String, + playground_config: &Playground, + code_config: &Code, + edition: Option, + ) -> String { let rendered = fix_code_blocks(&rendered); let rendered = add_playground_pre(&rendered, playground_config, edition); let rendered = hide_lines(&rendered, code_config); @@ -568,7 +606,7 @@ impl Renderer for HtmlHandlebars { debug!("Render template"); let rendered = handlebars.render("index", &data)?; - let rendered = self.post_process( + let rendered = self.post_process_common( rendered, &html_config.playground, &html_config.code, @@ -779,9 +817,34 @@ fn make_data( Ok(data) } +/// Go through the rendered print page HTML, +/// add path id prefix to all the elements id as well as footnote links. 
+fn build_print_element_id(html: &str, print_page_id: &str) -> String { + static ALL_ID: Lazy = Lazy::new(|| Regex::new(r#"(<[^>]*?id=")([^"]+?)""#).unwrap()); + static FOOTNOTE_ID: Lazy = Lazy::new(|| { + Regex::new( + r##"(]*?class="footnote-reference"[^>]*?>[^<]*?]*?href="#)([^"]+?)""##, + ) + .unwrap() + }); + + let temp_html = ALL_ID.replace_all(html, |caps: &Captures<'_>| { + format!("{}{}-{}\"", &caps[1], print_page_id, &caps[2]) + }); + + FOOTNOTE_ID + .replace_all(&temp_html, |caps: &Captures<'_>| { + format!("{}{}-{}\"", &caps[1], print_page_id, &caps[2]) + }) + .into_owned() +} + /// Goes through the rendered HTML, making sure all header tags have /// an anchor respectively so people can link to sections directly. -fn build_header_links(html: &str) -> String { +/// +/// `print_page_id` should be set to the print page ID prefix when adjusting the +/// print page. +fn build_header_links(html: &str, print_page_id: Option<&str>) -> String { static BUILD_HEADER_LINKS: Lazy = Lazy::new(|| { Regex::new(r#"(.*?)"#).unwrap() }); @@ -810,6 +873,7 @@ fn build_header_links(html: &str) -> String { caps.get(2).map(|x| x.as_str().to_string()), caps.get(3).map(|x| x.as_str().to_string()), &mut id_counter, + print_page_id, ) }) .into_owned() @@ -817,14 +881,26 @@ fn build_header_links(html: &str) -> String { /// Insert a sinle link into a header, making sure each link gets its own /// unique ID by appending an auto-incremented number (if necessary). +/// +/// For `print.html`, we will add a path id prefix. 
fn insert_link_into_header( level: usize, content: &str, id: Option, classes: Option, id_counter: &mut HashMap, + print_page_id: Option<&str>, ) -> String { - let id = id.unwrap_or_else(|| utils::unique_id_from_content(content, id_counter)); + let id = if let Some(print_page_id) = print_page_id { + let content_id = { + #[allow(deprecated)] + utils::id_from_content(content) + }; + let with_prefix = format!("{} {}", print_page_id, content_id); + id.unwrap_or_else(|| utils::unique_id_from_content(&with_prefix, id_counter)) + } else { + id.unwrap_or_else(|| utils::unique_id_from_content(content, id_counter)) + }; let classes = classes .map(|s| format!(" class=\"{s}\"")) .unwrap_or_default(); @@ -1113,7 +1189,7 @@ mod tests { ]; for (src, should_be) in inputs { - let got = build_header_links(src); + let got = build_header_links(src, None); assert_eq!(got, should_be); } } diff --git a/src/utils/mod.rs b/src/utils/mod.rs index c0be203044..27d8752508 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -6,13 +6,13 @@ pub(crate) mod toml_ext; use crate::errors::Error; use log::error; use once_cell::sync::Lazy; -use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, Options, Parser, Tag, TagEnd}; +use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, LinkType, Options, Parser, Tag, TagEnd}; use regex::Regex; use std::borrow::Cow; use std::collections::HashMap; use std::fmt::Write; -use std::path::Path; +use std::path::{Component, Path, PathBuf}; pub use self::string::{ take_anchored_lines, take_lines, take_rustdoc_include_anchored_lines, @@ -83,63 +83,232 @@ pub fn unique_id_from_content(content: &str, id_counter: &mut HashMap>(path: P) -> String { + let ends_with_slash = path.as_ref().to_str().map_or(false, |s| s.ends_with('/')); + let mut normalized = PathBuf::new(); + for component in path.as_ref().components() { + match &component { + Component::ParentDir => { + if !normalized.pop() { + normalized.push(component); + } + } + Component::CurDir => {} + _ => { + 
normalized.push(component); + } + } + } + if ends_with_slash { + normalized.push(""); + } + normalized.to_str().unwrap().replace("\\", "/").to_string() +} + +/// Converts a relative URL path to a reference ID for the print page. +fn normalize_print_page_id(mut path: String) -> String { + path = path + .replace("/", "-") + .replace(".html#", "-") + .replace("#", "-") + .to_ascii_lowercase(); + if path.ends_with(".html") { + path.truncate(path.len() - 5); + } + path +} + /// Fix links to the correct location. /// /// This adjusts links, such as turning `.md` extensions to `.html`. /// -/// `path` is the path to the page being rendered relative to the root of the -/// book. This is used for the `print.html` page so that links on the print -/// page go to the original location. Normal page rendering sets `path` to -/// None. Ideally, print page links would link to anchors on the print page, -/// but that is very difficult. -fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { +/// See [`render_markdown_with_path_and_redirects`] for a description of +/// `path` and `redirects`. +fn adjust_links<'a>( + event: Event<'a>, + path: Option<&Path>, + redirects: &HashMap, +) -> Event<'a> { static SCHEME_LINK: Lazy = Lazy::new(|| Regex::new(r"^[a-z][a-z0-9+.-]*:").unwrap()); - static MD_LINK: Lazy = - Lazy::new(|| Regex::new(r"(?P.*)\.md(?P#.*)?").unwrap()); - - fn fix<'a>(dest: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> { - if dest.starts_with('#') { - // Fragment-only link. 
- if let Some(path) = path { - let mut base = path.display().to_string(); - if base.ends_with(".md") { - base.replace_range(base.len() - 3.., ".html"); - } - return format!("{}{}", base, dest).into(); - } else { - return dest; + static HTML_MD_LINK: Lazy = + Lazy::new(|| Regex::new(r"(?P.*)\.(html|md)(?P#.*)?").unwrap()); + + fn add_base(path: Option<&Path>) -> String { + let mut fixed_link = String::new(); + if let Some(path) = path { + let base = path + .parent() + .expect("path can't be empty") + .to_str() + .expect("utf-8 paths only"); + if !base.is_empty() { + write!(fixed_link, "{}/", base).unwrap(); } } - // Don't modify links with schemes like `https`. - if !SCHEME_LINK.is_match(&dest) { - // This is a relative link, adjust it as necessary. - let mut fixed_link = String::new(); - if let Some(path) = path { - let base = path + fixed_link.to_string() + } + + fn fix_print_page_link<'a>( + mut normalized_path: String, + redirects: &HashMap, + ) -> CowStr<'a> { + // Fix redirect links + let (path_no_fragment, fragment) = match normalized_path.split_once('#') { + Some((a, b)) => (a, Some(b)), + None => (normalized_path.as_str(), None), + }; + for (original, redirect) in redirects { + if !normalize_path(original.trim_start_matches('/')) + .eq_ignore_ascii_case(&normalized_path) + && !normalize_path(original.trim_start_matches('/')) + .eq_ignore_ascii_case(&path_no_fragment) + { + continue; + } + + let mut unnormalized_path = String::new(); + if SCHEME_LINK.is_match(&redirect) { + unnormalized_path = redirect.to_string(); + } else { + let base = PathBuf::from(path_no_fragment) .parent() .expect("path can't be empty") .to_str() - .expect("utf-8 paths only"); - if !base.is_empty() { - write!(fixed_link, "{}/", base).unwrap(); + .expect("utf-8 paths only") + .to_owned(); + + let normalized_base = normalize_path(base).trim_matches('/').to_owned(); + if !normalized_base.is_empty() { + write!(unnormalized_path, "{}/{}", normalized_base, redirect).unwrap(); + } else { + 
unnormalized_path = redirect.to_string().trim_start_matches('/').to_string(); } } - if let Some(caps) = MD_LINK.captures(&dest) { - fixed_link.push_str(&caps["link"]); - fixed_link.push_str(".html"); - if let Some(anchor) = caps.name("anchor") { - fixed_link.push_str(anchor.as_str()); + // original without anchors, need to append link anchors + if !original.contains("#") { + if let Some(fragment) = fragment { + if !unnormalized_path.contains("#") { + unnormalized_path.push('#'); + } else { + unnormalized_path.push('-'); + } + unnormalized_path.push_str(fragment); } + } + + if SCHEME_LINK.is_match(&redirect) { + return CowStr::from(unnormalized_path); } else { - fixed_link.push_str(&dest); + normalized_path = normalize_path(unnormalized_path); + } + break; + } + + // Check again to make sure anchors are the html links inside the book. + if normalized_path.starts_with("../") || normalized_path.contains("/../") { + return CowStr::from(normalized_path); + } + + let mut fixed_anchor_for_print = String::new(); + fixed_anchor_for_print.push_str("#"); + fixed_anchor_for_print.push_str(&normalize_print_page_id(normalized_path)); + CowStr::from(fixed_anchor_for_print) + } + + /// Fix resource links like img to the correct location. + fn fix_resource_links<'a>(dest: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> { + // Don't modify links with schemes like `https`. + if SCHEME_LINK.is_match(&dest) { + return dest; + } + + // This is a relative link, adjust it as necessary. + let mut fixed_link = add_base(path); + fixed_link.push_str(&dest); + CowStr::from(fixed_link) + } + + fn fix_a_links_with_type<'a>( + dest: CowStr<'a>, + path: Option<&Path>, + redirects: &HashMap, + link_type: LinkType + ) -> CowStr<'a> { + if link_type == LinkType::Email { + return dest; + } + fix_a_links(dest, path, redirects) + } + + /// Adjust markdown file to correct point in the html file. 
+ fn fix_a_links<'a>( + dest: CowStr<'a>, + path: Option<&Path>, + redirects: &HashMap, + ) -> CowStr<'a> { + if dest.starts_with('#') { + // Fragment-only link. + return match path { + Some(path) => { + let mut base = path.display().to_string(); + if base.ends_with(".md") { + base.truncate(base.len() - 3); + } + format!( + "#{}{}", + normalize_print_page_id(normalize_path(base)), + dest.replace("#", "-") + ) + .into() + } + None => dest, }; - return CowStr::from(fixed_link); } - dest + + // Don't modify links with schemes like `https`. + if SCHEME_LINK.is_match(&dest) { + return dest; + } + + // This is a relative link, adjust it as necessary. + let mut fixed_link = add_base(path); + + if let Some(caps) = HTML_MD_LINK.captures(&dest) { + fixed_link.push_str(&caps["link"]); + fixed_link.push_str(".html"); + if let Some(anchor) = caps.name("anchor") { + fixed_link.push_str(anchor.as_str()); + } + } else { + fixed_link.push_str(&dest); + }; + + let normalized_path = normalize_path(&fixed_link); + + // Judge if the html link is inside the book. + if !normalized_path.starts_with("../") && !normalized_path.contains("/../") { + // In `print.html`, print page links would all link to anchors on the print page. + return match path { + Some(_) => fix_print_page_link(normalized_path, redirects), + None => CowStr::from(fixed_link), + }; + } + // In normal page rendering, links to anchors on another page. + CowStr::from(fixed_link) } - fn fix_html<'a>(html: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> { + fn fix_html<'a>( + html: CowStr<'a>, + path: Option<&Path>, + redirects: &HashMap, + ) -> CowStr<'a> { // This is a terrible hack, but should be reasonably reliable. Nobody // should ever parse a tag with a regex. 
However, there isn't anything // in Rust that I know of that is suitable for handling partial html @@ -148,12 +317,19 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { // There are dozens of HTML tags/attributes that contain paths, so // feel free to add more tags if desired; these are the only ones I // care about right now. - static HTML_LINK: Lazy<Regex> = - Lazy::new(|| Regex::new(r#"(<(?:a|img) [^>]*?(?:src|href)=")([^"]+?)""#).unwrap()); + static A_LINK: Lazy<Regex> = + Lazy::new(|| Regex::new(r#"(<a [^>]*?(name|href)=")([^"]+?)""#).unwrap()); + static IMG_LINK: Lazy<Regex> = + Lazy::new(|| Regex::new(r#"(<img [^>]*?src=")([^"]+?)""#).unwrap()); + + let temp_html = IMG_LINK.replace_all(&html, |caps: &regex::Captures<'_>| { + let fixed = fix_resource_links(caps[2].into(), path); + format!("{}{}\"", &caps[1], fixed) + }); - HTML_LINK - .replace_all(&html, |caps: &regex::Captures<'_>| { - let fixed = fix(caps[2].into(), path); + A_LINK + .replace_all(&temp_html, |caps: &regex::Captures<'_>| { + let fixed = fix_a_links(caps[3].into(), path, &redirects); format!("{}{}\"", &caps[1], fixed) }) .into_owned() @@ -168,7 +344,7 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { id, }) => Event::Start(Tag::Link { link_type, - dest_url: fix(dest_url, path), + dest_url: fix_a_links_with_type(dest_url, path, redirects, link_type), title, id, }), @@ -179,12 +355,12 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { id, }) => Event::Start(Tag::Image { link_type, - dest_url: fix(dest_url, path), + dest_url: fix_resource_links(dest_url, path), title, id, }), - Event::Html(html) => Event::Html(fix_html(html, path)), - Event::InlineHtml(html) => Event::InlineHtml(fix_html(html, path)), + Event::Html(html) => Event::Html(fix_html(html, path, redirects)), + Event::InlineHtml(html) => Event::InlineHtml(fix_html(html, path, redirects)), _ => event, } } @@ -194,6 +370,11 @@ pub fn render_markdown(text: &str, curly_quotes: bool) -> String {
render_markdown_with_path(text, curly_quotes, None) } +/// Wrapper around [`render_markdown_with_path_and_redirects`] for API compatibility. +pub fn render_markdown_with_path(text: &str, curly_quotes: bool, path: Option<&Path>) -> String { + render_markdown_with_path_and_redirects(text, curly_quotes, path, &HashMap::new()) +} + pub fn new_cmark_parser(text: &str, curly_quotes: bool) -> Parser<'_> { let mut opts = Options::empty(); opts.insert(Options::ENABLE_TABLES); @@ -207,12 +388,26 @@ pub fn new_cmark_parser(text: &str, curly_quotes: bool) -> Parser<'_> { Parser::new_ext(text, opts) } -pub fn render_markdown_with_path(text: &str, curly_quotes: bool, path: Option<&Path>) -> String { +/// Renders markdown to HTML. +/// +/// `path` is the path to the page being rendered relative to the root of the +/// book. This is used for the `print.html` page so that links on the print +/// page go to anchors that have a path id prefix. Normal page rendering +/// sets `path` to None. +/// +/// `redirects` is also only for the print page. It's for adjusting links to +/// a redirected location to go to the correct spot on the `print.html` page.
+pub(crate) fn render_markdown_with_path_and_redirects( + text: &str, + curly_quotes: bool, + path: Option<&Path>, + redirects: &HashMap, +) -> String { let mut s = String::with_capacity(text.len() * 3 / 2); let p = new_cmark_parser(text, curly_quotes); let events = p .map(clean_codeblock_headers) - .map(|event| adjust_links(event, path)) + .map(|event| adjust_links(event, path, &redirects)) .flat_map(|event| { let (a, b) = wrap_tables(event); a.into_iter().chain(b) diff --git a/tests/rendered_output.rs b/tests/rendered_output.rs index a01ce5f46f..b57d867288 100644 --- a/tests/rendered_output.rs +++ b/tests/rendered_output.rs @@ -126,12 +126,14 @@ fn check_correct_relative_links_in_print_page() { assert_contains_strings( first.join("print.html"), &[ - r##"the first section,"##, + r##"the first section,"##, r##"outside"##, r##"Some image"##, - r##"fragment link"##, - r##"HTML Link"##, + r##"fragment link"##, + r##"HTML Link"##, r##"raw html"##, + r##"1"##, + r##"2"##, ], ); }