From b35a76cd48edd95e620b2b3f25df6ddc92b6fd91 Mon Sep 17 00:00:00 2001 From: Tuomas Hietanen Date: Tue, 11 Nov 2025 10:48:18 +0000 Subject: [PATCH 1/2] markdown to html improvement --- .../HtmlFormatting.fs | 48 ++++++++++++++++++- tests/FSharp.Markdown.Tests/Markdown.fs | 32 +++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) diff --git a/src/FSharp.Formatting.Markdown/HtmlFormatting.fs b/src/FSharp.Formatting.Markdown/HtmlFormatting.fs index b492289a5..e435e1340 100644 --- a/src/FSharp.Formatting.Markdown/HtmlFormatting.fs +++ b/src/FSharp.Formatting.Markdown/HtmlFormatting.fs @@ -21,6 +21,52 @@ open MarkdownUtils let internal htmlEncode (code: string) = code.Replace("&", "&").Replace("<", "<").Replace(">", ">") +/// Encode emojis and problematic Unicode characters as HTML numeric entities +/// Encodes characters in emoji ranges and symbols, but preserves common international text +let internal encodeHighUnicode (text: string) = + if String.IsNullOrEmpty text then + text + else + // Fast path: check if string needs encoding at all + let needsEncoding = + text + |> Seq.exists (fun c -> + let codePoint = int c + Char.IsSurrogate c || (codePoint >= 0x2000 && codePoint <= 0x2BFF)) + + if not needsEncoding then + text + else + // Tail-recursive function with StringBuilder accumulator + let rec processChars i (sb: System.Text.StringBuilder) = + if i >= text.Length then + sb.ToString() + else + let c = text.[i] + // Check for surrogate pairs first (emojis and other characters outside BMP) + if + Char.IsHighSurrogate c + && i + 1 < text.Length + && Char.IsLowSurrogate(text.[i + 1]) + then + let fullCodePoint = Char.ConvertToUtf32(c, text.[i + 1]) + // Encode all characters outside BMP (>= 0x10000) as they're typically emojis + sb.Append(sprintf "&#%d;" fullCodePoint) |> ignore + processChars (i + 2) sb // Skip both surrogate chars + else + let codePoint = int c + // Encode specific ranges that contain emojis and symbols: + // U+2000-U+2BFF: General Punctuation, Superscripts, Currency, Dingbats, Arrows, Math, Technical, Box Drawing, etc. + // U+1F000-U+1FFFF: Supplementary Multilingual Plane emojis (handled above via surrogates) + if codePoint >= 0x2000 && codePoint <= 0x2BFF then + sb.Append(sprintf "&#%d;" codePoint) |> ignore + else + sb.Append c |> ignore + + processChars (i + 1) sb + + processChars 0 (System.Text.StringBuilder text.Length) + /// Basic escaping as done by Markdown including quotes let internal htmlEncodeQuotes (code: string) = (htmlEncode code).Replace("\"", """) @@ -78,7 +124,7 @@ let rec internal formatSpan (ctx: FormattingContext) span = | AnchorLink(id, _) -> ctx.Writer.Write(" ") | EmbedSpans(cmd, _) -> formatSpans ctx (cmd.Render()) - | Literal(str, _) -> ctx.Writer.Write(str) + | Literal(str, _) -> ctx.Writer.Write(encodeHighUnicode str) | HardLineBreak(_) -> ctx.Writer.Write("
" + ctx.Newline) | IndirectLink(body, _, LookupKey ctx.Links (link, title), _) | DirectLink(body, link, title, _) -> diff --git a/tests/FSharp.Markdown.Tests/Markdown.fs b/tests/FSharp.Markdown.Tests/Markdown.fs index c38eedf34..ecf82fd9c 100644 --- a/tests/FSharp.Markdown.Tests/Markdown.fs +++ b/tests/FSharp.Markdown.Tests/Markdown.fs @@ -30,6 +30,38 @@ let ``Escape HTML entities inside of code`` () = |> Markdown.ToHtml |> should contain "

a &gt; & b

" +[] +let ``Emojis are encoded as HTML numeric entities`` () = + let html = "Like this 🎉🚧⭐⚠️✅" |> Markdown.ToHtml + html |> should contain "🎉" // 🎉 party popper + html |> should contain "🚧" // 🚧 construction + html |> should contain "⭐" // ⭐ star + html |> should contain "⚠" // ⚠️ warning + html |> should contain "✅" // ✅ check mark + +[] +let ``Regular text without emojis is not modified`` () = + // Fast path optimization: regular text should pass through unchanged + let html = "This is regular text with пристаням Cyrillic and 中文 Chinese" |> Markdown.ToHtml + html |> should contain "пристаням" + html |> should contain "中文" + html |> should not' (contain "&#") // No HTML entities for regular international text + +[] +let ``List without blank line after heading`` () = + // Test the issue mentioned in comment: https://github.com/fsprojects/FSharp.Formatting/issues/964#issuecomment-3515381382 + let markdown = + """# This is my title +- this list +- should render""" + + let html = Markdown.ToHtml markdown + // Check if list is rendered as a separate element, not part of heading + html |> should contain "

This is my title

" + html |> should contain "
    " + html |> should contain "
  • this list
  • " + html |> should contain "
  • should render
  • " + [] let ``Inline HTML tag containing 'at' is not turned into hyperlink`` () = let doc = """hi""" |> Markdown.Parse From 0ba7b7708c2596e12ccc0219bbda102d5281bda8 Mon Sep 17 00:00:00 2001 From: Tuomas Hietanen Date: Sun, 16 Nov 2025 11:17:58 +0000 Subject: [PATCH 2/2] addressed nojaf feedback --- .../HtmlFormatting.fs | 92 +++++++++++-------- 1 file changed, 54 insertions(+), 38 deletions(-) diff --git a/src/FSharp.Formatting.Markdown/HtmlFormatting.fs b/src/FSharp.Formatting.Markdown/HtmlFormatting.fs index e435e1340..6cddeaffc 100644 --- a/src/FSharp.Formatting.Markdown/HtmlFormatting.fs +++ b/src/FSharp.Formatting.Markdown/HtmlFormatting.fs @@ -27,45 +27,61 @@ let internal encodeHighUnicode (text: string) = if String.IsNullOrEmpty text then text else - // Fast path: check if string needs encoding at all - let needsEncoding = - text - |> Seq.exists (fun c -> - let codePoint = int c - Char.IsSurrogate c || (codePoint >= 0x2000 && codePoint <= 0x2BFF)) - - if not needsEncoding then - text - else - // Tail-recursive function with StringBuilder accumulator - let rec processChars i (sb: System.Text.StringBuilder) = - if i >= text.Length then - sb.ToString() + // Single-pass encoding with lazy StringBuilder allocation + let mutable sb: System.Text.StringBuilder voption = ValueNone + let mutable i = 0 + + while i < text.Length do + let c = text.[i] + + let needsEncoding, codePoint, skipNext = + // Check for surrogate pairs first (emojis and other characters outside BMP) + if + Char.IsHighSurrogate c + && i + 1 < text.Length + && Char.IsLowSurrogate text.[i + 1] + then + let fullCodePoint = Char.ConvertToUtf32(c, text.[i + 1]) + // Encode all characters outside BMP (>= 0x10000) as they're typically emojis + true, fullCodePoint, true else - let c = text.[i] - // Check for surrogate pairs first (emojis and other characters outside BMP) - if - Char.IsHighSurrogate c - && i + 1 < text.Length - && Char.IsLowSurrogate(text.[i + 1]) - then - let fullCodePoint = Char.ConvertToUtf32(c, text.[i + 1]) - // Encode all characters outside BMP (>= 0x10000) as they're typically emojis - sb.Append(sprintf "&#%d;" fullCodePoint) |> ignore - processChars (i + 2) sb // Skip both surrogate chars - else - let codePoint = int c - // Encode specific ranges that contain emojis and symbols: - // U+2000-U+2BFF: General Punctuation, Superscripts, Currency, Dingbats, Arrows, Math, Technical, Box Drawing, etc. - // U+1F000-U+1FFFF: Supplementary Multilingual Plane emojis (handled above via surrogates) - if codePoint >= 0x2000 && codePoint <= 0x2BFF then - sb.Append(sprintf "&#%d;" codePoint) |> ignore - else - sb.Append c |> ignore - - processChars (i + 1) sb - - processChars 0 (System.Text.StringBuilder text.Length) + let codePoint = int c + // Encode specific ranges that contain emojis and symbols: + // U+2000-U+2BFF: General Punctuation, Superscripts, Currency, Dingbats, Arrows, Math, Technical, Box Drawing, etc. + // U+1F000-U+1FFFF: Supplementary Multilingual Plane emojis (handled above via surrogates) + (codePoint >= 0x2000 && codePoint <= 0x2BFF), codePoint, false + + if needsEncoding then + // Lazy initialization of StringBuilder only when needed + match sb with + | ValueNone -> + let builder = System.Text.StringBuilder(text.Length + 16) + + if i > 0 then + builder.Append(text, 0, i) |> ignore + + sb <- ValueSome builder + | ValueSome _ -> () + + // Append HTML entity without using sprintf (avoid allocation) + match sb with + | ValueSome builder -> + builder.Append "&#" |> ignore + builder.Append codePoint |> ignore + builder.Append ';' |> ignore + | ValueNone -> () + else + // Only append to StringBuilder if it was already initialized + match sb with + | ValueSome builder -> builder.Append c |> ignore + | ValueNone -> () + + i <- i + (if skipNext then 2 else 1) + + // Return original string if no encoding was needed + match sb with + | ValueNone -> text + | ValueSome builder -> builder.ToString() /// Basic escaping as done by Markdown including quotes let internal htmlEncodeQuotes (code: string) =