From b35a76cd48edd95e620b2b3f25df6ddc92b6fd91 Mon Sep 17 00:00:00 2001
From: Tuomas Hietanen <thorium@iki.fi>
Date: Tue, 11 Nov 2025 10:48:18 +0000
Subject: [PATCH 1/2] markdown to html improvement

---
 .../HtmlFormatting.fs                         | 48 ++++++++++++++++++-
 tests/FSharp.Markdown.Tests/Markdown.fs       | 32 +++++++++++++
 2 files changed, 79 insertions(+), 1 deletion(-)
diff --git a/src/FSharp.Formatting.Markdown/HtmlFormatting.fs b/src/FSharp.Formatting.Markdown/HtmlFormatting.fs
index b492289a5..e435e1340 100644
--- a/src/FSharp.Formatting.Markdown/HtmlFormatting.fs
+++ b/src/FSharp.Formatting.Markdown/HtmlFormatting.fs
@@ -21,6 +21,52 @@ open MarkdownUtils
 let internal htmlEncode (code: string) =
     code.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;")
 
+/// Encode emojis and symbol characters as decimal HTML numeric entities (e.g. "&#127881;").
+/// Encodes U+2000-U+2BFF and every surrogate pair (code points >= U+10000); all other text, such as Cyrillic or CJK in the BMP, is left unchanged.
+let internal encodeHighUnicode (text: string) =
+    if String.IsNullOrEmpty text then
+        text
+    else
+        // Fast path: check if string needs encoding at all
+        let needsEncoding =
+            text
+            |> Seq.exists (fun c ->
+                let codePoint = int c
+                Char.IsSurrogate c || (codePoint >= 0x2000 && codePoint <= 0x2BFF))
+
+        if not needsEncoding then
+            text
+        else
+            // Tail-recursive function with StringBuilder accumulator
+            let rec processChars i (sb: System.Text.StringBuilder) =
+                if i >= text.Length then
+                    sb.ToString()
+                else
+                    let c = text.[i]
+                    // Check for surrogate pairs first (emojis and other characters outside BMP)
+                    if
+                        Char.IsHighSurrogate c
+                        && i + 1 < text.Length
+                        && Char.IsLowSurrogate(text.[i + 1])
+                    then
+                        let fullCodePoint = Char.ConvertToUtf32(c, text.[i + 1])
+                        // Encode all characters outside BMP (>= 0x10000) as they're typically emojis
+                        sb.Append(sprintf "&#%d;" fullCodePoint) |> ignore
+                        processChars (i + 2) sb // Skip both surrogate chars
+                    else
+                        let codePoint = int c
+                        // Encode specific ranges that contain emojis and symbols:
+                        // U+2000-U+2BFF: General Punctuation, Superscripts, Currency, Dingbats, Arrows, Math, Technical, Box Drawing, etc.
+                        // U+1F000-U+1FFFF: Supplementary Multilingual Plane emojis (handled above via surrogates)
+                        if codePoint >= 0x2000 && codePoint <= 0x2BFF then
+                            sb.Append(sprintf "&#%d;" codePoint) |> ignore
+                        else
+                            sb.Append c |> ignore
+
+                        processChars (i + 1) sb
+
+            processChars 0 (System.Text.StringBuilder text.Length)
+
 /// Basic escaping as done by Markdown including quotes
 let internal htmlEncodeQuotes (code: string) =
     (htmlEncode code).Replace("\"", "&quot;")
@@ -78,7 +124,7 @@ let rec internal formatSpan (ctx: FormattingContext) span =
 
     | AnchorLink(id, _) -> ctx.Writer.Write("<a name=\"" + htmlEncodeQuotes id + "\">&#160;</a>")
     | EmbedSpans(cmd, _) -> formatSpans ctx (cmd.Render())
-    | Literal(str, _) -> ctx.Writer.Write(str)
+    | Literal(str, _) -> ctx.Writer.Write(encodeHighUnicode str)
     | HardLineBreak(_) -> ctx.Writer.Write("<br />" + ctx.Newline)
     | IndirectLink(body, _, LookupKey ctx.Links (link, title), _)
     | DirectLink(body, link, title, _) ->
diff --git a/tests/FSharp.Markdown.Tests/Markdown.fs b/tests/FSharp.Markdown.Tests/Markdown.fs
index c38eedf34..ecf82fd9c 100644
--- a/tests/FSharp.Markdown.Tests/Markdown.fs
+++ b/tests/FSharp.Markdown.Tests/Markdown.fs
@@ -30,6 +30,38 @@ let ``Escape HTML entities inside of code`` () =
     |> Markdown.ToHtml
     |> should contain "<p><code>a &amp;gt; &amp; b</code></p>"
 
+[<Test>]
+let ``Emojis are encoded as HTML numeric entities`` () =
+    let html = "Like this 🎉🚧⭐⚠️✅" |> Markdown.ToHtml
+    html |> should contain "&#127881;" // 🎉 party popper
+    html |> should contain "&#128679;" // 🚧 construction
+    html |> should contain "&#11088;" // ⭐ star
+    html |> should contain "&#9888;" // ⚠️ warning
+    html |> should contain "&#9989;" // ✅ check mark
+
+[<Test>]
+let ``Regular text without emojis is not modified`` () =
+    // Fast path optimization: regular text should pass through unchanged
+    let html = "This is regular text with пристаням Cyrillic and 中文 Chinese" |> Markdown.ToHtml
+    html |> should contain "пристаням"
+    html |> should contain "中文"
+    html |> should not' (contain "&#") // No HTML entities for regular international text
+
+[<Test>]
+let ``List without blank line after heading`` () =
+    // Test the issue mentioned in comment: https://github.com/fsprojects/FSharp.Formatting/issues/964#issuecomment-3515381382
+    let markdown =
+        """# This is my title
+- this list
+- should render"""
+
+    let html = Markdown.ToHtml markdown
+    // Check if list is rendered as a separate element, not part of heading
+    html |> should contain "<h1>This is my title</h1>"
+    html |> should contain "<ul>"
+    html |> should contain "<li>this list</li>"
+    html |> should contain "<li>should render</li>"
+
 [<Test>]
 let ``Inline HTML tag containing 'at' is not turned into hyperlink`` () =
     let doc = """<a href="mailto:a@b.c">hi</a>""" |> Markdown.Parse

From 0ba7b7708c2596e12ccc0219bbda102d5281bda8 Mon Sep 17 00:00:00 2001
From: Tuomas Hietanen <thorium@iki.fi>
Date: Sun, 16 Nov 2025 11:17:58 +0000
Subject: [PATCH 2/2] addressed nojaf feedback

---
 .../HtmlFormatting.fs                         | 92 +++++++++++--------
 1 file changed, 54 insertions(+), 38 deletions(-)

diff --git a/src/FSharp.Formatting.Markdown/HtmlFormatting.fs b/src/FSharp.Formatting.Markdown/HtmlFormatting.fs
index e435e1340..6cddeaffc 100644
--- a/src/FSharp.Formatting.Markdown/HtmlFormatting.fs
+++ b/src/FSharp.Formatting.Markdown/HtmlFormatting.fs
@@ -27,45 +27,61 @@ let internal encodeHighUnicode (text: string) =
     if String.IsNullOrEmpty text then
         text
     else
-        // Fast path: check if string needs encoding at all
-        let needsEncoding =
-            text
-            |> Seq.exists (fun c ->
-                let codePoint = int c
-                Char.IsSurrogate c || (codePoint >= 0x2000 && codePoint <= 0x2BFF))
-
-        if not needsEncoding then
-            text
-        else
-            // Tail-recursive function with StringBuilder accumulator
-            let rec processChars i (sb: System.Text.StringBuilder) =
-                if i >= text.Length then
-                    sb.ToString()
+        // Single-pass encoding with lazy StringBuilder allocation
+        let mutable sb: System.Text.StringBuilder voption = ValueNone
+        let mutable i = 0
+
+        while i < text.Length do
+            let c = text.[i]
+
+            let needsEncoding, codePoint, skipNext =
+                // Check for surrogate pairs first (emojis and other characters outside BMP)
+                if
+                    Char.IsHighSurrogate c
+                    && i + 1 < text.Length
+                    && Char.IsLowSurrogate text.[i + 1]
+                then
+                    let fullCodePoint = Char.ConvertToUtf32(c, text.[i + 1])
+                    // Encode all characters outside BMP (>= 0x10000) as they're typically emojis
+                    true, fullCodePoint, true
                 else
-                    let c = text.[i]
-                    // Check for surrogate pairs first (emojis and other characters outside BMP)
-                    if
-                        Char.IsHighSurrogate c
-                        && i + 1 < text.Length
-                        && Char.IsLowSurrogate(text.[i + 1])
-                    then
-                        let fullCodePoint = Char.ConvertToUtf32(c, text.[i + 1])
-                        // Encode all characters outside BMP (>= 0x10000) as they're typically emojis
-                        sb.Append(sprintf "&#%d;" fullCodePoint) |> ignore
-                        processChars (i + 2) sb // Skip both surrogate chars
-                    else
-                        let codePoint = int c
-                        // Encode specific ranges that contain emojis and symbols:
-                        // U+2000-U+2BFF: General Punctuation, Superscripts, Currency, Dingbats, Arrows, Math, Technical, Box Drawing, etc.
-                        // U+1F000-U+1FFFF: Supplementary Multilingual Plane emojis (handled above via surrogates)
-                        if codePoint >= 0x2000 && codePoint <= 0x2BFF then
-                            sb.Append(sprintf "&#%d;" codePoint) |> ignore
-                        else
-                            sb.Append c |> ignore
-
-                        processChars (i + 1) sb
-
-            processChars 0 (System.Text.StringBuilder text.Length)
+                    let codePoint = int c
+                    // Encode specific ranges that contain emojis and symbols:
+                    // U+2000-U+2BFF: General Punctuation, Superscripts, Currency, Dingbats, Arrows, Math, Technical, Box Drawing, etc.
+                    // U+10000 and above: supplementary-plane characters, including most emojis (handled above via surrogate pairs)
+                    (codePoint >= 0x2000 && codePoint <= 0x2BFF), codePoint, false
+
+            if needsEncoding then
+                // Lazy initialization of StringBuilder only when needed
+                match sb with
+                | ValueNone ->
+                    let builder = System.Text.StringBuilder(text.Length + 16)
+
+                    if i > 0 then
+                        builder.Append(text, 0, i) |> ignore
+
+                    sb <- ValueSome builder
+                | ValueSome _ -> ()
+
+                // Append HTML entity without using sprintf (avoid allocation)
+                match sb with
+                | ValueSome builder ->
+                    builder.Append "&#" |> ignore
+                    builder.Append codePoint |> ignore
+                    builder.Append ';' |> ignore
+                | ValueNone -> ()
+            else
+                // Unencoded char: append only once a builder exists (until then it belongs to the prefix that is bulk-copied on first encode)
+                match sb with
+                | ValueSome builder -> builder.Append c |> ignore
+                | ValueNone -> ()
+
+            i <- i + (if skipNext then 2 else 1)
+
+        // Return original string if no encoding was needed
+        match sb with
+        | ValueNone -> text
+        | ValueSome builder -> builder.ToString()
 
 /// Basic escaping as done by Markdown including quotes
 let internal htmlEncodeQuotes (code: string) =