Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 63 additions & 1 deletion src/FSharp.Formatting.Markdown/HtmlFormatting.fs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,68 @@ open MarkdownUtils
let internal htmlEncode (code: string) =
    // Escapes the three HTML-significant characters `&`, `<` and `>`.
    // The ampersand goes first so that the `&` introduced by the `&lt;` and
    // `&gt;` entities is not escaped a second time. Quotes are left untouched.
    let ampersandsEscaped = code.Replace("&", "&amp;")
    let openersEscaped = ampersandsEscaped.Replace("<", "&lt;")
    openersEscaped.Replace(">", "&gt;")

/// Rewrites emoji and symbol characters in `text` as decimal HTML numeric
/// character references (`&#NNNN;`).
///
/// Two groups of characters are rewritten:
///   * every well-formed surrogate pair, i.e. every code point at or above U+10000;
///   * every single UTF-16 unit in the range U+2000..U+2BFF.
/// All other characters, including unpaired surrogates, are copied through as-is.
/// A null or empty input, and any input that contains nothing to rewrite, is
/// returned as the very same string without a StringBuilder being allocated.
let internal encodeHighUnicode (text: string) =
    if String.IsNullOrEmpty text then
        text
    else
        // Remains ValueNone until the first character that has to be rewritten is met.
        let mutable output: System.Text.StringBuilder voption = ValueNone
        let mutable pos = 0

        while pos < text.Length do
            let current = text.[pos]

            // A high surrogate only counts when a low surrogate directly follows it.
            let isPair =
                Char.IsHighSurrogate current
                && pos + 1 < text.Length
                && Char.IsLowSurrogate text.[pos + 1]

            let scalar =
                if isPair then
                    Char.ConvertToUtf32(current, text.[pos + 1])
                else
                    int current

            if isPair || (scalar >= 0x2000 && scalar <= 0x2BFF) then
                // Fetch the builder, creating it on first use and seeding it with
                // the untouched prefix that was skipped over so far.
                let target =
                    match output with
                    | ValueSome existing -> existing
                    | ValueNone ->
                        let created = System.Text.StringBuilder(text.Length + 16)

                        if pos > 0 then
                            created.Append(text, 0, pos) |> ignore

                        output <- ValueSome created
                        created

                target.Append("&#").Append(scalar).Append(';') |> ignore
            else
                // Plain characters are only copied once a builder exists; before
                // that they are still covered by the original string.
                match output with
                | ValueSome target -> target.Append current |> ignore
                | ValueNone -> ()

            // A surrogate pair occupies two UTF-16 units.
            pos <- pos + (if isPair then 2 else 1)

        match output with
        | ValueSome target -> target.ToString()
        | ValueNone -> text

/// Same escaping as `htmlEncode`, followed by replacing every double quote
/// with `&quot;`.
let internal htmlEncodeQuotes (code: string) =
    let escaped = htmlEncode code
    escaped.Replace("\"", "&quot;")
Expand Down Expand Up @@ -78,7 +140,7 @@ let rec internal formatSpan (ctx: FormattingContext) span =

| AnchorLink(id, _) -> ctx.Writer.Write("<a name=\"" + htmlEncodeQuotes id + "\">&#160;</a>")
| EmbedSpans(cmd, _) -> formatSpans ctx (cmd.Render())
| Literal(str, _) -> ctx.Writer.Write(str)
| Literal(str, _) -> ctx.Writer.Write(encodeHighUnicode str)
| HardLineBreak(_) -> ctx.Writer.Write("<br />" + ctx.Newline)
| IndirectLink(body, _, LookupKey ctx.Links (link, title), _)
| DirectLink(body, link, title, _) ->
Expand Down
32 changes: 32 additions & 0 deletions tests/FSharp.Markdown.Tests/Markdown.fs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,38 @@ let ``Escape HTML entities inside of code`` () =
|> Markdown.ToHtml
|> should contain "<p><code>a &amp;gt; &amp; b</code></p>"

[<Test>]
let ``Emojis are encoded as HTML numeric entities`` () =
    let html = Markdown.ToHtml "Like this 🎉🚧⭐⚠️✅"

    // Expected decimal character references, listed in input order:
    // party popper (U+1F389), construction sign (U+1F6A7), star (U+2B50),
    // warning sign (U+26A0) and check mark (U+2705).
    let expectedEntities =
        [ "&#127881;"; "&#128679;"; "&#11088;"; "&#9888;"; "&#9989;" ]

    for entity in expectedEntities do
        html |> should contain entity

[<Test>]
let ``Regular text without emojis is not modified`` () =
    // Cyrillic and Chinese text must reach the output verbatim.
    let source = "This is regular text with пристаням Cyrillic and 中文 Chinese"
    let html = Markdown.ToHtml source

    for word in [ "пристаням"; "中文" ] do
        html |> should contain word

    // No numeric character reference may appear anywhere in the output.
    html |> should not' (contain "&#")

[<Test>]
let ``List without blank line after heading`` () =
    // Scenario from https://github.com/fsprojects/FSharp.Formatting/issues/964#issuecomment-3515381382:
    // a bullet list that follows a heading with no blank line in between.
    let source =
        """# This is my title
- this list
- should render"""

    let html = Markdown.ToHtml source

    // The heading must stand on its own and the list must be rendered as a
    // separate element with both items.
    let expectedFragments =
        [ "<h1>This is my title</h1>"
          "<ul>"
          "<li>this list</li>"
          "<li>should render</li>" ]

    for fragment in expectedFragments do
        html |> should contain fragment

[<Test>]
let ``Inline HTML tag containing 'at' is not turned into hyperlink`` () =
let doc = """<a href="mailto:[email protected]">hi</a>""" |> Markdown.Parse
Expand Down