diff --git a/pkg/app/passage.go b/pkg/app/passage.go index 7822b84..3820c5a 100644 --- a/pkg/app/passage.go +++ b/pkg/app/passage.go @@ -35,55 +35,105 @@ func GetReference(doc *html.Node) string { return utils.GetTextNode(refNode).Data } -func ParseNodesForPassage(node *html.Node) string { - var text string - var parts []string +// Helper function to escape characters for Telegram MarkdownV2 +func escapeMarkdownV2(s string) string { + // According to Telegram API docs for MarkdownV2, characters to escape are: + // '_', '*', '[', ']', '(', ')', '~', '`', '>', '#', '+', '-', '=', '|', '{', '}', '.', '!' + // Note: '^' is not in this list. Let's assume it doesn't need escaping. + // The logic should be to escape these characters *only* when they are not part of a formatting tag. + // However, since we are processing raw text nodes, any special character should be escaped. + r := strings.NewReplacer( + "_", `\_`, "*", `\*`, "[", `\[`, "]", `\]`, "(", `\(`, ")", `\)`, + "~", `\~`, "`", "\\`", ">", `\>`, "#", `\#`, "+", `\+`, "-", `\-`, + "=", `\=`, "|", `\|`, "{", `\{`, "}", `\}`, ".", `\.`, "!", `\!`, + ) + return r.Replace(s) +} - for child := node.FirstChild; child != nil; child = child.NextSibling { - parts = append(parts, text) +// Helper functions for parsing +func isFormattingTag(tag string) bool { + return tag == "sup" || tag == "i" || tag == "b" +} - switch tag := child.Data; tag { - case "span": - childText := strings.Trim(ParseNodesForPassage(child), " ") - if len(childText) > 0 { - parts = append(parts, childText) - } else { - parts = append(parts, child.Data) - } - case "sup": - isFootnote := func(node *html.Node) bool { - for _, attr := range node.Attr { - if attr.Key == "class" && attr.Val == "footnote" { - return true - } - } - return false - } - if isFootnote(child) { - break - } - childText := strings.Trim(ParseNodesForPassage(child), " ") - if len(childText) > 0 { - parts = append(parts, fmt.Sprintf("^%s^", childText)) +func isHeaderTag(tag string) bool { + return tag == "h1" || tag == "h2" || tag == "h3" || tag == "h4" +} + +func wrapText(text, tag string) string { + if strings.TrimSpace(text) == "" { + return text + } + + if tag == "sup" { + // User-specified format for superscript + return fmt.Sprintf("^%s^", strings.Trim(text, " ")) + } + if tag == "i" { + return fmt.Sprintf("_%s_", text) + } + if tag == "b" || isHeaderTag(tag) { + return fmt.Sprintf("*%s*", text) + } + return text +} + +func parseNode(node *html.Node) string { + if node.Type == html.TextNode { + return escapeMarkdownV2(node.Data) + } + + if node.Type != html.ElementNode { + var content strings.Builder + for c := node.FirstChild; c != nil; c = c.NextSibling { + content.WriteString(parseNode(c)) + } + return content.String() + } + + tag := node.Data + + // Handle non-formatting tags first + if tag == "br" { + return "\n" + } + if !isFormattingTag(tag) && !isHeaderTag(tag) { + var content strings.Builder + for c := node.FirstChild; c != nil; c = c.NextSibling { + content.WriteString(parseNode(c)) + } + return content.String() + } + + // Handle formatting tags (b, i, sup, h1-h4) + if tag == "sup" { + for _, attr := range node.Attr { + if attr.Key == "class" && attr.Val == "footnote" { + return "" // Ignore footnote nodes } - break - case "p", "i": - parts = append(parts, ParseNodesForPassage(child)) - break - case "br": - parts = append(parts, "\n") - break - default: - parts = append(parts, child.Data) } } - text = strings.Join(parts, "") + var content strings.Builder + var textBuffer strings.Builder - if node.Data == "h1" || node.Data == "h2" || node.Data == "h3" || node.Data == "h4" { - text = fmt.Sprintf("*%s*", text) + flushTextBuffer := func() { + if textBuffer.Len() > 0 { + content.WriteString(wrapText(textBuffer.String(), tag)) + textBuffer.Reset() + } } - return text + + for c := node.FirstChild; c != nil; c = c.NextSibling { + if c.Type == html.ElementNode && (isFormattingTag(c.Data) || isHeaderTag(c.Data)) { + flushTextBuffer() + content.WriteString(parseNode(c)) + } else { + textBuffer.WriteString(parseNode(c)) + } + } + flushTextBuffer() + + return content.String() } func ParsePassageFromHtml(rawHtml string) string { @@ -92,8 +142,7 @@ func ParsePassageFromHtml(rawHtml string) string { log.Printf("Error parsing html: %v", err) return rawHtml } - - return ParseNodesForPassage(doc) + return parseNode(doc) } // Deprecated: Using new API service @@ -123,7 +172,7 @@ func GetPassage(ref string, doc *html.Node, version string) string { return false }) - textBlocks := utils.MapNodeListToString(filtNodes, ParseNodesForPassage) + textBlocks := utils.MapNodeListToString(filtNodes, parseNode) var passage strings.Builder @@ -177,6 +226,7 @@ func GetBiblePassage(env def.SessionData) def.SessionData { } // Deprecated: Using new API service logic inside GetBiblePassage +// Deprecated: Using new API service func CheckBibleReference(ref string) bool { log.Printf("Checking reference %s", ref) diff --git a/pkg/app/passage_test.go b/pkg/app/passage_test.go index ee4fde3..13860f8 100644 --- a/pkg/app/passage_test.go +++ b/pkg/app/passage_test.go @@ -78,7 +78,7 @@ func TestGetBiblePassage(t *testing.T) { env.User.Config = utils.SerializeUserConfig(conf) env = GetBiblePassage(env) - if env.Res.Message != "In the beginning God created the heavens and the earth." { + if env.Res.Message != `In the beginning God created the heavens and the earth\.` { t.Errorf("Expected passage text, got '%s'", env.Res.Message) } }) @@ -105,26 +105,57 @@ func TestGetBiblePassage(t *testing.T) { } func TestParsePassageFromHtml(t *testing.T) { - t.Run("Valid HTML", func(t *testing.T) { - html := "

12 But to all who did receive him, who believed in his name, he gave the right to become children of God,

" - expected := "12 But to all who did receive him, who believed in his name, he gave the right to become children of God," + t.Run("Valid HTML with superscript", func(t *testing.T) { + html := `

12 But to all who did receive him, who believed in his name, he gave the right to become children of God,

` + expected := `^12^But to all who did receive him, who believed in his name, he gave the right to become children of God,` if got := ParsePassageFromHtml(html); got != expected { t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected) } }) t.Run("HTML with italics", func(t *testing.T) { - html := "

This is italic.

" - expected := "This is italic." + html := `

This is italic.

` + expected := `_This is italic\._` + if got := ParsePassageFromHtml(html); got != expected { + t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected) + } + }) + + t.Run("HTML with bold", func(t *testing.T) { + html := `

This is bold.

` + expected := `*This is bold\.*` + if got := ParsePassageFromHtml(html); got != expected { + t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected) + } + }) + + t.Run("HTML with line breaks", func(t *testing.T) { + html := `

Line 1.
Line 2.

` + expected := "Line 1\\.\nLine 2\\." if got := ParsePassageFromHtml(html); got != expected { t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected) } }) t.Run("Invalid HTML", func(t *testing.T) { - html := "

This is malformed HTML" - // The parser should still try its best. In this case, it should just return the text. - expected := "This is malformed HTML" + html := `

This is malformed HTML` + expected := `This is malformed HTML` + if got := ParsePassageFromHtml(html); got != expected { + t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected) + } + }) + + t.Run("Nested HTML tags", func(t *testing.T) { + html := `

This is bold, and this is italic.

` + expected := `*This is bold, *_and this is italic\._` + if got := ParsePassageFromHtml(html); got != expected { + t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected) + } + }) + + t.Run("MarkdownV2 escaping", func(t *testing.T) { + html := `

This has special characters: *_. [hello](world)!

` + expected := `This has special characters: \*\_\. \[hello\]\(world\)\!` if got := ParsePassageFromHtml(html); got != expected { t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected) }