go-shiori
diff --git a/parser.go b/parser.go
Lines changed: 117 additions & 61 deletions
@@ -12,6 +12,7 @@ import (
 	"strconv"
 	"strings"
 	"time"
+	"unicode"
 
 	"github.com/go-shiori/dom"
 	"github.com/go-shiori/go-readability/internal/re2go"
@@ -229,6 +230,63 @@ func (ps *Parser) getAllNodesWithTag(node *html.Node, tagNames ...string) []*htm
 	return result
 }
 
+// hasTextContent reports whether node or any of its descendants is a text node with content other than spaces, as judged by hasContent.
+func hasTextContent(node *html.Node) bool {
+	if node.Type == html.TextNode {
+		return hasContent(node.Data)
+	}
+	for child := range node.ChildNodes() {
+		if hasTextContent(child) {
+			return true
+		}
+	}
+	return false
+}
+
+// countCharsAndCommas returns the character count and the comma count of the
+// text under node. Leading and trailing whitespace is not counted, and each
+// interior run of whitespace (per unicode.IsSpace) counts as one character.
+func countCharsAndCommas(node *html.Node) (int, int) {
+	numChars := 0
+	numCommas := 0
+	lastCharWasSpace := false
+	seenNonSpace := false
+
+	// Walk node and its descendants depth-first. The whitespace state lives
+	// outside the closure so a run spanning several text nodes counts once.
+	var walk func(*html.Node)
+	walk = func(n *html.Node) {
+		if n.Type == html.TextNode {
+			for _, r := range n.Data {
+				if unicode.IsSpace(r) {
+					lastCharWasSpace = true
+					continue
+				}
+				if lastCharWasSpace && seenNonSpace {
+					// Count the interior whitespace run that preceded this rune.
+					numChars++
+				}
+				numChars++
+				lastCharWasSpace = false
+				seenNonSpace = true
+				switch r {
+				// Commas as used in Latin, Sindhi, Chinese and various other scripts.
+				// see: https://en.wikipedia.org/wiki/Comma#Comma_variants
+				case '\u002C', '\u060C', '\uFE50', '\uFE10', '\uFE11', '\u2E41', '\u2E34', '\u2E32', '\uFF0C':
+					numCommas++
+				}
+			}
+			return
+		}
+		for child := range n.ChildNodes() {
+			walk(child)
+		}
+	}
+
+	walk(node)
+	return numChars, numCommas
+}
+
 // cleanClasses removes the class="" attribute from every element in the
 // given subtree, except those that match CLASSES_TO_PRESERVE and the
 // classesToPreserve array from the options object.
@@ -327,7 +385,7 @@ func (ps *Parser) simplifyNestedElements(articleContent *html.Node) {
 
 		if node.Parent != nil && (nodeTagName == "div" || nodeTagName == "section") &&
 			!strings.HasPrefix(nodeID, "readability") {
-			if ps.isElementWithoutContent(node) {
+			if isElementWithoutContent(node) {
 				node = ps.removeAndGetNext(node)
 				continue
 			}
@@ -405,8 +463,7 @@ func (ps *Parser) getArticleTitle() string {
 		}
 	}
 
-	curTitle = strings.TrimSpace(curTitle)
-	curTitle = re2go.NormalizeSpaces(curTitle)
+	curTitle = normalizeWhitespace(curTitle)
 	// If we now have 4 words or fewer as our title, and either no
 	// 'hierarchical' separators (\, /, > or ») were found in the original
 	// title or we decreased the number of words by more than 1 word, use
@@ -589,7 +646,7 @@ func (ps *Parser) prepArticle(articleContent *html.Node) {
 		iframeCount := len(dom.GetElementsByTagName(p, "iframe"))
 		totalCount := imgCount + embedCount + objectCount + iframeCount
 
-		return totalCount == 0 && ps.getInnerText(p, false) == ""
+		return totalCount == 0 && !hasTextContent(p)
 	})
 
 	ps.forEachNode(dom.GetElementsByTagName(articleContent, "br"), func(br *html.Node, _ int) {
@@ -714,28 +771,32 @@ func (ps *Parser) checkByline(node *html.Node, matchString string) bool {
 
 	rel := dom.GetAttribute(node, "rel")
 	itemprop := dom.GetAttribute(node, "itemprop")
-	nodeText := dom.TextContent(node)
-	if (rel == "author" || strings.Contains(itemprop, "author") || re2go.IsByline(matchString)) &&
-		ps.isValidByline(nodeText) {
-		nodeText = strings.TrimSpace(nodeText)
-		nodeText = strings.Join(strings.Fields(nodeText), " ")
-		ps.articleByline = nodeText
-		return true
+	if rel != "author" && !strings.Contains(itemprop, "author") && !re2go.IsByline(matchString) {
+		return false
 	}
 
+	nodeText := ps.getInnerText(node, false)
+	// For now, it's intentional that counting characters happens before
+	// whitespace normalization. Doing it the other way around breaks several
+	// tests and the bylines end up different.
+	if nChar := charCount(nodeText); nChar > 0 && nChar < 100 {
+		ps.articleByline = normalizeWhitespace(nodeText)
+		return true
+	}
 	return false
 }
 
 func (ps *Parser) getTextDensity(node *html.Node, tags ...string) float64 {
-	textLength := charCount(ps.getInnerText(node, true))
+	textLength, _ := countCharsAndCommas(node)
 	if textLength == 0 {
 		return 0
 	}
 
 	var childrenLength int
 	children := ps.getAllNodesWithTag(node, tags...)
 	ps.forEachNode(children, func(child *html.Node, _ int) {
-		childrenLength += charCount(ps.getInnerText(child, true))
+		childLength, _ := countCharsAndCommas(child)
+		childrenLength += childLength
 	})
 
 	return float64(childrenLength) / float64(textLength)
@@ -810,13 +871,14 @@ func (ps *Parser) grabArticle() *html.Node {
 			// Check to see if this node is a byline, and remove it if
 			// it is true.
 			if ps.checkByline(node, matchString) {
+				ps.logf("found byline: %q", dom.OuterHTML(node))
 				node = ps.removeAndGetNext(node)
 				continue
 			}
 
 			if shouldRemoveTitleHeader && ps.headerDuplicatesTitle(node) {
 				ps.logf("removing header: %q duplicate of %q\n",
-					trim(dom.TextContent(node)), trim(ps.articleTitle))
+					ps.getInnerText(node, true), normalizeWhitespace(ps.articleTitle))
 				shouldRemoveTitleHeader = false
 				node = ps.removeAndGetNext(node)
 				continue
@@ -848,7 +910,7 @@ func (ps *Parser) grabArticle() *html.Node {
 			switch nodeTagName {
 			case "div", "section", "header",
 				"h1", "h2", "h3", "h4", "h5", "h6":
-				if ps.isElementWithoutContent(node) {
+				if isElementWithoutContent(node) {
 					node = ps.removeAndGetNext(node)
 					continue
 				}
@@ -911,9 +973,9 @@ func (ps *Parser) grabArticle() *html.Node {
 				return
 			}
 
+			numChars, numCommas := countCharsAndCommas(elementToScore)
 			// If this paragraph is less than 25 characters, don't even count it.
-			innerText := ps.getInnerText(elementToScore, true)
-			if charCount(innerText) < 25 {
+			if numChars < 25 {
 				return
 			}
 
@@ -927,10 +989,10 @@ func (ps *Parser) grabArticle() *html.Node {
 			contentScore := 1
 
 			// Add points for any commas within this paragraph.
-			contentScore += re2go.CountCommas(innerText)
+			contentScore += numCommas
 
 			// For every 100 characters in this paragraph, add another point. Up to 3 points.
-			contentScore += int(math.Min(math.Floor(float64(charCount(innerText))/100.0), 3.0))
+			contentScore += int(math.Min(math.Floor(float64(numChars)/100.0), 3.0))
 
 			// Initialize and score ancestors.
 			ps.forEachNode(ancestors, func(ancestor *html.Node, level int) {
@@ -1199,7 +1261,7 @@ func (ps *Parser) grabArticle() *html.Node {
 		// gives us a higher likelihood of finding the content, and
 		// the sieve approach gives us a higher likelihood of
 		// finding the -right- content.
-		textLength := charCount(ps.getInnerText(articleContent, true))
+		textLength, _ := countCharsAndCommas(articleContent)
 		if textLength < ps.CharThresholds {
 			parseSuccessful = false
 
@@ -1249,15 +1311,6 @@ func (ps *Parser) grabArticle() *html.Node {
 	}
 }
 
-// isValidByline checks whether the input string could be a byline.
-// This verifies that the input is a string, and that the length
-// is less than 100 chars.
-func (ps *Parser) isValidByline(byline string) bool {
-	byline = strings.TrimSpace(byline)
-	nChar := charCount(byline)
-	return nChar > 0 && nChar < 100
-}
-
 // getJSONLD try to extract metadata from JSON-LD object.
 // For now, only Schema.org objects of type Article or its subtypes are supported.
 func (ps *Parser) getJSONLD() (map[string]string, error) {
@@ -1515,8 +1568,7 @@ func (ps *Parser) isSingleImage(node *html.Node) bool {
 	}
 
 	children := dom.Children(node)
-	textContent := dom.TextContent(node)
-	if len(children) != 1 || strings.TrimSpace(textContent) != "" {
+	if len(children) != 1 || hasTextContent(node) {
 		return false
 	}
 
@@ -1623,16 +1675,25 @@ func (ps *Parser) hasSingleTagInsideElement(element *html.Node, tag string) bool
 	})
 }
 
-// isElementWithoutContent determines if node is empty
-// or only fille with <br> and <hr>.
-func (ps *Parser) isElementWithoutContent(node *html.Node) bool {
-	brs := dom.GetElementsByTagName(node, "br")
-	hrs := dom.GetElementsByTagName(node, "hr")
-	childs := dom.Children(node)
-
-	return node.Type == html.ElementNode &&
-		strings.TrimSpace(dom.TextContent(node)) == "" &&
-		(len(childs) == 0 || len(childs) == len(brs)+len(hrs))
+// isElementWithoutContent reports whether node is an element whose direct
+// children hold no content-bearing text and no element besides <br> or <hr>.
+func isElementWithoutContent(node *html.Node) bool {
+	if node.Type != html.ElementNode {
+		return false
+	}
+	for child := range node.ChildNodes() {
+		switch child.Type {
+		case html.TextNode:
+			if hasContent(child.Data) {
+				return false
+			}
+		case html.ElementNode:
+			if child.Data != "br" && child.Data != "hr" {
+				return false
+			}
+		}
+	}
+	return true
+}
 
 // hasChildBlockElement determines whether element has any children
@@ -1654,26 +1715,18 @@ func (ps *Parser) isPhrasingContent(node *html.Node) bool {
 
 // isWhitespace determines if a node only used as whitespace.
 func (ps *Parser) isWhitespace(node *html.Node) bool {
-	return (node.Type == html.TextNode && strings.TrimSpace(dom.TextContent(node)) == "") ||
+	return (node.Type == html.TextNode && !hasTextContent(node)) ||
 		(node.Type == html.ElementNode && dom.TagName(node) == "br")
 }
 
-// getInnerText gets the inner text of a node.
-// This also strips * out any excess whitespace to be found.
-// In Readability.js, normalizeSpaces default to true.
+// getInnerText returns node's text content: passed through normalizeWhitespace
+// when normalizeSpaces is true (the Readability.js default), else only trimmed.
 func (ps *Parser) getInnerText(node *html.Node, normalizeSpaces bool) string {
-	textContent := strings.TrimSpace(dom.TextContent(node))
+	textContent := dom.TextContent(node)
 	if normalizeSpaces {
-		textContent = re2go.NormalizeSpaces(textContent)
+		return normalizeWhitespace(textContent)
 	}
-	return textContent
-}
-
-// getCharCount returns the number of times a string s
-// appears in the node.
-func (ps *Parser) getCharCount(node *html.Node, s string) int {
-	innerText := ps.getInnerText(node, true)
-	return strings.Count(innerText, s)
+	return strings.TrimSpace(textContent)
 }
 
 // cleanStyles removes the style attribute on every node and under.
@@ -1702,7 +1755,7 @@ func (ps *Parser) cleanStyles(node *html.Node) {
 // content. This is the amount of text that is inside a link divided
 // by the total text in the node.
 func (ps *Parser) getLinkDensity(element *html.Node) float64 {
-	textLength := charCount(ps.getInnerText(element, true))
+	textLength, _ := countCharsAndCommas(element)
 	if textLength == 0 {
 		return 0
 	}
@@ -1717,7 +1770,7 @@ func (ps *Parser) getLinkDensity(element *html.Node) float64 {
 			coefficient = 0.3
 		}
 
-		nodeLength := charCount(ps.getInnerText(linkNode, true))
+		nodeLength, _ := countCharsAndCommas(linkNode)
 		linkLength += float64(nodeLength) * coefficient
 	})
 
@@ -2019,10 +2072,11 @@ func (ps *Parser) cleanConditionally(element *html.Node, tag string) {
 			var listLength int
 			listNodes := ps.getAllNodesWithTag(node, "ul", "ol")
 			ps.forEachNode(listNodes, func(list *html.Node, _ int) {
-				listLength += charCount(ps.getInnerText(list, true))
+				n, _ := countCharsAndCommas(list)
+				listLength += n
 			})
 
-			nodeLength := charCount(ps.getInnerText(node, true))
+			nodeLength, _ := countCharsAndCommas(node)
 			isList = float64(listLength)/float64(nodeLength) > 0.9
 		}
 
@@ -2041,14 +2095,16 @@ func (ps *Parser) cleanConditionally(element *html.Node, tag string) {
 			return true
 		}
 
-		if ps.getCharCount(node, ",") < 10 {
+		// FIXME: countCharsAndCommas(node) was already called for non-lists above
+		if contentLength, commaCount := countCharsAndCommas(node); commaCount < 10 {
 			// If there are not very many commas, and the number of
 			// non-paragraph elements is more than paragraphs or other
 			// ominous signs, remove the element.
 			p := float64(len(dom.GetElementsByTagName(node, "p")))
 			img := float64(len(dom.GetElementsByTagName(node, "img")))
 			li := float64(len(dom.GetElementsByTagName(node, "li")) - 100)
 			input := float64(len(dom.GetElementsByTagName(node, "input")))
+			// FIXME: this also calls countCharsAndCommas(node)
 			headingDensity := ps.getTextDensity(node, "h1", "h2", "h3", "h4", "h5", "h6")
 
 			embedCount := 0
@@ -2071,8 +2127,8 @@ func (ps *Parser) cleanConditionally(element *html.Node, tag string) {
 				embedCount++
 			}
 
+			// FIXME: this also calls countCharsAndCommas(node)
 			linkDensity := ps.getLinkDensity(node)
-			contentLength := charCount(ps.getInnerText(node, true))
 			haveToRemove := (img > 1 && p/img < 0.5 && !ps.hasAncestorTag(node, "figure", 3, nil)) ||
 				(!isList && li > p) ||
 				(input > math.Floor(p/3)) ||