@@ -12,6 +12,7 @@ import (
1212 "strconv"
1313 "strings"
1414 "time"
15+ "unicode"
1516
1617 "github.com/go-shiori/dom"
1718 "github.com/go-shiori/go-readability/internal/re2go"
@@ -229,6 +230,63 @@ func (ps *Parser) getAllNodesWithTag(node *html.Node, tagNames ...string) []*htm
229230 return result
230231}
231232
233+ // hasTextContent reports whether a node or any of its descendants have text content other than spaces.
234+ func hasTextContent (node * html.Node ) bool {
235+ if node .Type == html .TextNode {
236+ return hasContent (node .Data )
237+ }
238+ for child := range node .ChildNodes () {
239+ if hasTextContent (child ) {
240+ return true
241+ }
242+ }
243+ return false
244+ }
245+
246+ // countCharsAndCommas returns counts for both characters and commas in a node's
247+ // text. Leading and trailing whitespace is not counted, nor are consecutive
248+ // runs of whitespace.
249+ func countCharsAndCommas (node * html.Node ) (int , int ) {
250+ numChars := 0
251+ numCommas := 0
252+ lastCharWasSpace := false
253+ seenNonSpace := false
254+
255+ // Walk the node and its descendants to count all non-space characters and
256+ // different comma variants separately.
257+ var walk func (* html.Node )
258+ walk = func (n * html.Node ) {
259+ if n .Type == html .TextNode {
260+ for _ , r := range n .Data {
261+ if unicode .IsSpace (r ) {
262+ lastCharWasSpace = true
263+ continue
264+ }
265+ if lastCharWasSpace && seenNonSpace {
266+ numChars += 2
267+ } else {
268+ numChars += 1
269+ }
270+ lastCharWasSpace = false
271+ seenNonSpace = true
272+ switch r {
273+ // Commas as used in Latin, Sindhi, Chinese and various other scripts.
274+ // see: https://en.wikipedia.org/wiki/Comma#Comma_variants
275+ case '\u002C' , '\u060C' , '\uFE50' , '\uFE10' , '\uFE11' , '\u2E41' , '\u2E34' , '\u2E32' , '\uFF0C' :
276+ numCommas ++
277+ }
278+ }
279+ return
280+ }
281+ for child := range n .ChildNodes () {
282+ walk (child )
283+ }
284+ }
285+
286+ walk (node )
287+ return numChars , numCommas
288+ }
289+
232290// cleanClasses removes the class="" attribute from every element in the
233291// given subtree, except those that match CLASSES_TO_PRESERVE and the
234292// classesToPreserve array from the options object.
@@ -327,7 +385,7 @@ func (ps *Parser) simplifyNestedElements(articleContent *html.Node) {
327385
328386 if node .Parent != nil && (nodeTagName == "div" || nodeTagName == "section" ) &&
329387 ! strings .HasPrefix (nodeID , "readability" ) {
330- if ps . isElementWithoutContent (node ) {
388+ if isElementWithoutContent (node ) {
331389 node = ps .removeAndGetNext (node )
332390 continue
333391 }
@@ -405,8 +463,7 @@ func (ps *Parser) getArticleTitle() string {
405463 }
406464 }
407465
408- curTitle = strings .TrimSpace (curTitle )
409- curTitle = re2go .NormalizeSpaces (curTitle )
466+ curTitle = normalizeWhitespace (curTitle )
410467 // If we now have 4 words or fewer as our title, and either no
411468 // 'hierarchical' separators (\, /, > or ») were found in the original
412469 // title or we decreased the number of words by more than 1 word, use
@@ -589,7 +646,7 @@ func (ps *Parser) prepArticle(articleContent *html.Node) {
589646 iframeCount := len (dom .GetElementsByTagName (p , "iframe" ))
590647 totalCount := imgCount + embedCount + objectCount + iframeCount
591648
592- return totalCount == 0 && ps . getInnerText ( p , false ) == ""
649+ return totalCount == 0 && ! hasTextContent ( p )
593650 })
594651
595652 ps .forEachNode (dom .GetElementsByTagName (articleContent , "br" ), func (br * html.Node , _ int ) {
@@ -714,28 +771,32 @@ func (ps *Parser) checkByline(node *html.Node, matchString string) bool {
714771
715772 rel := dom .GetAttribute (node , "rel" )
716773 itemprop := dom .GetAttribute (node , "itemprop" )
717- nodeText := dom .TextContent (node )
718- if (rel == "author" || strings .Contains (itemprop , "author" ) || re2go .IsByline (matchString )) &&
719- ps .isValidByline (nodeText ) {
720- nodeText = strings .TrimSpace (nodeText )
721- nodeText = strings .Join (strings .Fields (nodeText ), " " )
722- ps .articleByline = nodeText
723- return true
774+ if rel != "author" && ! strings .Contains (itemprop , "author" ) && ! re2go .IsByline (matchString ) {
775+ return false
724776 }
725777
778+ nodeText := ps .getInnerText (node , false )
779+ // For now, it's intentional that counting characters happens before
780+ // whitespace normalization. Doing it the other way around breaks several
781+ // tests and the bylines end up different.
782+ if nChar := charCount (nodeText ); nChar > 0 && nChar < 100 {
783+ ps .articleByline = normalizeWhitespace (nodeText )
784+ return true
785+ }
726786 return false
727787}
728788
729789func (ps * Parser ) getTextDensity (node * html.Node , tags ... string ) float64 {
730- textLength := charCount ( ps . getInnerText ( node , true ) )
790+ textLength , _ := countCharsAndCommas ( node )
731791 if textLength == 0 {
732792 return 0
733793 }
734794
735795 var childrenLength int
736796 children := ps .getAllNodesWithTag (node , tags ... )
737797 ps .forEachNode (children , func (child * html.Node , _ int ) {
738- childrenLength += charCount (ps .getInnerText (child , true ))
798+ childLength , _ := countCharsAndCommas (child )
799+ childrenLength += childLength
739800 })
740801
741802 return float64 (childrenLength ) / float64 (textLength )
@@ -810,13 +871,14 @@ func (ps *Parser) grabArticle() *html.Node {
810871 // Check to see if this node is a byline, and remove it if
811872 // it is true.
812873 if ps .checkByline (node , matchString ) {
874+ ps .logf ("found byline: %q" , dom .OuterHTML (node ))
813875 node = ps .removeAndGetNext (node )
814876 continue
815877 }
816878
817879 if shouldRemoveTitleHeader && ps .headerDuplicatesTitle (node ) {
818880 ps .logf ("removing header: %q duplicate of %q\n " ,
819- trim ( dom . TextContent (node )), trim (ps .articleTitle ))
881+ ps . getInnerText (node , true ), normalizeWhitespace (ps .articleTitle ))
820882 shouldRemoveTitleHeader = false
821883 node = ps .removeAndGetNext (node )
822884 continue
@@ -848,7 +910,7 @@ func (ps *Parser) grabArticle() *html.Node {
848910 switch nodeTagName {
849911 case "div" , "section" , "header" ,
850912 "h1" , "h2" , "h3" , "h4" , "h5" , "h6" :
851- if ps . isElementWithoutContent (node ) {
913+ if isElementWithoutContent (node ) {
852914 node = ps .removeAndGetNext (node )
853915 continue
854916 }
@@ -911,9 +973,9 @@ func (ps *Parser) grabArticle() *html.Node {
911973 return
912974 }
913975
976+ numChars , numCommas := countCharsAndCommas (elementToScore )
914977 // If this paragraph is less than 25 characters, don't even count it.
915- innerText := ps .getInnerText (elementToScore , true )
916- if charCount (innerText ) < 25 {
978+ if numChars < 25 {
917979 return
918980 }
919981
@@ -927,10 +989,10 @@ func (ps *Parser) grabArticle() *html.Node {
927989 contentScore := 1
928990
929991 // Add points for any commas within this paragraph.
930- contentScore += re2go . CountCommas ( innerText )
992+ contentScore += numCommas
931993
932994 // For every 100 characters in this paragraph, add another point. Up to 3 points.
933- contentScore += int (math .Min (math .Floor (float64 (charCount ( innerText ) )/ 100.0 ), 3.0 ))
995+ contentScore += int (math .Min (math .Floor (float64 (numChars )/ 100.0 ), 3.0 ))
934996
935997 // Initialize and score ancestors.
936998 ps .forEachNode (ancestors , func (ancestor * html.Node , level int ) {
@@ -1199,7 +1261,7 @@ func (ps *Parser) grabArticle() *html.Node {
11991261 // gives us a higher likelihood of finding the content, and
12001262 // the sieve approach gives us a higher likelihood of
12011263 // finding the -right- content.
1202- textLength := charCount ( ps . getInnerText ( articleContent , true ) )
1264+ textLength , _ := countCharsAndCommas ( articleContent )
12031265 if textLength < ps .CharThresholds {
12041266 parseSuccessful = false
12051267
@@ -1249,15 +1311,6 @@ func (ps *Parser) grabArticle() *html.Node {
12491311 }
12501312}
12511313
1252- // isValidByline checks whether the input string could be a byline.
1253- // This verifies that the input is a string, and that the length
1254- // is less than 100 chars.
1255- func (ps * Parser ) isValidByline (byline string ) bool {
1256- byline = strings .TrimSpace (byline )
1257- nChar := charCount (byline )
1258- return nChar > 0 && nChar < 100
1259- }
1260-
12611314// getJSONLD try to extract metadata from JSON-LD object.
12621315// For now, only Schema.org objects of type Article or its subtypes are supported.
12631316func (ps * Parser ) getJSONLD () (map [string ]string , error ) {
@@ -1515,8 +1568,7 @@ func (ps *Parser) isSingleImage(node *html.Node) bool {
15151568 }
15161569
15171570 children := dom .Children (node )
1518- textContent := dom .TextContent (node )
1519- if len (children ) != 1 || strings .TrimSpace (textContent ) != "" {
1571+ if len (children ) != 1 || hasTextContent (node ) {
15201572 return false
15211573 }
15221574
@@ -1623,16 +1675,25 @@ func (ps *Parser) hasSingleTagInsideElement(element *html.Node, tag string) bool
16231675 })
16241676}
16251677
1626- // isElementWithoutContent determines if node is empty
1627- // or only fille with <br> and <hr>.
1628- func (ps * Parser ) isElementWithoutContent (node * html.Node ) bool {
1629- brs := dom .GetElementsByTagName (node , "br" )
1630- hrs := dom .GetElementsByTagName (node , "hr" )
1631- childs := dom .Children (node )
1632-
1633- return node .Type == html .ElementNode &&
1634- strings .TrimSpace (dom .TextContent (node )) == "" &&
1635- (len (childs ) == 0 || len (childs ) == len (brs )+ len (hrs ))
1678+ func isElementWithoutContent (node * html.Node ) bool {
1679+ if node .Type != html .ElementNode {
1680+ return false
1681+ }
1682+ // Traverse the node's descendants to find any text content that is
1683+ // non-whitespace or any elements other than <br> and <hr>.
1684+ foundContent := false
1685+ for child := range node .ChildNodes () {
1686+ if child .Type == html .TextNode {
1687+ if hasContent (child .Data ) {
1688+ foundContent = true
1689+ break
1690+ }
1691+ } else if child .Type == html .ElementNode && child .Data != "br" && child .Data != "hr" {
1692+ foundContent = true
1693+ break
1694+ }
1695+ }
1696+ return ! foundContent
16361697}
16371698
16381699// hasChildBlockElement determines whether element has any children
@@ -1654,26 +1715,18 @@ func (ps *Parser) isPhrasingContent(node *html.Node) bool {
16541715
16551716// isWhitespace determines if a node only used as whitespace.
16561717func (ps * Parser ) isWhitespace (node * html.Node ) bool {
1657- return (node .Type == html .TextNode && strings . TrimSpace ( dom . TextContent ( node )) == "" ) ||
1718+ return (node .Type == html .TextNode && ! hasTextContent ( node )) ||
16581719 (node .Type == html .ElementNode && dom .TagName (node ) == "br" )
16591720}
16601721
1661- // getInnerText gets the inner text of a node.
1662- // This also strips * out any excess whitespace to be found.
1663- // In Readability.js, normalizeSpaces default to true.
1722+ // getInnerText gets the inner text of a node. This also strips out any excess
1723+ // whitespace to be found. In Readability.js, normalizeSpaces defaults to true.
16641724func (ps * Parser ) getInnerText (node * html.Node , normalizeSpaces bool ) string {
1665- textContent := strings . TrimSpace ( dom .TextContent (node ) )
1725+ textContent := dom .TextContent (node )
16661726 if normalizeSpaces {
1667- textContent = re2go . NormalizeSpaces (textContent )
1727+ return normalizeWhitespace (textContent )
16681728 }
1669- return textContent
1670- }
1671-
1672- // getCharCount returns the number of times a string s
1673- // appears in the node.
1674- func (ps * Parser ) getCharCount (node * html.Node , s string ) int {
1675- innerText := ps .getInnerText (node , true )
1676- return strings .Count (innerText , s )
1729+ return strings .TrimSpace (textContent )
16771730}
16781731
16791732// cleanStyles removes the style attribute on every node and under.
@@ -1702,7 +1755,7 @@ func (ps *Parser) cleanStyles(node *html.Node) {
17021755// content. This is the amount of text that is inside a link divided
17031756// by the total text in the node.
17041757func (ps * Parser ) getLinkDensity (element * html.Node ) float64 {
1705- textLength := charCount ( ps . getInnerText ( element , true ) )
1758+ textLength , _ := countCharsAndCommas ( element )
17061759 if textLength == 0 {
17071760 return 0
17081761 }
@@ -1717,7 +1770,7 @@ func (ps *Parser) getLinkDensity(element *html.Node) float64 {
17171770 coefficient = 0.3
17181771 }
17191772
1720- nodeLength := charCount ( ps . getInnerText ( linkNode , true ) )
1773+ nodeLength , _ := countCharsAndCommas ( linkNode )
17211774 linkLength += float64 (nodeLength ) * coefficient
17221775 })
17231776
@@ -2019,10 +2072,11 @@ func (ps *Parser) cleanConditionally(element *html.Node, tag string) {
20192072 var listLength int
20202073 listNodes := ps .getAllNodesWithTag (node , "ul" , "ol" )
20212074 ps .forEachNode (listNodes , func (list * html.Node , _ int ) {
2022- listLength += charCount (ps .getInnerText (list , true ))
2075+ n , _ := countCharsAndCommas (list )
2076+ listLength += n
20232077 })
20242078
2025- nodeLength := charCount ( ps . getInnerText ( node , true ) )
2079+ nodeLength , _ := countCharsAndCommas ( node )
20262080 isList = float64 (listLength )/ float64 (nodeLength ) > 0.9
20272081 }
20282082
@@ -2041,14 +2095,16 @@ func (ps *Parser) cleanConditionally(element *html.Node, tag string) {
20412095 return true
20422096 }
20432097
2044- if ps .getCharCount (node , "," ) < 10 {
2098+ // FIXME: countCharsAndCommas(node) was already called for non-lists above
2099+ if contentLength , commaCount := countCharsAndCommas (node ); commaCount < 10 {
20452100 // If there are not very many commas, and the number of
20462101 // non-paragraph elements is more than paragraphs or other
20472102 // ominous signs, remove the element.
20482103 p := float64 (len (dom .GetElementsByTagName (node , "p" )))
20492104 img := float64 (len (dom .GetElementsByTagName (node , "img" )))
20502105 li := float64 (len (dom .GetElementsByTagName (node , "li" )) - 100 )
20512106 input := float64 (len (dom .GetElementsByTagName (node , "input" )))
2107+ // FIXME: this also calls countCharsAndCommas(node)
20522108 headingDensity := ps .getTextDensity (node , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" )
20532109
20542110 embedCount := 0
@@ -2071,8 +2127,8 @@ func (ps *Parser) cleanConditionally(element *html.Node, tag string) {
20712127 embedCount ++
20722128 }
20732129
2130+ // FIXME: this also calls countCharsAndCommas(node)
20742131 linkDensity := ps .getLinkDensity (node )
2075- contentLength := charCount (ps .getInnerText (node , true ))
20762132 haveToRemove := (img > 1 && p / img < 0.5 && ! ps .hasAncestorTag (node , "figure" , 3 , nil )) ||
20772133 (! isList && li > p ) ||
20782134 (input > math .Floor (p / 3 )) ||
0 commit comments