@@ -8,9 +8,8 @@ Represents a single string document. This object has two fields,
88* `text::String`
99* `metadata::T`
1010
11- The `text` is automatically processed by first applying the replacements
12- from [`AUTOMATIC_REPLACEMENTS`](@ref), then replacing punctuation
13- matching `r"[.!?><\-\n\r\v\t\f ]"` by spaces, and finally by
11+ The `text` is automatically processed by applying the replacements
12+ from [`AUTOMATIC_REPLACEMENTS`](@ref) and
1413adding a space to the end of the document.
1514"""
1615struct Document{T<: NamedTuple }
@@ -19,26 +18,20 @@ struct Document{T<:NamedTuple}
1918
function Document(text::AbstractString, metadata::T) where {T}
    # `check_keys` is defined elsewhere in the project; presumably it validates
    # the field names of the metadata type -- confirm against its definition.
    check_keys(T)

    # Apply the automatic replacements, then add a final space to ensure that
    # the last word is recognized as a word boundary.
    new_text = apply_replacements(text) * " "
    return new{T}(new_text, metadata)
end
2427end
2528
# Convenience constructor: a document without metadata gets an empty `NamedTuple`.
Document(text::AbstractString) = Document(text, NamedTuple())
2730
"""
    apply_replacements(str::AbstractString, replacements=AUTOMATIC_REPLACEMENTS)

Return `str` after applying every `pattern => substitution` pair in
`replacements`, one after the other in iteration order, so that each
replacement operates on the output of the previous one.

`replacements` defaults to [`AUTOMATIC_REPLACEMENTS`](@ref). If it is empty,
`str` is returned unchanged.
"""
function apply_replacements(str::AbstractString, replacements=AUTOMATIC_REPLACEMENTS)
    # Fold `replace` over the pairs, threading the string through as accumulator;
    # see https://github.com/JuliaLang/julia/issues/29849#issuecomment-449535743
    return foldl(replace, replacements; init=str)
end
4336
"""
    AbstractQuery

Abstract supertype of the query types defined in this file, such as
`Query` and `FuzzyQuery`.
"""
abstract type AbstractQuery end
@@ -69,12 +62,12 @@ with one field:
6962
7063* `text::String`
7164
72- The text is preprocessed by removing punctuation in
73- the same way as for [`Document `](@ref)s .
65+ The `text` is automatically processed by applying the replacements
66+ from [`AUTOMATIC_REPLACEMENTS`](@ref).
7467"""
struct Query <: AbstractQuery
    # Query text, stored after the automatic replacements have been applied.
    text::String

    # Normalize the input with `apply_replacements`; `new` converts the
    # result to the `String` field type.
    Query(str::AbstractString) = new(apply_replacements(str))
end
7972
8073"""
@@ -155,15 +148,15 @@ with three fields:
155148* `dist::D`: the distance measure to use; defaults to `DamerauLevenshtein()`
156149* `threshold::T`: the maximum threshold allowed for a match; defaults to 2.
157150
158- The text is preprocessed by removing punctuation in
159- the same way as for [`Document `](@ref)s .
151+ The `text` is automatically processed by applying the replacements
152+ from [`AUTOMATIC_REPLACEMENTS`](@ref).
160153"""
struct FuzzyQuery{D,T} <: AbstractQuery
    # Query text, stored after the automatic replacements have been applied.
    text::String
    # Distance measure used for matching.
    dist::D
    # Maximum threshold allowed for a match.
    threshold::T

    # Normalize the text with `apply_replacements`; `dist` and `threshold`
    # are stored as given.
    function FuzzyQuery(str::AbstractString, dist::D, threshold::T) where {D,T}
        return new{D,T}(apply_replacements(str), dist, threshold)
    end
end
169162
0 commit comments