Skip to content

Commit 7323c44

Browse files
authored
use AUTOMATIC_REPLACEMENTS instead of separate punct_preprocess function (#14)
* use `AUTOMATIC_REPLACEMENTS` for punctuation preprocessing * format
1 parent 6ecb64c commit 7323c44

File tree

2 files changed

+31
-25
lines changed

2 files changed

+31
-25
lines changed

src/KeywordSearch.jl

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,27 @@ using Tables
77
export DamerauLevenshtein, FuzzyQuery, Query, Corpus, Document
88
export augment, explain, match_all, word_boundary, NamedQuery
99

10-
"""
11-
const AUTOMATIC_REPLACEMENTS::Vector{Pair{String, String}}
10+
@doc raw"""
11+
const AUTOMATIC_REPLACEMENTS::Vector{Pair{Union{Regex,String},String}}
1212
1313
A list of replacements to automatically perform when preprocessing a [`Document`](@ref).
1414
For example, if `KeywordSearch.AUTOMATIC_REPLACEMENTS == ["a" => "b"]`, then
1515
`Document("abc").text == "bbc "` instead of `"abc "` (`Document` also appends a trailing space).
16+
17+
By default, `AUTOMATIC_REPLACEMENTS` contains only one replacement,
18+
19+
```julia
20+
r"[.!?><\-\n\r\v\t\f]" => " "
21+
```
22+
23+
which replaces certain punctuation characters, whitespace, and newlines with a space.
24+
This replacement is needed for [`word_boundary`](@ref) to work correctly, but you
25+
can remove it with `empty!(KeywordSearch.AUTOMATIC_REPLACEMENTS)` if you wish.
26+
27+
You can also add other preprocessing directives by `push!`ing further replacements
28+
into `KeywordSearch.AUTOMATIC_REPLACEMENTS`.
1629
"""
17-
const AUTOMATIC_REPLACEMENTS = Pair{String,String}[]
30+
const AUTOMATIC_REPLACEMENTS = Pair{Union{Regex,String},String}[r"[.!?><\-\n\r\v\t\f]" => " "]
1831

1932
include("core.jl")
2033
include("corpus.jl")

src/core.jl

Lines changed: 15 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,8 @@ Represents a single string document. This object has two fields,
88
* `text::String`
99
* `metadata::T`
1010
11-
The `text` is automatically processed by first applying the replacements
12-
from [`AUTOMATIC_REPLACEMENTS`](@ref), then replacing punctuation
13-
matching `r"[.!?><\-\n\r\v\t\f]"` by spaces, and finally by
11+
The `text` is automatically processed by applying the replacements
12+
from [`AUTOMATIC_REPLACEMENTS`](@ref) and
1413
adding a space to the end of the document.
1514
"""
1615
struct Document{T<:NamedTuple}
@@ -19,26 +18,20 @@ struct Document{T<:NamedTuple}
1918

2019
function Document(text::AbstractString, metadata::T) where {T}
2120
check_keys(T)
22-
return new{T}(process_document(text), metadata)
21+
22+
# Add a final space to ensure that the last word is recognized
23+
# as a word boundary.
24+
new_text = apply_replacements(text) * " "
25+
return new{T}(new_text, metadata)
2326
end
2427
end
2528

2629
Document(text::AbstractString) = Document(text, NamedTuple())
2730

28-
function process_document(str::AbstractString)
31+
function apply_replacements(str::AbstractString)
2932
# Apply automatic replacements
3033
# using https://github.com/JuliaLang/julia/issues/29849#issuecomment-449535743
31-
str = foldl(replace, AUTOMATIC_REPLACEMENTS; init=str)
32-
# Replace punctuation with a space
33-
str = process_punct(str)
34-
# Add a final space to ensure that the last word is recognized
35-
# as a word boundary.
36-
str = str * " "
37-
return str
38-
end
39-
40-
function process_punct(str::AbstractString)
41-
return replace(str, r"[.!?><\-\n\r\v\t\f]" => " ")
34+
return foldl(replace, AUTOMATIC_REPLACEMENTS; init=str)
4235
end
4336

4437
abstract type AbstractQuery end
@@ -69,12 +62,12 @@ with one field:
6962
7063
* `text::String`
7164
72-
The text is preprocessed by removing punctuation in
73-
the same way as for [`Document`](@ref)s.
65+
The `text` is automatically processed by applying the replacements
66+
from [`AUTOMATIC_REPLACEMENTS`](@ref).
7467
"""
7568
struct Query <: AbstractQuery
7669
text::String
77-
Query(str::AbstractString) = new(process_punct(str))
70+
Query(str::AbstractString) = new(apply_replacements(str))
7871
end
7972

8073
"""
@@ -155,15 +148,15 @@ with three fields:
155148
* `dist::D`: the distance measure to use; defaults to `DamerauLevenshtein()`
156149
* `threshold::T`: the maximum threshold allowed for a match; defaults to 2.
157150
158-
The text is preprocessed by removing punctuation in
159-
the same way as for [`Document`](@ref)s.
151+
The `text` is automatically processed by applying the replacements
152+
from [`AUTOMATIC_REPLACEMENTS`](@ref).
160153
"""
161154
struct FuzzyQuery{D,T} <: AbstractQuery
162155
text::String
163156
dist::D
164157
threshold::T
165158
function FuzzyQuery(str::AbstractString, dist::D, threshold::T) where {D,T}
166-
return new{D,T}(process_punct(str), dist, threshold)
159+
return new{D,T}(apply_replacements(str), dist, threshold)
167160
end
168161
end
169162

0 commit comments

Comments
 (0)