Skip to content

Commit ebc3493

Browse files
committed
Support all-lowercase strings
1 parent dfe76e1 commit ebc3493

File tree

2 files changed

+18
-13
lines changed

2 files changed

+18
-13
lines changed

lib/index.test.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ for (const [language, locale, corpus] of [
1616
"Dr. John Smith, Jr. gave a lecture.",
1717
"As of 2.14.2023, the average price of apples is $1.31 per pound!",
1818
"There's pool, pinball, Ms. Pac Man, tvs for Football (or playoff baseball!!!!) and they are kid friendly on one half of the bar.",
19+
// all lowercase
20+
"i just spoke with michelle at cinergy.",
21+
"she said she's hearing a rumor that lonestar may be doing more work at the end of the month.",
22+
"kevin, could you please call jeff or gary to see if the rumor is true.",
23+
"i'd like to have a \"heads up\" if possible.",
24+
1925
],
2026
],
2127
[

lib/index.ts

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,19 @@ const makeSegmenter = cached((locale: string) => ({
1717
* Split a string into sentences, respecting common abbreviations.
1818
*/
1919
export function* splitBySentence(
20-
input: string,
20+
rawInput: string,
2121
locale: Intl.LocalesArgument = "en"
2222
): Generator<Intl.SegmentData> {
23-
if (!input || typeof input !== "string")
23+
if (!rawInput || typeof rawInput !== "string")
2424
throw new TypeError("input must be a string")
2525

2626
const { abbreviations, segmenter } = makeSegmenter(locale.toString())
2727
const rLastWord = /(?<=\s|^)\S+(?=\s+$)/
28+
const input = rawInput.replaceAll(/(?<=\.\s+)\S/g, (char) =>
29+
char.toLocaleUpperCase()
30+
)
2831

29-
let continuationIndex: number | undefined
30-
let continuation = ""
32+
let left = 0
3133
for (const { segment, index } of segmenter.segment(input)) {
3234
const match = segment.match(rLastWord)
3335

@@ -37,20 +39,17 @@ export function* splitBySentence(
3739
(abbreviations.has(match[0].toLocaleLowerCase(locale)) ||
3840
// 2. A closing parenthesis without a period.
3941
match[0].endsWith(")"))
40-
) {
41-
continuationIndex = continuationIndex ?? index
42-
continuation += segment
42+
)
4343
continue
44-
}
4544

45+
const right = index + segment.length
4646
yield {
47-
segment: continuation + segment,
48-
index: continuationIndex ?? index,
49-
input,
47+
segment: rawInput.slice(left, right),
48+
index: left,
49+
input: rawInput,
5050
}
5151

52-
continuation = ""
53-
continuationIndex = undefined
52+
left = right
5453
}
5554
}
5655

0 commit comments

Comments
 (0)