Skip to content

Commit 2ae4ed6

Browse files
authored
Merge pull request #964 from streamich/fix-diff-sfx
Fix diff suffix calculation
2 parents 5a0f830 + 84abf6c commit 2ae4ed6

File tree

2 files changed

+298
-5
lines changed

2 files changed

+298
-5
lines changed

packages/json-joy/src/util/diff/__tests__/str.spec.ts

Lines changed: 268 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,150 @@
1-
import {PATCH_OP_TYPE, type Patch, diff, diffEdit, overlap, normalize, apply, src, dst, invert} from '../str';
1+
import {PATCH_OP_TYPE, type Patch, diff, diffEdit, overlap, normalize, apply, src, dst, invert, pfx, sfx} from '../str';
22
import {assertPatch} from './util';
33

4+
describe('pfx()', () => {
5+
test('finds common prefixes', () => {
6+
expect(pfx('abc', 'b')).toEqual(0);
7+
expect(pfx('abc', 'a')).toEqual(1);
8+
expect(pfx('abc', 'ab')).toEqual(2);
9+
expect(pfx('abc', 'abc')).toEqual(3);
10+
expect(pfx('abc', 'abcd')).toEqual(3);
11+
expect(pfx('abc', 'abcde')).toEqual(3);
12+
expect(pfx('👨‍🍳', '👨‍🍳')).toEqual(5);
13+
expect(pfx('👨‍🍳', '👨‍🍳chef')).toEqual(5);
14+
expect(pfx('👨‍🍳chef', '👨‍🍳')).toEqual(5);
15+
expect(pfx('👨‍🍳👨‍🍳', '👨‍🍳')).toEqual(5);
16+
expect('👨‍🍳chef'.slice(0, 5)).toBe('👨‍🍳');
17+
});
18+
19+
test('handles grapheme clusters with ZWJ (Zero Width Joiner)', () => {
20+
const family = '👨‍👩‍👧‍👦';
21+
expect(pfx(family, family)).toEqual(11);
22+
expect(pfx(family + 'abc', family)).toEqual(11);
23+
expect(pfx(family + 'abc', family + 'xyz')).toEqual(11);
24+
expect(pfx('prefix' + family, 'prefix' + family)).toEqual(6 + 11);
25+
const womanTech = '👩🏽‍💻';
26+
expect(pfx(womanTech, womanTech)).toEqual(7);
27+
expect(pfx(womanTech + 'code', womanTech)).toEqual(7);
28+
expect(pfx('hello' + womanTech, 'hello' + womanTech)).toEqual(5 + 7);
29+
});
30+
31+
test('handles flag emojis (regional indicators)', () => {
32+
const usFlag = '🇺🇸';
33+
const ukFlag = '🇬🇧';
34+
expect(pfx(usFlag, usFlag)).toEqual(4);
35+
expect(pfx(usFlag + 'USA', usFlag)).toEqual(4);
36+
expect(pfx(usFlag, ukFlag)).toEqual(0);
37+
expect(pfx('hello' + usFlag, 'hello' + usFlag)).toEqual(5 + 4);
38+
});
39+
40+
test('handles combining diacritical marks', () => {
41+
const combining = 'e\u0301'; // e + combining acute accent
42+
expect(pfx(combining, combining)).toEqual(2);
43+
expect(pfx(combining + 'llo', combining)).toEqual(2);
44+
expect(pfx('hello' + combining, 'hello' + combining)).toEqual(5 + 2);
45+
46+
// Multiple combining marks
47+
const multiCombining = 'a\u0301\u0302\u0303';
48+
expect(pfx(multiCombining, multiCombining)).toEqual(4);
49+
});
50+
51+
test('handles variation selectors', () => {
52+
const heartText = '❤\uFE0E'; // text style
53+
const heartEmoji = '❤\uFE0F'; // emoji style
54+
expect(pfx(heartText, heartText)).toEqual(2);
55+
expect(pfx(heartEmoji, heartEmoji)).toEqual(2);
56+
expect(pfx(heartText, heartEmoji)).toEqual(1); // Only the base character matches
57+
});
58+
59+
test('handles mixed grapheme clusters', () => {
60+
const chef = '👨‍🍳';
61+
const family = '👨‍👩‍👧‍👦';
62+
const combined = chef + family;
63+
expect(pfx(combined, combined)).toEqual(16);
64+
expect(pfx(combined + 'text', combined)).toEqual(16);
65+
expect(pfx('abc' + combined, 'abc' + combined)).toEqual(3 + 16);
66+
});
67+
});
68+
69+
describe('sfx()', () => {
70+
test('finds common suffixes', () => {
71+
expect(sfx('abc', 'b')).toEqual(0);
72+
expect(sfx('abc', 'c')).toEqual(1);
73+
expect(sfx('abc', 'bc')).toEqual(2);
74+
expect(sfx('abc', 'abc')).toEqual(3);
75+
expect(sfx('abc', '_abc')).toEqual(3);
76+
expect(sfx('abc', 'abcd')).toEqual(0);
77+
expect(sfx('👨‍🍳', '👨‍🍳')).toEqual(5);
78+
expect(sfx('👨‍🍳', '👨‍🍳chef')).toEqual(0);
79+
expect(sfx('👨‍🍳chef', '👨‍🍳')).toEqual(0);
80+
expect(sfx('👨‍🍳', 'chef👨‍🍳')).toEqual(5);
81+
expect(sfx('chef👨‍🍳', '👨‍🍳')).toEqual(5);
82+
expect(sfx('👨‍🍳👨‍🍳', '👨‍🍳')).toEqual(5);
83+
});
84+
85+
test('handles grapheme clusters with ZWJ (Zero Width Joiner)', () => {
86+
const family = '👨‍👩‍👧‍👦';
87+
expect(sfx(family, family)).toEqual(11);
88+
expect(sfx('abc' + family, family)).toEqual(11);
89+
expect(sfx('xyz' + family, 'abc' + family)).toEqual(11);
90+
expect(sfx(family + 'suffix', family + 'suffix')).toEqual(6 + 11);
91+
const womanTech = '👩🏽‍💻';
92+
expect(sfx(womanTech, womanTech)).toEqual(7);
93+
expect(sfx('code' + womanTech, womanTech)).toEqual(7);
94+
expect(sfx(womanTech + 'hello', womanTech + 'hello')).toEqual(5 + 7);
95+
});
96+
97+
test('handles flag emojis (regional indicators)', () => {
98+
const usFlag = '🇺🇸';
99+
const ukFlag = '🇬🇧';
100+
expect(sfx(usFlag, usFlag)).toEqual(4);
101+
expect(sfx('USA' + usFlag, usFlag)).toEqual(4);
102+
expect(sfx(usFlag, ukFlag)).toEqual(0);
103+
expect(sfx(usFlag + 'hello', usFlag + 'hello')).toEqual(5 + 4);
104+
});
105+
106+
test('handles combining diacritical marks', () => {
107+
const combining = 'e\u0301'; // e + combining acute accent
108+
expect(sfx(combining, combining)).toEqual(2);
109+
expect(sfx('ell' + combining, combining)).toEqual(2);
110+
expect(sfx(combining + 'hello', combining + 'hello')).toEqual(5 + 2);
111+
const multiCombining = 'a\u0301\u0302\u0303'; // a with multiple accents
112+
expect(sfx(multiCombining, multiCombining)).toEqual(4);
113+
expect(sfx('text' + multiCombining, multiCombining)).toEqual(4);
114+
});
115+
116+
test('handles variation selectors', () => {
117+
const heartText = '❤\uFE0E'; // text style
118+
const heartEmoji = '❤\uFE0F'; // emoji style
119+
expect(sfx(heartText, heartText)).toEqual(2);
120+
expect(sfx(heartEmoji, heartEmoji)).toEqual(2);
121+
expect(sfx(heartText, heartEmoji)).toEqual(0);
122+
expect(sfx('love' + heartEmoji, heartEmoji)).toEqual(2);
123+
});
124+
125+
test('handles mixed grapheme clusters', () => {
126+
const chef = '👨‍🍳';
127+
const family = '👨‍👩‍👧‍👦';
128+
const combined = family + chef;
129+
expect(sfx(combined, combined)).toEqual(16);
130+
expect(sfx('text' + combined, combined)).toEqual(16);
131+
expect(sfx(combined + 'abc', combined + 'abc')).toEqual(3 + 16);
132+
});
133+
134+
test('does not split grapheme clusters at boundaries', () => {
135+
const chef = '👨‍🍳';
136+
const family = '👨‍👩‍👧‍👦';
137+
138+
// Ensure we don't split in the middle of a grapheme cluster
139+
expect(sfx('x' + chef, chef)).toEqual(5); // full chef emoji
140+
expect(sfx('xy' + family, family)).toEqual(11); // full family emoji
141+
142+
// When the suffix is part of a larger grapheme, it should not match partially
143+
expect(sfx('👨‍🍳👩', '👩')).toEqual(2); // Just the woman emoji at end
144+
expect(sfx('text👨‍🍳', '👨‍🍳')).toEqual(5); // Full chef emoji
145+
});
146+
});
147+
4148
describe('normalize()', () => {
5149
test('joins consecutive same type operations', () => {
6150
expect(
@@ -240,6 +384,75 @@ describe('diff()', () => {
240384
assertPatch('a🙃b', 'a👋b');
241385
});
242386

387+
test('grapheme clusters with ZWJ (Zero Width Joiner)', () => {
388+
const chef = '👨‍🍳';
389+
const family = '👨‍👩‍👧‍👦';
390+
const womanTech = '👩🏽‍💻';
391+
assertPatch(chef, family);
392+
assertPatch(family, chef);
393+
assertPatch(womanTech, chef);
394+
assertPatch('hello', 'hello' + chef);
395+
assertPatch('hello', chef + 'hello');
396+
assertPatch('hello world', 'hello' + family + 'world');
397+
assertPatch('hello' + chef, 'hello');
398+
assertPatch(chef + 'hello', 'hello');
399+
assertPatch('hello' + family + 'world', 'helloworld');
400+
assertPatch(chef + family, family + chef);
401+
assertPatch('a' + chef + 'b' + family + 'c', 'x' + family + 'y' + chef + 'z');
402+
assertPatch('The ' + chef + ' cooks', 'A ' + chef + ' bakes');
403+
assertPatch('Team: ' + family, 'Group: ' + womanTech);
404+
});
405+
406+
test('flag emojis (regional indicators)', () => {
407+
const ruFlag = '🇷🇺';
408+
const chFlag = '🇨🇳';
409+
const inFlag = '🇮🇳';
410+
assertPatch(ruFlag, chFlag);
411+
assertPatch(chFlag, inFlag);
412+
assertPatch('Made in ' + ruFlag, 'Made in ' + chFlag);
413+
assertPatch(ruFlag + ' USA', chFlag + ' UK');
414+
assertPatch('Hello ' + ruFlag + ' world', 'Hello ' + inFlag + ' world');
415+
assertPatch(ruFlag + chFlag, chFlag + ruFlag);
416+
assertPatch('Flags: ' + ruFlag + chFlag + inFlag, 'Flags: ' + inFlag + chFlag + ruFlag);
417+
});
418+
419+
test('combining diacritical marks', () => {
420+
const combining1 = 'e\u0301';
421+
const combining2 = 'e\u0300';
422+
const precomposed = 'é';
423+
assertPatch(combining1, combining2);
424+
assertPatch(combining1, precomposed);
425+
assertPatch(precomposed, combining1);
426+
assertPatch('cafe\u0301', 'café');
427+
assertPatch('naïve', 'naive');
428+
assertPatch('résumé', 'resume');
429+
const multiCombining = 'a\u0301\u0302\u0303';
430+
assertPatch('test' + multiCombining, 'test');
431+
assertPatch('test', 'test' + multiCombining);
432+
});
433+
434+
test('variation selectors', () => {
435+
const heartText = '❤\uFE0E'; // text style
436+
const heartEmoji = '❤\uFE0F'; // emoji style
437+
assertPatch(heartText, heartEmoji);
438+
assertPatch(heartEmoji, heartText);
439+
assertPatch('I ' + heartText + ' code', 'I ' + heartEmoji + ' code');
440+
assertPatch('Love ' + heartEmoji, 'Love ' + heartText);
441+
});
442+
443+
test('complex grapheme clusters in real scenarios', () => {
444+
const chef = '👨‍🍳';
445+
const family = '👨‍👩‍👧‍👦';
446+
const womanTech = '👩🏽‍💻';
447+
const usFlag = '🇺🇸';
448+
assertPatch('Hey ' + chef + ', dinner ready?', 'Hi ' + womanTech + ', code ready?');
449+
assertPatch(family + ' going to ' + usFlag, family + ' staying home');
450+
assertPatch(
451+
'The ' + chef + ' from ' + usFlag + ' is amazing',
452+
'A ' + womanTech + ' from ' + usFlag + ' is brilliant',
453+
);
454+
});
455+
243456
test('same strings', () => {
244457
assertPatch('', '');
245458
assertPatch('1', '1');
@@ -331,6 +544,33 @@ describe('diffEdit()', () => {
331544
assertDiffEdit('aaa', 'bbb', 'ccc');
332545
assertDiffEdit('1', '2', '3');
333546
});
547+
548+
test('handles grapheme cluster inserts and deletes', () => {
549+
const chef = '👨‍🍳';
550+
const family = '👨‍👩‍👧‍👦';
551+
const womanTech = '👩🏽‍💻';
552+
const usFlag = '🇺🇸';
553+
554+
// Insert grapheme clusters
555+
assertDiffEdit('', chef, '');
556+
assertDiffEdit('Hello ', chef, '');
557+
assertDiffEdit('', chef, ' world');
558+
assertDiffEdit('Hello ', chef, ' world');
559+
assertDiffEdit('Team: ', family, ' rocks!');
560+
561+
// Insert multiple grapheme clusters
562+
assertDiffEdit('', chef + family, '');
563+
assertDiffEdit('Coders: ', womanTech + chef, ' win');
564+
565+
// Insert with flags
566+
assertDiffEdit('Made in ', usFlag, '');
567+
assertDiffEdit('', usFlag, ' USA');
568+
569+
// Combining characters
570+
const combining = 'e\u0301';
571+
assertDiffEdit('caf', combining, '');
572+
assertDiffEdit('', combining, ' accent');
573+
});
334574
});
335575

336576
describe('overlap()', () => {
@@ -353,6 +593,21 @@ describe('overlap()', () => {
353593
expect(overlap('abc', 'abc')).toEqual(3);
354594
expect(overlap('a', 'a')).toEqual(1);
355595
});
596+
597+
test('handles grapheme clusters', () => {
598+
const chef = '👨‍🍳';
599+
const family = '👨‍👩‍👧‍👦';
600+
601+
// Overlap with grapheme clusters
602+
expect(overlap('hello' + chef, chef + 'world')).toEqual(5);
603+
expect(overlap('abc' + family, family + 'xyz')).toEqual(11);
604+
605+
// No overlap when grapheme differs
606+
expect(overlap('hello' + chef, family + 'world')).toEqual(0);
607+
608+
// Text overlap with grapheme clusters
609+
expect(overlap('prefix' + chef, chef + 'suffix')).toEqual(5);
610+
});
356611
});
357612

358613
describe('Unicode edge cases', () => {
@@ -404,6 +659,18 @@ describe('Unicode edge cases', () => {
404659
assertPatch(nfd, nfc);
405660
assertPatch(`hello ${nfc}`, `hello ${nfd}`);
406661
});
662+
663+
test('handles complex emoji with ZWJ sequences', () => {
664+
const chefEmoji = '👨‍🍳'; // chef emoji (man + ZWJ + cooking)
665+
const src = chefEmoji;
666+
const dst = 'chef' + chefEmoji;
667+
const patch = normalize(diff(src, dst));
668+
assertPatch(src, dst, patch);
669+
expect(patch).toEqual([
670+
[PATCH_OP_TYPE.INS, 'chef'],
671+
[PATCH_OP_TYPE.EQL, chefEmoji],
672+
]);
673+
});
407674
});
408675

409676
describe('Algorithm edge cases', () => {

packages/json-joy/src/util/diff/str.ts

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,7 @@ const diffNoCommonAffix = (src: string, dst: string): Patch => {
388388
* @param txt2 Second string.
389389
* @return The number of characters common to the start of each string.
390390
*/
391-
export const pfx = (txt1: string, txt2: string) => {
391+
export const pfx = (txt1: string, txt2: string): number => {
392392
if (!txt1 || !txt2 || txt1.charAt(0) !== txt2.charAt(0)) return 0;
393393
let min = 0;
394394
let max = Math.min(txt1.length, txt2.length);
@@ -427,9 +427,35 @@ export const sfx = (txt1: string, txt2: string): number => {
427427
} else max = mid;
428428
mid = Math.floor((max - min) / 2 + min);
429429
}
430-
const code = txt1.charCodeAt(txt1.length - mid);
431-
const isSurrogatePairEnd = code >= 0xd800 && code <= 0xdbff;
432-
if (isSurrogatePairEnd) mid--;
430+
// Check if we're splitting a surrogate pair or combining character sequence
431+
// We need to check the character BEFORE the matched suffix to see if we're
432+
// splitting a grapheme cluster.
433+
if (mid > 0 && mid < txt1.length) {
434+
const boundaryPos = txt1.length - mid - 1;
435+
const code = txt1.charCodeAt(boundaryPos);
436+
const isHighSurrogate = code >= 0xd800 && code <= 0xdbff;
437+
const isCombining =
438+
code === 0x200d || // ZWJ
439+
(code >= 0xfe00 && code <= 0xfe0f) || // Variation selectors
440+
(code >= 0x0300 && code <= 0x036f); // Combining diacritical marks
441+
442+
if (isHighSurrogate || isCombining) {
443+
// We're splitting a grapheme cluster. Walk backwards to include the full cluster.
444+
mid--;
445+
while (mid > 0) {
446+
const pos = txt1.length - mid - 1;
447+
if (pos < 0) break;
448+
const prevCode = txt1.charCodeAt(pos);
449+
const isPrevHighSurrogate = prevCode >= 0xd800 && prevCode <= 0xdbff;
450+
const isPrevCombining =
451+
prevCode === 0x200d ||
452+
(prevCode >= 0xfe00 && prevCode <= 0xfe0f) ||
453+
(prevCode >= 0x0300 && prevCode <= 0x036f);
454+
if (!isPrevHighSurrogate && !isPrevCombining) break;
455+
mid--;
456+
}
457+
}
458+
}
433459
return mid;
434460
};
435461

0 commit comments

Comments
 (0)