Skip to content

Commit 1a762b5

Browse files
authored
Look at the reverse Do_Not_Emit mappings (#1247)
* Backward Do_Not_Emit * Fix tests, except the one that should be broken * comment * meow * Actually those DoNotEmit sequences are fine. Also, let’s not emit them * Default to <none> * After Markus’s review * urgh
1 parent c052298 commit 1a762b5

File tree

13 files changed

+312
-60
lines changed

13 files changed

+312
-60
lines changed

unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java

Lines changed: 113 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import com.ibm.icu.text.UnicodeSet;
99
import com.ibm.icu.util.VersionInfo;
1010
import java.util.Arrays;
11+
import java.util.Comparator;
1112
import java.util.EnumMap;
1213
import java.util.HashMap;
1314
import java.util.HashSet;
@@ -49,11 +50,45 @@ enum SpecialProperty {
4950
public final SpecialProperty special;
5051

5152
/**
52-
* Maps from Unicode versions to field number. A property whose field number depends on the
53-
* version has more than one entry. A particular field number applies to the Unicode versions
53+
* Represents a mapping from one field of a UCD file to another. For instance, given the data
54+
* line "ABCD ; Value ; 1234", FieldMapping(1) maps U+ABCD to Value, FieldMapping(2) maps U+ABCD to
55+
* 1234 (which may be interpreted as U+1234, depending on the property type), and FieldMapping(2,
56+
* 1) maps U+1234 to Value.
57+
*/
58+
public static class FieldMapping implements Comparable<FieldMapping> {
59+
/** A mapping from field 0 to field `valueField`. This is the most common case. */
60+
FieldMapping(int valueField) {
61+
this(0, valueField);
62+
}
63+
64+
FieldMapping(int keyField, int valueField) {
65+
this.keyField = keyField;
66+
this.valueField = valueField;
67+
}
68+
69+
@Override
70+
public int compareTo(FieldMapping other) {
71+
return comparator.compare(this, other);
72+
}
73+
74+
@Override
75+
public String toString() {
76+
return keyField + " ↦ " + valueField;
77+
}
78+
79+
final int keyField;
80+
final int valueField;
81+
static final Comparator<FieldMapping> comparator =
82+
Comparator.<FieldMapping>comparingInt(m -> m.keyField)
83+
.thenComparing(m -> m.valueField);
84+
}
85+
86+
/**
87+
* Maps from Unicode versions to field mapping. A property whose field mapping depends on the
88+
* version has more than one entry. A particular field mapping applies to the Unicode versions
5489
* after the previous-version entry, up to and including its own version.
5590
*/
56-
TreeMap<VersionInfo, Integer> fieldNumbers;
91+
TreeMap<VersionInfo, FieldMapping> fieldMappings;
5792

5893
/**
5994
* Maps from Unicode versions to files. A property whose file depends on the version has more
@@ -105,16 +140,17 @@ enum SpecialProperty {
105140
Relation.of(new HashMap<String, Set<PropertyParsingInfo>>(), HashSet.class);
106141

107142
public PropertyParsingInfo(
108-
String file, UcdProperty property, int fieldNumber, SpecialProperty special) {
143+
String file, UcdProperty property, FieldMapping fieldMapping, SpecialProperty special) {
109144
this.files = new TreeMap<>();
110145
files.put(Settings.LATEST_VERSION_INFO, file);
111146
this.property = property;
112-
this.fieldNumbers = new TreeMap<>();
113-
fieldNumbers.put(Settings.LATEST_VERSION_INFO, fieldNumber);
147+
this.fieldMappings = new TreeMap<>();
148+
fieldMappings.put(Settings.LATEST_VERSION_INFO, fieldMapping);
114149
this.special = special;
115150
}
116151

117152
static final Pattern VERSION = Pattern.compile("v\\d+(\\.\\d+)+");
153+
static final Pattern FIELD_MAPPING = Pattern.compile("(\\d+)\\s*↦\\s*(\\d+)");
118154

119155
private static void fromStrings(String... propertyInfo) {
120156
if (propertyInfo.length < 2 || propertyInfo.length > 4) {
@@ -130,13 +166,20 @@ private static void fromStrings(String... propertyInfo) {
130166

131167
String last = propertyInfo[propertyInfo.length - 1];
132168

133-
int temp = 1;
169+
var fieldMapping = new FieldMapping(1);
134170
if (propertyInfo.length > 2
135171
&& !propertyInfo[2].isEmpty()
136172
&& !VERSION.matcher(propertyInfo[2]).matches()) {
137-
temp = Integer.parseInt(propertyInfo[2]);
173+
final var matcher = FIELD_MAPPING.matcher(propertyInfo[2]);
174+
if (matcher.matches()) {
175+
fieldMapping =
176+
new FieldMapping(
177+
Integer.parseInt(matcher.group(1)),
178+
Integer.parseInt(matcher.group(2)));
179+
} else {
180+
fieldMapping = new FieldMapping(Integer.parseInt(propertyInfo[2]));
181+
}
138182
}
139-
int _fieldNumber = temp;
140183

141184
if (VERSION.matcher(last).matches()) {
142185
propertyInfo[propertyInfo.length - 1] = "";
@@ -146,7 +189,7 @@ private static void fromStrings(String... propertyInfo) {
146189
"No modern info for property with old file record: " + propName);
147190
}
148191
result.files.put(VersionInfo.getInstance(last.substring(1)), _file);
149-
result.fieldNumbers.put(VersionInfo.getInstance(last.substring(1)), _fieldNumber);
192+
result.fieldMappings.put(VersionInfo.getInstance(last.substring(1)), fieldMapping);
150193
file2PropertyInfoSet.put(_file, result);
151194
return;
152195
}
@@ -156,7 +199,7 @@ private static void fromStrings(String... propertyInfo) {
156199
? SpecialProperty.None
157200
: SpecialProperty.valueOf(propertyInfo[3]);
158201
PropertyParsingInfo result =
159-
new PropertyParsingInfo(_file, _property, _fieldNumber, _special);
202+
new PropertyParsingInfo(_file, _property, fieldMapping, _special);
160203

161204
try {
162205
PropertyUtilities.putNew(property2PropertyInfo, _property, result);
@@ -173,7 +216,9 @@ private static void fromUnihanProperty(UcdProperty prop) {
173216
}
174217
PropertyParsingInfo info = property2PropertyInfo.get(prop);
175218
if (info == null) {
176-
info = new PropertyParsingInfo(filename, prop, 1, SpecialProperty.None);
219+
info =
220+
new PropertyParsingInfo(
221+
filename, prop, new FieldMapping(1), SpecialProperty.None);
177222
property2PropertyInfo.put(prop, info);
178223
}
179224
file2PropertyInfoSet.put(filename, info);
@@ -185,7 +230,7 @@ public String toString() {
185230
+ " ;\t"
186231
+ property
187232
+ " ;\t"
188-
+ fieldNumbers
233+
+ fieldMappings
189234
+ " ;\t"
190235
+ special
191236
+ " ;\t"
@@ -212,8 +257,9 @@ public int compareTo(PropertyParsingInfo arg0) {
212257
if (0 != (result = property.toString().compareTo(arg0.property.toString()))) {
213258
return result;
214259
}
215-
return fieldNumbers.get(Settings.LATEST_VERSION_INFO)
216-
- arg0.fieldNumbers.get(Settings.LATEST_VERSION_INFO);
260+
return fieldMappings
261+
.get(Settings.LATEST_VERSION_INFO)
262+
.compareTo(arg0.fieldMappings.get(Settings.LATEST_VERSION_INFO));
217263
}
218264

219265
public static String getFullFileName(UcdProperty prop, VersionInfo ucdVersion) {
@@ -240,18 +286,18 @@ public String getFileName(VersionInfo ucdVersionRequested) {
240286
}
241287
}
242288

243-
public int getFieldNumber(VersionInfo ucdVersionRequested) {
244-
int fieldNumber = 0;
245-
if (fieldNumbers.size() == 1) {
246-
return fieldNumbers.values().iterator().next();
289+
public FieldMapping getFieldMapping(VersionInfo ucdVersionRequested) {
290+
FieldMapping fieldMapping = null;
291+
if (fieldMappings.size() == 1) {
292+
return fieldMappings.values().iterator().next();
247293
}
248-
for (final var entry : fieldNumbers.entrySet()) {
294+
for (final var entry : fieldMappings.entrySet()) {
249295
if (ucdVersionRequested.compareTo(entry.getKey()) <= 0) {
250-
fieldNumber = entry.getValue();
296+
fieldMapping = entry.getValue();
251297
break;
252298
}
253299
}
254-
return fieldNumber;
300+
return fieldMapping;
255301
}
256302

257303
private static final VersionInfo V13 = VersionInfo.getInstance(13);
@@ -662,10 +708,16 @@ static void parseSourceFile(
662708
propInfoSet);
663709
break;
664710
case Field:
711+
FieldMapping mapping;
665712
if (propInfoSet.size() == 1
666713
&& (propInfo = propInfoSet.iterator().next()).special
667714
== SpecialProperty.None
668-
&& propInfo.getFieldNumber(indexUnicodeProperties.ucdVersion) == 1) {
715+
&& (mapping =
716+
propInfo.getFieldMapping(
717+
indexUnicodeProperties.ucdVersion))
718+
.keyField
719+
== 0
720+
&& mapping.valueField == 1) {
669721
if (fileName.equals("math/*/MathClass")
670722
&& indexUnicodeProperties.ucdVersion.compareTo(
671723
VersionInfo.UNICODE_6_3)
@@ -1490,9 +1542,12 @@ private static void parseFields(
14901542
throw new UnicodePropertyException();
14911543
}
14921544
String value =
1493-
propInfo.getFieldNumber(indexUnicodeProperties.ucdVersion) >= parts.length
1545+
propInfo.getFieldMapping(indexUnicodeProperties.ucdVersion).valueField
1546+
>= parts.length
14941547
? null
1495-
: parts[propInfo.getFieldNumber(indexUnicodeProperties.ucdVersion)];
1548+
: parts[
1549+
propInfo.getFieldMapping(indexUnicodeProperties.ucdVersion)
1550+
.valueField];
14961551
if (propInfo.property == UcdProperty.Joining_Group
14971552
&& indexUnicodeProperties.ucdVersion.compareTo(VersionInfo.UNICODE_4_0_1)
14981553
<= 0
@@ -1562,22 +1617,43 @@ private static void parseFields(
15621617
// 21EA..21F3;;⇪..⇳;;;; 21EA-21F3 are keyboard
15631618
value = "None";
15641619
}
1565-
propInfo.put(
1566-
data,
1567-
line.getMissingSet(),
1568-
line.getRange(),
1569-
value,
1570-
merger,
1571-
hackHangul && propInfo.property == UcdProperty.Decomposition_Mapping,
1572-
nextProperties == null
1573-
? null
1574-
: nextProperties.getProperty(propInfo.property));
1620+
if (propInfo.getFieldMapping(indexUnicodeProperties.ucdVersion).keyField == 0) {
1621+
propInfo.put(
1622+
data,
1623+
line.getMissingSet(),
1624+
line.getRange(),
1625+
value,
1626+
merger,
1627+
hackHangul && propInfo.property == UcdProperty.Decomposition_Mapping,
1628+
nextProperties == null
1629+
? null
1630+
: nextProperties.getProperty(propInfo.property));
1631+
} else {
1632+
final var key = new IntRange();
1633+
key.set(
1634+
parts[
1635+
propInfo.getFieldMapping(indexUnicodeProperties.ucdVersion)
1636+
.keyField]);
1637+
propInfo.put(
1638+
data,
1639+
line.getMissingSet(),
1640+
key,
1641+
value,
1642+
IndexUnicodeProperties.MULTIVALUED_JOINER,
1643+
hackHangul && propInfo.property == UcdProperty.Decomposition_Mapping,
1644+
nextProperties == null
1645+
? null
1646+
: nextProperties.getProperty(propInfo.property));
1647+
}
15751648
}
15761649
} else {
15771650
for (final PropertyParsingInfo propInfo : propInfoSet) {
15781651
final String value =
1579-
propInfo.getFieldNumber(indexUnicodeProperties.ucdVersion) < parts.length
1580-
? parts[propInfo.getFieldNumber(indexUnicodeProperties.ucdVersion)]
1652+
propInfo.getFieldMapping(indexUnicodeProperties.ucdVersion).valueField
1653+
< parts.length
1654+
? parts[
1655+
propInfo.getFieldMapping(indexUnicodeProperties.ucdVersion)
1656+
.valueField]
15811657
: null;
15821658
setPropDefault(
15831659
propInfo.property,

unicodetools/src/main/java/org/unicode/props/UcdProperty.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import org.unicode.props.UcdPropertyValues.Block_Values;
1111
import org.unicode.props.UcdPropertyValues.Canonical_Combining_Class_Values;
1212
import org.unicode.props.UcdPropertyValues.Decomposition_Type_Values;
13+
import org.unicode.props.UcdPropertyValues.Do_Not_Emit_Dispreferred_Type_Values;
1314
import org.unicode.props.UcdPropertyValues.Do_Not_Emit_Type_Values;
1415
import org.unicode.props.UcdPropertyValues.East_Asian_Width_Values;
1516
import org.unicode.props.UcdPropertyValues.General_Category_Values;
@@ -86,6 +87,12 @@ public enum UcdProperty {
8687
Confusable_SA(PropertyType.String, DerivedPropertyStatus.NonUCDNonProperty, "ConfSA"),
8788
Confusable_SL(PropertyType.String, DerivedPropertyStatus.NonUCDNonProperty, "ConfSL"),
8889
Decomposition_Mapping(PropertyType.String, DerivedPropertyStatus.Approved, "dm"),
90+
Do_Not_Emit_Dispreferred(
91+
PropertyType.String,
92+
DerivedPropertyStatus.UCDNonProperty,
93+
null,
94+
ValueCardinality.Unordered,
95+
"Do_Not_Emit_Dispreferred"),
8996
Do_Not_Emit_Preferred(
9097
PropertyType.String, DerivedPropertyStatus.UCDNonProperty, "Do_Not_Emit_Preferred"),
9198
Equivalent_Unified_Ideograph(PropertyType.String, DerivedPropertyStatus.Approved, "EqUIdeo"),
@@ -646,6 +653,12 @@ public enum UcdProperty {
646653
Decomposition_Type_Values.class,
647654
null,
648655
"dt"),
656+
Do_Not_Emit_Dispreferred_Type(
657+
PropertyType.Enumerated,
658+
DerivedPropertyStatus.UCDNonProperty,
659+
Do_Not_Emit_Dispreferred_Type_Values.class,
660+
ValueCardinality.Unordered,
661+
"Do_Not_Emit_Dispreferred_Type"),
649662
Do_Not_Emit_Type(
650663
PropertyType.Enumerated,
651664
DerivedPropertyStatus.UCDNonProperty,

unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -792,6 +792,52 @@ public static Decomposition_Type_Values forName(String name) {
792792
}
793793
}
794794

795+
// Do_Not_Emit_Dispreferred
796+
public enum Do_Not_Emit_Dispreferred_Type_Values implements Named {
797+
None("None"),
798+
Indic_Atomic_Consonant("Indic_Atomic_Consonant"),
799+
Indic_Consonant_Conjunct("Indic_Consonant_Conjunct"),
800+
Indic_Vowel_Letter("Indic_Vowel_Letter"),
801+
Bengali_Khanda_Ta("Bengali_Khanda_Ta"),
802+
Malayalam_Chillu("Malayalam_Chillu"),
803+
Tamil_Shrii("Tamil_Shrii"),
804+
Dotless_Form("Dotless_Form"),
805+
Hamza_Form("Hamza_Form"),
806+
Precomposed_Hieroglyph("Precomposed_Hieroglyph"),
807+
Precomposed_Form("Precomposed_Form"),
808+
Deprecated("Deprecated"),
809+
Discouraged("Discouraged"),
810+
Preferred_Spelling("Preferred_Spelling"),
811+
Arabic_Tashkil("Arabic_Tashkil");
812+
private final PropertyNames<Do_Not_Emit_Dispreferred_Type_Values> names;
813+
814+
private Do_Not_Emit_Dispreferred_Type_Values(String shortName, String... otherNames) {
815+
names =
816+
new PropertyNames<Do_Not_Emit_Dispreferred_Type_Values>(
817+
Do_Not_Emit_Dispreferred_Type_Values.class,
818+
this,
819+
shortName,
820+
otherNames);
821+
}
822+
823+
@Override
824+
public PropertyNames<Do_Not_Emit_Dispreferred_Type_Values> getNames() {
825+
return names;
826+
}
827+
828+
@Override
829+
public String getShortName() {
830+
return names.getShortName();
831+
}
832+
833+
private static final NameMatcher<Do_Not_Emit_Dispreferred_Type_Values> NAME_MATCHER =
834+
PropertyNames.getNameToEnums(Do_Not_Emit_Dispreferred_Type_Values.class);
835+
836+
public static Do_Not_Emit_Dispreferred_Type_Values forName(String name) {
837+
return NAME_MATCHER.get(name);
838+
}
839+
}
840+
795841
// Do_Not_Emit_Preferred
796842
public enum Do_Not_Emit_Type_Values implements Named {
797843
None("None"),

unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ IDNA2008_Category ; IDNA2008_Category ; NonUCDProperty
2626
Other_Joining_Type ; Other_Joining_Type ; UCDNonProperty
2727

2828
Do_Not_Emit_Type ; Do_Not_Emit_Type ; UCDNonProperty
29+
Do_Not_Emit_Dispreferred_Type ; Do_Not_Emit_Dispreferred_Type ; UCDNonProperty
2930

3031
kEH_Core ; kEH_Core ; Provisional
3132

@@ -75,6 +76,7 @@ cjkTraditionalVariant ; kTraditionalVariant ; Provisional
7576
cjkSpoofingVariant ; kSpoofingVariant ; Provisional
7677

7778
Do_Not_Emit_Preferred ; Do_Not_Emit_Preferred ; UCDNonProperty
79+
Do_Not_Emit_Dispreferred ; Do_Not_Emit_Dispreferred ; UCDNonProperty
7880

7981
normalization_correction_original ; normalization_correction_original ; UCDNonProperty
8082
normalization_correction_corrected ; normalization_correction_corrected ; UCDNonProperty

0 commit comments

Comments
 (0)