Skip to content

Commit 9fbb3dd

Browse files
authored
Refine the types of some provisional properties (#1234)
* Refine the types of Provisional properties
* GenerateEnums
* Missing NaN
* Stir the spaghetti logic and comment it a tiny bit
* another missing line
* not null but NaN
1 parent 3c3072d commit 9fbb3dd

File tree

4 files changed

+65
-52
lines changed

4 files changed

+65
-52
lines changed

unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1658,7 +1658,7 @@ private static void parseSimpleFieldFile(
16581658
var range = new IntRange();
16591659
range.start = cp;
16601660
range.end = cp;
1661-
if (unicodeDataValue == null) {
1661+
if (unicodeDataValue.equals("NaN")) {
16621662
if (!extractedValue.endsWith(".0")) {
16631663
throw new IllegalArgumentException(
16641664
"Non-integer numeric value extracted from Unihan for "
@@ -1891,6 +1891,8 @@ private static void setDefaultValueForPropertyName(
18911891

18921892
static void init() {
18931893
final Matcher semicolon = SEMICOLON.matcher("");
1894+
// Populate property2PropertyInfo, first from the index, then from our split Unihan
1895+
// implicitly.
18941896
for (final String line :
18951897
FileUtilities.in(IndexUnicodeProperties.class, "IndexUnicodeProperties.txt")) {
18961898
if (line.startsWith("#") || line.isEmpty()) {
@@ -1906,6 +1908,17 @@ static void init() {
19061908
fromStrings(parts);
19071909
}
19081910
}
1911+
1912+
// Starting with Unicode 13, we preprocess the Unihan data using the
1913+
// <Unicode Tools>/py/splitunihan.py script.
1914+
// It parses the small number of large, multi-property Unihan*.txt files
1915+
// and writes many smaller, single-property files like kTotalStrokes.txt.
1916+
for (UcdProperty prop : UcdProperty.values()) {
1917+
if (prop.getShortName().startsWith("cjk")) {
1918+
fromUnihanProperty(prop);
1919+
}
1920+
}
1921+
19091922
// DO THESE FIRST (overrides values in files!)
19101923
parseMissingFromValueAliases(
19111924
FileUtilities.in(IndexUnicodeProperties.class, "ExtraPropertyAliases.txt"));
@@ -1941,16 +1954,6 @@ static void init() {
19411954
// if (property2PropertyInfo.containsKey(x.toString())) continue;
19421955
// if (SHOW_PROP_INFO) System.out.println("Missing: " + x);
19431956
// }
1944-
1945-
// Starting with Unicode 13, we preprocess the Unihan data using the
1946-
// <Unicode Tools>/py/splitunihan.py script.
1947-
// It parses the small number of large, multi-property Unihan*.txt files
1948-
// and writes many smaller, single-property files like kTotalStrokes.txt.
1949-
for (UcdProperty prop : UcdProperty.values()) {
1950-
if (prop.getShortName().startsWith("cjk")) {
1951-
fromUnihanProperty(prop);
1952-
}
1953-
}
19541957
}
19551958

19561959
private static void parseMissingFromValueAliases(Iterable<String> aliasesLines) {

unicodetools/src/main/java/org/unicode/props/UcdProperty.java

Lines changed: 22 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
public enum UcdProperty {
5050

5151
// Numeric
52+
Non_Unihan_Numeric_Value(
53+
PropertyType.Numeric, DerivedPropertyStatus.UCDNonProperty, "Non_Unihan_Numeric_Value"),
5254
Numeric_Value(PropertyType.Numeric, DerivedPropertyStatus.Approved, "nv"),
5355
kAccountingNumeric(
5456
PropertyType.Numeric, DerivedPropertyStatus.Approved, "cjkAccountingNumeric"),
@@ -59,6 +61,20 @@ public enum UcdProperty {
5961
null,
6062
ValueCardinality.Ordered,
6163
"cjkPrimaryNumeric"),
64+
kTGT_Numeric(PropertyType.Numeric, DerivedPropertyStatus.Provisional, "kTGT_Numeric"),
65+
kTayNumeric(PropertyType.Numeric, DerivedPropertyStatus.Provisional, "cjkTayNumeric"),
66+
kVietnameseNumeric(
67+
PropertyType.Numeric,
68+
DerivedPropertyStatus.Provisional,
69+
null,
70+
ValueCardinality.Unordered,
71+
"cjkVietnameseNumeric"),
72+
kZhuangNumeric(
73+
PropertyType.Numeric,
74+
DerivedPropertyStatus.Provisional,
75+
null,
76+
ValueCardinality.Unordered,
77+
"cjkZhuangNumeric"),
6278

6379
// String
6480
Bidi_Mirroring_Glyph(PropertyType.String, DerivedPropertyStatus.Approved, "bmg"),
@@ -99,6 +115,12 @@ public enum UcdProperty {
99115
null,
100116
ValueCardinality.Unordered,
101117
"cjkSimplifiedVariant"),
118+
kSpoofingVariant(
119+
PropertyType.String,
120+
DerivedPropertyStatus.Provisional,
121+
null,
122+
ValueCardinality.Unordered,
123+
"cjkSpoofingVariant"),
102124
kTraditionalVariant(
103125
PropertyType.String,
104126
DerivedPropertyStatus.Provisional,
@@ -172,10 +194,6 @@ public enum UcdProperty {
172194
PropertyType.Miscellaneous,
173195
DerivedPropertyStatus.UCDNonProperty,
174196
"Names_List_Subheader_Notice"),
175-
Non_Unihan_Numeric_Value(
176-
PropertyType.Miscellaneous,
177-
DerivedPropertyStatus.UCDNonProperty,
178-
"Non_Unihan_Numeric_Value"),
179197
Standardized_Variant(
180198
PropertyType.Miscellaneous,
181199
DerivedPropertyStatus.UCDNonProperty,
@@ -517,12 +535,6 @@ public enum UcdProperty {
517535
null,
518536
ValueCardinality.Unordered,
519537
"cjkSpecializedSemanticVariant"),
520-
kSpoofingVariant(
521-
PropertyType.Miscellaneous,
522-
DerivedPropertyStatus.Provisional,
523-
null,
524-
ValueCardinality.Unordered,
525-
"cjkSpoofingVariant"),
526538
kStrange(
527539
PropertyType.Miscellaneous,
528540
DerivedPropertyStatus.Provisional,
@@ -542,7 +554,6 @@ public enum UcdProperty {
542554
ValueCardinality.Unordered,
543555
"cjkTGHZ2013"),
544556
kTGT_MergedSrc(PropertyType.Miscellaneous, DerivedPropertyStatus.Approved, "kTGT_MergedSrc"),
545-
kTGT_Numeric(PropertyType.Miscellaneous, DerivedPropertyStatus.Provisional, "kTGT_Numeric"),
546557
kTGT_RSUnicode(
547558
PropertyType.Miscellaneous,
548559
DerivedPropertyStatus.UCDNonProperty,
@@ -556,7 +567,6 @@ public enum UcdProperty {
556567
null,
557568
ValueCardinality.Unordered,
558569
"cjkTang"),
559-
kTayNumeric(PropertyType.Miscellaneous, DerivedPropertyStatus.Provisional, "cjkTayNumeric"),
560570
kTotalStrokes(
561571
PropertyType.Miscellaneous,
562572
DerivedPropertyStatus.Approved,
@@ -571,12 +581,6 @@ public enum UcdProperty {
571581
null,
572582
ValueCardinality.Unordered,
573583
"cjkVietnamese"),
574-
kVietnameseNumeric(
575-
PropertyType.Miscellaneous,
576-
DerivedPropertyStatus.Provisional,
577-
null,
578-
ValueCardinality.Unordered,
579-
"cjkVietnameseNumeric"),
580584
kXHC1983(
581585
PropertyType.Miscellaneous,
582586
DerivedPropertyStatus.Provisional,
@@ -591,12 +595,6 @@ public enum UcdProperty {
591595
null,
592596
ValueCardinality.Unordered,
593597
"cjkZhuang"),
594-
kZhuangNumeric(
595-
PropertyType.Miscellaneous,
596-
DerivedPropertyStatus.Provisional,
597-
null,
598-
ValueCardinality.Unordered,
599-
"cjkZhuangNumeric"),
600598
normalization_correction_version(
601599
PropertyType.Miscellaneous,
602600
DerivedPropertyStatus.UCDNonProperty,

unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,25 @@ Math_Class ; Math_Class ; NonUCDProperty
3737
# characters for which it historically lagged behind the value in MathClass.
3838
Math_Class_Ex ; Math_Class_Ex ; NonUCDNonProperty
3939

40+
# ================================================
41+
# Numeric Properties
42+
# ================================================
43+
44+
# 15.1
45+
cjkVietnameseNumeric ; kVietnameseNumeric ; Provisional
46+
cjkZhuangNumeric ; kZhuangNumeric ; Provisional
47+
# 17.0
48+
cjkTayNumeric ; kTayNumeric ; Provisional
49+
50+
# [185-C37] Consensus: Add a new provisional property, kTGT_Numeric to the
51+
# Tangut data files, based on L2/25-055, for Unicode Version 18.0.
52+
# [Ref. 5.2 in L2/25-232R]
53+
kTGT_Numeric ; kTGT_Numeric ; Provisional
54+
55+
# Contributory non-property matching exactly field 8 of UnicodeData.txt.
56+
# Mostly useful as a helper to diachronically parse Numeric_Value.
57+
Non_Unihan_Numeric_Value ; Non_Unihan_Numeric_Value ; UCDNonProperty
58+
4059
# ================================================
4160
# String Properties
4261
# ================================================
@@ -52,6 +71,8 @@ ConfMA ; Confusable_MA ; Confusable ; NonUCDNonProperty
5271

5372
cjkSimplifiedVariant ; kSimplifiedVariant ; Provisional
5473
cjkTraditionalVariant ; kTraditionalVariant ; Provisional
74+
# 13.0
75+
cjkSpoofingVariant ; kSpoofingVariant ; Provisional
5576

5677
Do_Not_Emit_Preferred ; Do_Not_Emit_Preferred ; UCDNonProperty
5778

@@ -167,7 +188,6 @@ cjkKoreanEducationHanja ; kKoreanEducationHanja ; Provisional
167188
cjkKoreanName ; kKoreanName ; Provisional
168189
cjkTGH ; kTGH ; Provisional
169190
# 13.0
170-
cjkSpoofingVariant ; kSpoofingVariant ; Provisional
171191
cjkTGHZ2013 ; kTGHZ2013 ; Provisional
172192
# 14.0
173193
cjkStrange ; kStrange ; Provisional
@@ -178,27 +198,16 @@ cjkJapanese ; kJapanese ; Provisional
178198
cjkMojiJoho ; kMojiJoho ; Provisional
179199
cjkSMSZD2003Index ; kSMSZD2003Index ; Provisional
180200
cjkSMSZD2003Readings ; kSMSZD2003Readings ; Provisional
181-
cjkVietnameseNumeric ; kVietnameseNumeric ; Provisional
182-
cjkZhuangNumeric ; kZhuangNumeric ; Provisional
183201
# 16.0
184202
cjkFanqie ; kFanqie ; Provisional
185203
cjkZhuang ; kZhuang ; Provisional
186-
# 17.0
187-
cjkTayNumeric ; kTayNumeric ; Provisional
188204

189205
# [183-C29] Consensus: In TangutSources.txt,
190206
# change the tag kRSTUnicode to kTGT_RSUnicode.
191207
# For Unicode Version 17.0. See L2/25-087 item 1.9.
192208
# (Changed between 17 alpha and beta.)
193209
kTGT_RSUnicode ; kTGT_RSUnicode ; kRSTUnicode ; UCDNonProperty
194210

195-
# [185-C37] Consensus: Add a new provisional property, kTGT_Numeric to the
196-
# Tangut data files, based on L2/25-055, for Unicode Version 18.0.
197-
# [Ref. 5.2 in L2/25-232R]
198-
kTGT_Numeric ; kTGT_Numeric ; Provisional
199-
200-
# TODO(egg): This file somehow classifies all provisional numeric properties as miscellaneous.
201-
202211
# [183-C30] Consensus: In NushuSources.txt,
203212
# change the tag kSrc_NushuDuben to kNSHU_DubenSrc, and change the tag kReading to kNSHU_Reading.
204213
# For Unicode Version 17.0. See L2/25-087 item 1.9.
@@ -210,10 +219,6 @@ kEH_Func ; kEH_Func ; Provisional
210219
kEH_FVal ; kEH_FVal ; Provisional
211220
kEH_UniK ; kEH_UniK ; Provisional
212221

213-
# Contributory non-property matching exactly field 8 of UnicodeData.txt.
214-
# Mostly useful as a helper to diachronically parse Numeric_Value.
215-
Non_Unihan_Numeric_Value ; Non_Unihan_Numeric_Value ; UCDNonProperty
216-
217222
Names_List_Subheader ; Names_List_Subheader ; subhead ; UCDNonProperty
218223
Names_List_Subheader_Notice ; Names_List_Subheader_Notice ; UCDNonProperty
219224
Names_List_Alias ; Names_List_Alias ; UCDNonProperty

unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ jg; ALEF_MAQSURAH ; ALEF_MAQSURAH
226226

227227
# @missing: 0000..10FFFF; kSimplifiedVariant ; <none>
228228
# @missing: 0000..10FFFF; kTraditionalVariant ; <none>
229+
# @missing: 0000..10FFFF; kSpoofingVariant ; <none>
229230

230231
# @missing: 0000..10FFFF; Joining_Group ; No_Joining_Group
231232

@@ -418,4 +419,10 @@ Math_Class_Ex ; R ; Relation ; R?
418419
Math_Class_Ex ; S ; Space
419420
Math_Class_Ex ; U ; Unary
420421
Math_Class_Ex ; V ; Vary
421-
Math_Class_Ex ; X ; Special
422+
Math_Class_Ex ; X ; Special
423+
424+
# @missing: 0000..10FFFF; kVietnameseNumeric ; NaN
425+
# @missing: 0000..10FFFF; kZhuangNumeric ; NaN
426+
# @missing: 0000..10FFFF; kTayNumeric ; NaN
427+
# @missing: 0000..10FFFF; kTGT_Numeric ; NaN
428+
# @missing: 0000..10FFFF; Non_Unihan_Numeric_Value ; NaN

0 commit comments

Comments
 (0)