NVIDIA-NeMo · johnnygreco · Apr 24, 2026 · Apr 24, 2026 · Apr 24, 2026 · Apr 24, 2026
@@ -60,6 +60,7 @@ Supported locales:
 - `hi_Deva_IN`: India (Devanagari script)
 - `hi_Latn_IN`: India (Latin script)
 - `ja_JP`: Japan
+- `ko_KR`: South Korea (Korean)
 - `pt_BR`: Brazil (Portuguese)
 
 ### Features
@@ -126,6 +127,9 @@ ngc registry resource download-version "nvidia/nemotron-personas/nemotron-person
 # For Nemotron-Personas JP
 ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-ja_jp"
 
+# For Nemotron-Personas KR
+ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-ko_kr"
+
 # For Nemotron-Personas SG
 ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-en_sg"
 
@@ -189,8 +193,6 @@ For more details, see the documentation for [`SamplerColumnConfig`](../code_refe
 
 **France-Specific Fields (`fr_FR`):**
 
-- `commune` - Smallest administrative division (includes arrondissements)
-- `departement` - Mid-level administrative division
 - `household_type` - Household composition (e.g., single person, couple with/without children)
 - `monthly_income_eur` - Estimated monthly income in euros
 - `first_name_heritage` - Cultural origin of the first name
@@ -200,20 +202,41 @@ For more details, see the documentation for [`SamplerColumnConfig`](../code_refe
 **Japan-Specific Fields (`ja_JP`):**
 
 - `area`
-- `prefecture`
-- `zone`
+
+**Korea-Specific Fields (`ko_KR`):**
+
+- `economic_activity_status` - Employment / economic activity status
+- `family_type` - Household / family composition type
+- `housing_type` - Dwelling type (apartment, detached home, etc.)
+- `housing_tenure` - Housing tenure (e.g., owned vs. rented)
+- `income_bracket` - Income range
+- `military_status` - Military service status
+- `drinking_status` - Drinking frequency / status
+- `smoking_status` - Smoking frequency / status
+- `blood_pressure_status` - Blood pressure health indicator
+- `blood_sugar_status` - Blood sugar health indicator
+- `bmi_status` - BMI health indicator
+- `waist_status` - Waist-circumference health indicator
 
 **Brazil-Specific Fields (`pt_BR`):**
 
 - `race` - Census-reported race
 
-**Brazil and India Shared Fields (`pt_BR`, `en_IN`, `hi_Deva_IN`, `hi_Latn_IN`):**
+**Singapore-Specific Fields (`en_SG`):**
+
+- `industry` - Industry of employment
+- `preferred_english_name` - Preferred English-form name
+
+**US and Singapore Shared Fields (`en_US`, `en_SG`):**
+
+- `ethnic_background` - Self-identified ethnic background
+
+**Religion Fields (`en_IN`, `hi_Deva_IN`, `hi_Latn_IN`, `en_SG`, `pt_BR`):**
 
 - `religion` - Census-reported religion
 
-**India-Specific Fields (`en_IN`, `hi_Deva_IN`, `hi_Latn_IN`):**
+**India-Locale Fields (`en_IN`, `hi_Deva_IN`, `hi_Latn_IN`):**
 
-- `district` - Census-reported district
 - `education_degree` - Census-reported education degree
 - `first_language` - Native language
 - `second_language` - Second language (if applicable)
@@ -229,17 +252,21 @@ For more details, see the documentation for [`SamplerColumnConfig`](../code_refe
 - Career goals
 - Context-specific personas (professional, financial, healthcare, sports, arts & entertainment, travel, culinary, etc.)
 
-*Japan-specific persona fields:*
+*Japan-specific persona fields (`ja_JP`):*
 
 - `aspects`
-- `digital_skills`
+- `digital_skill`
+
+*Korea-specific persona fields (`ko_KR`):*
+
+- `family_persona`
 
-*Brazil and India shared persona fields (`pt_BR`, `en_IN`, `hi_Deva_IN`, `hi_Latn_IN`):*
+*Religious persona fields (`en_IN`, `hi_Deva_IN`, `hi_Latn_IN`, `en_SG`, `pt_BR`):*
 
 - `religious_persona`
 - `religious_background`
 
-*India-specific persona fields (`en_IN`, `hi_Deva_IN`, `hi_Latn_IN`):*
+*India-locale persona fields (`en_IN`, `hi_Deva_IN`, `hi_Latn_IN`):*
 
 - `linguistic_persona`
 - `linguistic_background`
@@ -248,7 +275,7 @@ For more details, see the documentation for [`SamplerColumnConfig`](../code_refe
 
 | Parameter | Type | Description |
 |-----------|------|-------------|
-| `locale` | str | Language/region code - must be one of: "en_US", "en_IN", "en_SG", "fr_FR", "hi_Deva_IN", "hi_Latn_IN", "ja_JP", "pt_BR" |
+| `locale` | str | Language/region code - must be one of: "en_US", "en_IN", "en_SG", "fr_FR", "hi_Deva_IN", "hi_Latn_IN", "ja_JP", "ko_KR", "pt_BR" |
 | `sex` | str (optional) | Filter by "Male" or "Female" |
 | `city` | str or list[str] (optional) | Filter by specific city or cities within locale |
 | `age_range` | list[int] (optional) | Two-element list [min_age, max_age] (default: [18, 114]) |

@@ -384,10 +384,11 @@ class NordColor(Enum):
     "en_US": "1.24 GB",
     "en_IN": "2.39 GB",
     "en_SG": "0.30 GB",
-    "fr_FR": "2.71 GB",
+    "fr_FR": "3.87 GB",
     "hi_Deva_IN": "4.14 GB",
     "hi_Latn_IN": "2.7 GB",
     "ja_JP": "1.69 GB",
+    "ko_KR": "2.66 GB",
     "pt_BR": "2.33 GB",
 }
 

@@ -15,84 +15,102 @@
 REQUIRED_FIELDS = {"first_name", "last_name", "age", "locale"}
 
 PII_FIELDS = [
-    # Core demographic fields
+    # Universal demographic fields (present in every managed locale)
     "uuid",
     "first_name",
     "middle_name",
     "last_name",
     "sex",
     "age",
-    "birth_date",
     "marital_status",
     "postcode",
     "city",
+    "district",
     "region",
     "country",
-    "locale",
-    "bachelors_field",
-    "education_level",
-    "occupation",
-    "national_id",
-    # US-specific fields
     "street_name",
     "street_number",
     "unit",
+    "bachelors_field",
+    "education_level",
+    "occupation",
+    # Runtime-generated / attached fields
+    "locale",
+    "birth_date",
     "state",
     "email_address",
     "phone_number",
-    # France-specific fields
+    "national_id",
+    # en_US + en_SG
+    "ethnic_background",
+    # en_SG-specific
+    "industry",
+    "preferred_english_name",
+    # fr_FR-specific
     "first_name_heritage",
     "name_heritage",
     "is_first_gen_immigrant",
     "household_type",
     "monthly_income_eur",
-    "commune",
-    "departement",
-    # Brazil-specific fields
-    "race",
-    # Japan-specific fields
+    # ja_JP-specific
     "area",
-    "prefecture",
-    "zone",
-    # Brazil and India shared fields
-    "religion",
-    # India-specific fields
-    "district",
+    # ko_KR-specific
+    "blood_pressure_status",
+    "blood_sugar_status",
+    "bmi_status",
+    "drinking_status",
+    "economic_activity_status",
+    "family_type",
+    "housing_tenure",
+    "housing_type",
+    "income_bracket",
+    "military_status",
+    "smoking_status",
+    "waist_status",
+    # pt_BR-specific
+    "race",
+    # India locales (en_IN, hi_Deva_IN, hi_Latn_IN)
     "education_degree",
     "first_language",
     "second_language",
     "third_language",
+    "zone",
+    # Shared across India locales, en_SG, and pt_BR
+    "religion",
 ]
 
 PERSONA_FIELDS = [
-    # Core persona fields
+    # Universal persona fields
     "persona",
+    "detailed_persona",
+    "professional_persona",
     "career_goals_and_ambitions",
+    "cultural_background",
     "arts_persona",
     "culinary_persona",
-    "cultural_background",
-    "detailed_persona",
     "finance_persona",
     "healthcare_persona",
-    "hobbies_and_interests_list",
-    "hobbies_and_interests",
-    "professional_persona",
-    "skills_and_expertise_list",
-    "skills_and_expertise",
     "sports_persona",
     "travel_persona",
+    "hobbies_and_interests",
+    "hobbies_and_interests_list",
+    "skills_and_expertise",
+    "skills_and_expertise_list",
+    # Big Five personality traits
     "openness",
     "conscientiousness",
     "extraversion",
     "agreeableness",
     "neuroticism",
-    # Japan-specific persona fields
+    # ja_JP-specific
     "aspects",
-    "digital_skills",
-    # Brazil and India shared persona fields
+    "digital_skill",
+    # ko_KR-specific
+    "family_persona",
+    # Shared across India locales, en_SG, and pt_BR
     "religious_persona",
     "religious_background",
-    # India-specific persona fields
+    # India locales only (en_IN, hi_Deva_IN, hi_Latn_IN)
     "linguistic_persona",
     "linguistic_background",
 ]
@@ -85,17 +85,19 @@ def test_run_personas_with_all_flag(
     # Verify NGC check was called
     mock_check_ngc.assert_called_once()
 
-    # Verify all 8 locales were downloaded
-    assert mock_download.call_count == 8
+    # Verify all 9 locales were downloaded
+    assert mock_download.call_count == 9
 
     # Verify each locale was downloaded
     downloaded_locales = [call[0][0] for call in mock_download.call_args_list]
     assert "en_US" in downloaded_locales
     assert "en_IN" in downloaded_locales
     assert "en_SG" in downloaded_locales
+    assert "fr_FR" in downloaded_locales
     assert "hi_Deva_IN" in downloaded_locales
     assert "hi_Latn_IN" in downloaded_locales
     assert "ja_JP" in downloaded_locales
+    assert "ko_KR" in downloaded_locales
     assert "pt_BR" in downloaded_locales
 
 
@@ -219,14 +221,15 @@ def test_determine_locales_with_all_flag(controller: DownloadController) -> None
     """Test _determine_locales returns all locales when all_locales=True."""
     result = controller._determine_locales(locales=None, all_locales=True)
 
-    assert len(result) == 8
+    assert len(result) == 9
     assert "en_US" in result
     assert "en_IN" in result
     assert "en_SG" in result
     assert "fr_FR" in result
     assert "hi_Deva_IN" in result
     assert "hi_Latn_IN" in result
     assert "ja_JP" in result
+    assert "ko_KR" in result
     assert "pt_BR" in result
 
 

@@ -15,7 +15,7 @@ def repository() -> PersonaRepository:
 def test_init(repository: PersonaRepository) -> None:
     """Test repository initialization creates registry."""
     assert repository._registry is not None
-    assert len(repository._registry.locales) == 8
+    assert len(repository._registry.locales) == 9
     assert repository._registry.dataset_prefix == "nemotron-personas-dataset-"
 
 
@@ -24,11 +24,21 @@ def test_list_all(repository: PersonaRepository) -> None:
     locales = repository.list_all()
 
     assert isinstance(locales, list)
-    assert len(locales) == 8
+    assert len(locales) == 9
 
     # Verify all expected locales are present
     locale_codes = {locale.code for locale in locales}
-    assert locale_codes == {"en_US", "en_IN", "en_SG", "fr_FR", "hi_Deva_IN", "hi_Latn_IN", "ja_JP", "pt_BR"}
+    assert locale_codes == {
+        "en_US",
+        "en_IN",
+        "en_SG",
+        "fr_FR",
+        "hi_Deva_IN",
+        "hi_Latn_IN",
+        "ja_JP",
+        "ko_KR",
+        "pt_BR",
+    }
 
     # Verify each locale has required fields
     for locale in locales:
@@ -56,6 +66,7 @@ def test_get_by_code_all_locales(repository: PersonaRepository) -> None:
         ("hi_Deva_IN", "4.14 GB", "nemotron-personas-dataset-hi_deva_in"),
         ("hi_Latn_IN", "2.7 GB", "nemotron-personas-dataset-hi_latn_in"),
         ("ja_JP", "1.69 GB", "nemotron-personas-dataset-ja_jp"),
+        ("ko_KR", "2.66 GB", "nemotron-personas-dataset-ko_kr"),
     ]
 
     for code, expected_size, expected_dataset_name in test_cases:

@@ -51,14 +51,15 @@ def test_get_available_locales(service: DownloadService) -> None:
     locales = service.get_available_locales()
 
     assert isinstance(locales, dict)
-    assert len(locales) == 8
+    assert len(locales) == 9
     assert "en_US" in locales
     assert "en_IN" in locales
     assert "en_SG" in locales
     assert "fr_FR" in locales
     assert "hi_Deva_IN" in locales
     assert "hi_Latn_IN" in locales
     assert "ja_JP" in locales
+    assert "ko_KR" in locales
     assert "pt_BR" in locales
 
     # Verify values are locale codes (not descriptions)