diff --git a/docs/concepts/person_sampling.md b/docs/concepts/person_sampling.md index c49e4e9f0..5452e7b98 100644 --- a/docs/concepts/person_sampling.md +++ b/docs/concepts/person_sampling.md @@ -60,6 +60,7 @@ Supported locales: - `hi_Deva_IN`: India (Devanagari script) - `hi_Latn_IN`: India (Latin script) - `ja_JP`: Japan +- `ko_KR`: South Korea (Korean) - `pt_BR`: Brazil (Portuguese) ### Features @@ -126,6 +127,9 @@ ngc registry resource download-version "nvidia/nemotron-personas/nemotron-person # For Nemotron-Personas JP ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-ja_jp" +# For Nemotron-Personas KR +ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-ko_kr" + # For Nemotron-Personas SG ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-en_sg" @@ -189,8 +193,6 @@ For more details, see the documentation for [`SamplerColumnConfig`](../code_refe **France-Specific Fields (`fr_FR`):** -- `commune` - Smallest administrative division (includes arrondissements) -- `departement` - Mid-level administrative division - `household_type` - Household composition (e.g., single person, couple with/without children) - `monthly_income_eur` - Estimated monthly income in euros - `first_name_heritage` - Cultural origin of the first name @@ -200,20 +202,41 @@ For more details, see the documentation for [`SamplerColumnConfig`](../code_refe **Japan-Specific Fields (`ja_JP`):** - `area` -- `prefecture` -- `zone` + +**Korea-Specific Fields (`ko_KR`):** + +- `economic_activity_status` - Employment / economic activity status +- `family_type` - Household / family composition type +- `housing_type` - Dwelling type (apartment, detached home, etc.) +- `housing_tenure` - Owned vs rented, etc. +- `income_bracket` - Income range +- `military_status` - Military service status +- `drinking_status` - Drinking frequency / status +- `smoking_status` - Smoking frequency / status +- `blood_pressure_status` - Blood pressure health indicator +- `blood_sugar_status` - Blood sugar health indicator +- `bmi_status` - BMI health indicator +- `waist_status` - Waist-circumference health indicator **Brazil-Specific Fields (`pt_BR`):** - `race` - Census-reported race -**Brazil and India Shared Fields (`pt_BR`, `en_IN`, `hi_Deva_IN`, `hi_Latn_IN`):** +**Singapore-Specific Fields (`en_SG`):** + +- `industry` - Industry of employment +- `preferred_english_name` - Preferred English-form name + +**English Locales Shared Fields (`en_US`, `en_SG`):** + +- `ethnic_background` - Self-identified ethnic background + +**Religion Fields (`en_IN`, `hi_Deva_IN`, `hi_Latn_IN`, `en_SG`, `pt_BR`):** - `religion` - Census-reported religion -**India-Specific Fields (`en_IN`, `hi_Deva_IN`, `hi_Latn_IN`):** +**India Locales Fields (`en_IN`, `hi_Deva_IN`, `hi_Latn_IN`):** -- `district` - Census-reported district - `education_degree` - Census-reported education degree - `first_language` - Native language - `second_language` - Second language (if applicable) @@ -229,17 +252,21 @@ For more details, see the documentation for [`SamplerColumnConfig`](../code_refe - Career goals - Context-specific personas (professional, financial, healthcare, sports, arts & entertainment, travel, culinary, etc.) -*Japan-specific persona fields:* +*Japan-specific persona fields (`ja_JP`):* - `aspects` -- `digital_skills` +- `digital_skill` + +*Korea-specific persona fields (`ko_KR`):* + +- `family_persona` -*Brazil and India shared persona fields (`pt_BR`, `en_IN`, `hi_Deva_IN`, `hi_Latn_IN`):* +*Religious persona fields (`en_IN`, `hi_Deva_IN`, `hi_Latn_IN`, `en_SG`, `pt_BR`):* - `religious_persona` - `religious_background` -*India-specific persona fields (`en_IN`, `hi_Deva_IN`, `hi_Latn_IN`):* +*India-locales persona fields (`en_IN`, `hi_Deva_IN`, `hi_Latn_IN`):* - `linguistic_persona` - `linguistic_background` @@ -248,7 +275,7 @@ For more details, see the documentation for [`SamplerColumnConfig`](../code_refe | Parameter | Type | Description | |-----------|------|-------------| -| `locale` | str | Language/region code - must be one of: "en_US", "en_IN", "en_SG", "fr_FR", "hi_Deva_IN", "hi_Latn_IN", "ja_JP", "pt_BR" | +| `locale` | str | Language/region code - must be one of: "en_US", "en_IN", "en_SG", "fr_FR", "hi_Deva_IN", "hi_Latn_IN", "ja_JP", "ko_KR", "pt_BR" | | `sex` | str (optional) | Filter by "Male" or "Female" | | `city` | str or list[str] (optional) | Filter by specific city or cities within locale | | `age_range` | list[int] (optional) | Two-element list [min_age, max_age] (default: [18, 114]) | diff --git a/packages/data-designer-config/src/data_designer/config/utils/constants.py b/packages/data-designer-config/src/data_designer/config/utils/constants.py index 35cd381df..64d459f6e 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/constants.py +++ b/packages/data-designer-config/src/data_designer/config/utils/constants.py @@ -384,10 +384,11 @@ class NordColor(Enum): "en_US": "1.24 GB", "en_IN": "2.39 GB", "en_SG": "0.30 GB", - "fr_FR": "2.71 GB", + "fr_FR": "3.87 GB", "hi_Deva_IN": "4.14 GB", "hi_Latn_IN": "2.7 GB", "ja_JP": "1.69 GB", + "ko_KR": "2.66 GB", "pt_BR": "2.33 GB", } diff --git a/packages/data-designer-engine/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py b/packages/data-designer-engine/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py index 1e07a4f2c..57625d777 100644 --- a/packages/data-designer-engine/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +++ b/packages/data-designer-engine/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py @@ -15,84 +15,102 @@ REQUIRED_FIELDS = {"first_name", "last_name", "age", "locale"} PII_FIELDS = [ - # Core demographic fields + # Universal demographic fields (present in every managed locale) "uuid", "first_name", "middle_name", "last_name", "sex", "age", - "birth_date", "marital_status", "postcode", "city", + "district", "region", "country", - "locale", - "bachelors_field", - "education_level", - "occupation", - "national_id", - # US-specific fields "street_name", "street_number", "unit", + "bachelors_field", + "education_level", + "occupation", + # Runtime-generated / attached fields + "locale", + "birth_date", "state", "email_address", "phone_number", - # France-specific fields + "national_id", + # en_US + en_SG + "ethnic_background", + # en_SG-specific + "industry", + "preferred_english_name", + # fr_FR-specific "first_name_heritage", "name_heritage", "is_first_gen_immigrant", "household_type", "monthly_income_eur", - "commune", - "departement", - # Brazil-specific fields - "race", - # Japan-specific fields + # ja_JP-specific "area", - "prefecture", - "zone", - # Brazil and India shared fields - "religion", - # India-specific fields - "district", + # ko_KR-specific + "blood_pressure_status", + "blood_sugar_status", + "bmi_status", + "drinking_status", + "economic_activity_status", + "family_type", + "housing_tenure", + "housing_type", + "income_bracket", + "military_status", + "smoking_status", + "waist_status", + # pt_BR-specific + "race", + # India locales (en_IN, hi_Deva_IN, hi_Latn_IN) "education_degree", "first_language", "second_language", "third_language", + "zone", + # Shared across India locales, en_SG, and pt_BR + "religion", ] PERSONA_FIELDS = [ - # Core persona fields + # Universal persona fields "persona", + "detailed_persona", + "professional_persona", "career_goals_and_ambitions", + "cultural_background", "arts_persona", "culinary_persona", - "cultural_background", - "detailed_persona", "finance_persona", "healthcare_persona", - "hobbies_and_interests_list", - "hobbies_and_interests", - "professional_persona", - "skills_and_expertise_list", - "skills_and_expertise", "sports_persona", "travel_persona", + "hobbies_and_interests", + "hobbies_and_interests_list", + "skills_and_expertise", + "skills_and_expertise_list", + # Big Five personality traits "openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism", - # Japan-specific persona fields + # ja_JP-specific "aspects", - "digital_skills", - # Brazil and India shared persona fields + "digital_skill", + # ko_KR-specific + "family_persona", + # Shared across India locales, en_SG, and pt_BR "religious_persona", "religious_background", - # India-specific persona fields + # India locales only (en_IN, hi_Deva_IN, hi_Latn_IN) "linguistic_persona", "linguistic_background", ] diff --git a/packages/data-designer/tests/cli/controllers/test_download_controller.py b/packages/data-designer/tests/cli/controllers/test_download_controller.py index c130e9275..b6b7d5292 100644 --- a/packages/data-designer/tests/cli/controllers/test_download_controller.py +++ b/packages/data-designer/tests/cli/controllers/test_download_controller.py @@ -85,17 +85,19 @@ def test_run_personas_with_all_flag( # Verify NGC check was called mock_check_ngc.assert_called_once() - # Verify all 8 locales were downloaded - assert mock_download.call_count == 8 + # Verify all 9 locales were downloaded + assert mock_download.call_count == 9 # Verify each locale was downloaded downloaded_locales = [call[0][0] for call in mock_download.call_args_list] assert "en_US" in downloaded_locales assert "en_IN" in downloaded_locales assert "en_SG" in downloaded_locales + assert "fr_FR" in downloaded_locales assert "hi_Deva_IN" in downloaded_locales assert "hi_Latn_IN" in downloaded_locales assert "ja_JP" in downloaded_locales + assert "ko_KR" in downloaded_locales assert "pt_BR" in downloaded_locales @@ -219,7 +221,7 @@ def test_determine_locales_with_all_flag(controller: DownloadController) -> None """Test _determine_locales returns all locales when all_locales=True.""" result = controller._determine_locales(locales=None, all_locales=True) - assert len(result) == 8 + assert len(result) == 9 assert "en_US" in result assert "en_IN" in result assert "en_SG" in result @@ -227,6 +229,7 @@ def test_determine_locales_with_all_flag(controller: DownloadController) -> None assert "hi_Deva_IN" in result assert "hi_Latn_IN" in result assert "ja_JP" in result + assert "ko_KR" in result assert "pt_BR" in result diff --git a/packages/data-designer/tests/cli/repositories/test_persona_repository.py b/packages/data-designer/tests/cli/repositories/test_persona_repository.py index ce9a53304..905a26c91 100644 --- a/packages/data-designer/tests/cli/repositories/test_persona_repository.py +++ b/packages/data-designer/tests/cli/repositories/test_persona_repository.py @@ -15,7 +15,7 @@ def repository() -> PersonaRepository: def test_init(repository: PersonaRepository) -> None: """Test repository initialization creates registry.""" assert repository._registry is not None - assert len(repository._registry.locales) == 8 + assert len(repository._registry.locales) == 9 assert repository._registry.dataset_prefix == "nemotron-personas-dataset-" @@ -24,11 +24,21 @@ def test_list_all(repository: PersonaRepository) -> None: locales = repository.list_all() assert isinstance(locales, list) - assert len(locales) == 8 + assert len(locales) == 9 # Verify all expected locales are present locale_codes = {locale.code for locale in locales} - assert locale_codes == {"en_US", "en_IN", "en_SG", "fr_FR", "hi_Deva_IN", "hi_Latn_IN", "ja_JP", "pt_BR"} + assert locale_codes == { + "en_US", + "en_IN", + "en_SG", + "fr_FR", + "hi_Deva_IN", + "hi_Latn_IN", + "ja_JP", + "ko_KR", + "pt_BR", + } # Verify each locale has required fields for locale in locales: @@ -56,6 +66,7 @@ def test_get_by_code_all_locales(repository: PersonaRepository) -> None: ("hi_Deva_IN", "4.14 GB", "nemotron-personas-dataset-hi_deva_in"), ("hi_Latn_IN", "2.7 GB", "nemotron-personas-dataset-hi_latn_in"), ("ja_JP", "1.69 GB", "nemotron-personas-dataset-ja_jp"), + ("ko_KR", "2.66 GB", "nemotron-personas-dataset-ko_kr"), ] for code, expected_size, expected_dataset_name in test_cases: diff --git a/packages/data-designer/tests/cli/services/test_download_service.py b/packages/data-designer/tests/cli/services/test_download_service.py index b452c0682..3abdaef75 100644 --- a/packages/data-designer/tests/cli/services/test_download_service.py +++ b/packages/data-designer/tests/cli/services/test_download_service.py @@ -51,7 +51,7 @@ def test_get_available_locales(service: DownloadService) -> None: locales = service.get_available_locales() assert isinstance(locales, dict) - assert len(locales) == 8 + assert len(locales) == 9 assert "en_US" in locales assert "en_IN" in locales assert "en_SG" in locales @@ -59,6 +59,7 @@ def test_get_available_locales(service: DownloadService) -> None: assert "hi_Deva_IN" in locales assert "hi_Latn_IN" in locales assert "ja_JP" in locales + assert "ko_KR" in locales assert "pt_BR" in locales # Verify values are locale codes (not descriptions)