diff --git a/HISTORY.rst b/HISTORY.rst index 7512e78..e192bac 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -3,6 +3,13 @@ Release History Unreleased Changes ------------------ +* Fixed a bug where extra attributes returned from frictionless + would cause a ValidationError when instantiating a ``Resource``. + https://github.com/natcap/geometamaker/issues/128 +* Removed the ``encoding`` value when describing raster or vector + datasets as they are generally binary files and the value that was + given did not reflect a true encoding even for the attribute table + of a GDAL vector. https://github.com/natcap/geometamaker/issues/121 * Added an optional ``spatial`` attribute for tables, archives, and collections. The ``spatial`` attribute for rasters and vectors remains required. Spatial information for Collections represents the union of the diff --git a/src/geometamaker/geometamaker.py b/src/geometamaker/geometamaker.py index 1edd67d..27f4e5a 100644 --- a/src/geometamaker/geometamaker.py +++ b/src/geometamaker/geometamaker.py @@ -287,9 +287,16 @@ def describe_file(source_dataset_path, scheme): {description["path"]}'.encode('utf-8')) description['uid'] = f'sizetimestamp:{hash_func.hexdigest()}' - # We don't have a use for including these attributes in our metadata: + # These are other attributes sometimes returned by frictionless. + # We don't have a use for them in our metadata and we do not permit + # arbitrary extra attributes in our models. description.pop('mediatype', None) description.pop('name', None) + description.pop('profile', None) + description.pop('dialect', None) + description.pop('hash', None) + description.pop('sources', None) + description.pop('licenses', None) return description @@ -357,6 +364,7 @@ def describe_vector(source_dataset_path, scheme, **kwargs): """ description = describe_file(source_dataset_path, scheme) + description.pop('encoding', None) # does not make sense for binary data if 'http' in scheme: source_dataset_path = f'/vsicurl/{source_dataset_path}' @@ -404,6 +412,7 @@ def describe_raster(source_dataset_path, scheme, **kwargs): """ compute_stats = kwargs.get('compute_stats', False) description = describe_file(source_dataset_path, scheme) + description.pop('encoding', None) # does not make sense for binary data if 'http' in scheme: source_dataset_path = f'/vsicurl/{source_dataset_path}' info = pygeoprocessing.get_raster_info(source_dataset_path) diff --git a/src/geometamaker/models.py b/src/geometamaker/models.py index 95deb5b..b896564 100644 --- a/src/geometamaker/models.py +++ b/src/geometamaker/models.py @@ -38,7 +38,12 @@ def _deep_update_dict(self_dict, other_dict): class Parent(BaseModel): - """Parent class on which to configure validation.""" + """Parent class on which to configure validation. + + Extra attributes are forbidden because we anticipate + users editing YML docs manually and this can help catch + accidental edits like a typo in an attribute name. + """ model_config = ConfigDict(validate_assignment=True, extra='forbid', diff --git a/tests/test_geometamaker.py b/tests/test_geometamaker.py index 12e436e..573efda 100644 --- a/tests/test_geometamaker.py +++ b/tests/test_geometamaker.py @@ -172,6 +172,19 @@ def test_describe_csv(self): self.assertEqual(field.units, units) self.assertEqual(resource.spatial, spatial) + def test_describe_csv_semicolon_dialect(self): + """Test a CSV that uses semicolon delimiter.""" + import geometamaker + + datasource_path = os.path.join(self.workspace_dir, 'data.csv') + with open(datasource_path, 'w') as file: + file.write('a;b;c\n') + file.write('1;2;3\n') + + resource = geometamaker.describe(datasource_path) + field = resource.get_field_description('a') + self.assertEqual(field.type, 'integer') + def test_describe_bad_csv(self): """MetadataControl: CSV with extra item in row does not fail.""" import geometamaker