diff --git a/HISTORY.rst b/HISTORY.rst index 7512e78..198cbcb 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -17,6 +17,18 @@ Unreleased Changes table will be included as band metadata under the 'raster_attribute_table' key. It can be retrieved by the ``get_rat`` method of a ``RasterResource`` instance. https://github.com/natcap/geometamaker/issues/25 +* Changed the default path to write a collection's metadata document. The + document will now be created within the directory being described, rather + than as a sibling of the directory. + https://github.com/natcap/geometamaker/issues/126 +* Added a ``target_filename`` parameter to ``describe_collection`` and + the ``-o`` or ``--output`` option to ``geometamaker describe``. These + parameters are optional and allow users to specify the target filename + of the YML document that will be created for a collection. + https://github.com/natcap/geometamaker/issues/125 +* Fixed a bug where ``describe_collection`` would improperly try to add + metadata to a DBF sidecar file as if it were a standalone dataset. + https://github.com/natcap/geometamaker/issues/124 * The Natural Capital Project changed its name to the Natural Capital Alliance. References to the old name and website URL have been updated to reflect this change. 
https://github.com/natcap/geometamaker/issues/115 diff --git a/README.md b/README.md index 53b932c..0901df3 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ or limiting the number of subdirectory levels to traverse using the ```python import geometamaker -collection_path = 'invest/data/invest-sample-data' +collection_path = 'data/invest-sample-data' metadata = geometamaker.describe_collection(collection_path, depth=2, exclude_regex=r'.*\.json$', @@ -95,8 +95,18 @@ metadata.write() ``` geometamaker describe -d 2 --exclude .*\.json$ data/invest-sample-data ``` -These examples will create `invest-sample-data-metadata.yml` as well as -create individual `.yml` documents for each dataset within the directory. +These examples will create `data/invest-sample-data/invest-sample-data-metadata.yml` +as well as create individual `.yml` documents for each dataset within the directory. + +#### Override the default filename of the collection's YML document +```python +geometamaker.describe_collection(collection_path, target_filename='README.yml') +``` +or +``` +geometamaker describe data/invest-sample-data -o README.yml +``` +These examples will create `data/invest-sample-data/README.yml`. ### Validating a metadata document: If you have manually edited a `.yml` metadata document, diff --git a/src/geometamaker/cli.py b/src/geometamaker/cli.py index 59e56f2..92e35ad 100644 --- a/src/geometamaker/cli.py +++ b/src/geometamaker/cli.py @@ -106,8 +106,14 @@ def convert(self, value, param, ctx): help='If FILEPATH is a directory, do not write metadata documents' ' for all files in the directory. Only create a single' ' *-metadata.yml document for the collection') +@click.option('-o', '--output', 'target_filename', + default=None, + help='if FILEPATH is a directory, this is the filename of the' + ' target YML document to be created within the directory.' 
+ ' If output is not specified, the filename will be' + ' -metadata.yml.') def describe(filepath, depth, exclude, all_files, no_write, stats, - collection_only): + collection_only, target_filename): describing_single = True # if filepath is a file, or collection_only=True if os.path.isdir(filepath): resource = geometamaker.describe_collection( @@ -116,7 +122,8 @@ def describe(filepath, depth, exclude, all_files, no_write, stats, exclude_regex=exclude, exclude_hidden=(not all_files), describe_files=(not collection_only), - compute_stats=stats) + compute_stats=stats, + target_filename=target_filename) describing_single = collection_only else: resource = geometamaker.describe(filepath, compute_stats=stats) diff --git a/src/geometamaker/geometamaker.py b/src/geometamaker/geometamaker.py index 1edd67d..fc96156 100644 --- a/src/geometamaker/geometamaker.py +++ b/src/geometamaker/geometamaker.py @@ -171,19 +171,6 @@ def _list_files_with_depth(directory, depth, exclude_regex=None, return sorted(file_list) -def _group_files_by_root(file_list): - """Get set of files (roots) and extensions by filename""" - root_set = set() - root_ext_map = defaultdict(set) - for filepath in file_list: - root, ext = os.path.splitext(filepath) - # tracking which files share a root name - # so we can check if these comprise a shapefile - root_ext_map[root].add(ext) - root_set.add(root) - return root_ext_map, sorted(list(root_set)) - - def _get_collection_size_time_uid(directory): """Get size of directory (in bytes), when it was last modified, and uid""" total_bytes = 0 @@ -491,7 +478,8 @@ def describe_table(source_dataset_path, scheme, **kwargs): def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max, exclude_regex=None, exclude_hidden=True, - describe_files=False, backup=True, **kwargs): + describe_files=False, backup=True, target_filename=None, + **kwargs): """Create a single metadata document to describe a collection of files. 
Describe all the files within a directory as members of a "collection". @@ -528,49 +516,48 @@ def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max, file_list = _list_files_with_depth(directory, depth, exclude_regex, exclude_hidden) - root_ext_map, root_list = _group_files_by_root(file_list) - items = [] collection_crs_set = set() item_spatial_list = [] - for root in root_list: - extensions = root_ext_map[root] - if '.shp' in extensions: - # if we're dealing with a shapefile, we do not want to describe any - # of these other files with the same root name - extensions.difference_update(['.shx', '.sbn', '.sbx', '.prj', '.dbf', '.cpg']) - # Only drop .yml if its sidecar file, i.e. the corresponding data file - # (root) exists on disk - if '.yml' in extensions and os.path.exists(root): - extensions.discard('.yml') - for ext in extensions: - filepath = os.path.join(directory, f'{root}{ext}') - try: - item_resource = describe(filepath, **kwargs) - if item_resource.spatial is not None: - collection_crs_set.add(item_resource.spatial.crs) - item_spatial_list.append(item_resource.spatial) - - except ValueError: - # if file type isn't supported by geometamaker, e.g. pdf - # or if trying to describe a dir - item_resource = None - - if describe_files and item_resource: - item_resource.write(backup=backup) - - if ext and os.path.exists(filepath + '.yml'): - metadata_yml = f'{root}{ext}' + '.yml' - else: - metadata_yml = '' - - collection_item = models.CollectionItemSchema( - path=f'{root}{ext}', - description=item_resource.description if item_resource else '', - metadata=metadata_yml - ) - items.append(collection_item) + # These extensions almost always represent sidecar files that should + # not be described in isolation. Typically, these are components of a + # shapefile, but '.dbf' can also represent a raster attribute table. + # Theoretically a DBF can also be a standalone table, but that + # is not currently supported by this function. 
+ skip_extensions = [ + '.shx', '.sbn', '.sbx', '.prj', '.dbf', '.cpg', '.qix', '.xml', '.tfw', + '.qlr', '.lyr', '.qpj', '.yml'] + for rel_filepath in file_list: + abs_filepath = os.path.join(directory, rel_filepath) + root, extension = os.path.splitext(abs_filepath) + if extension.lower() in skip_extensions: + continue + try: + item_resource = describe(abs_filepath, **kwargs) + if item_resource.spatial is not None: + collection_crs_set.add(item_resource.spatial.crs) + item_spatial_list.append(item_resource.spatial) + + except ValueError: + # if file type isn't supported by geometamaker, e.g. pdf + # or if trying to describe a dir + item_resource = None + + if describe_files and item_resource: + item_resource.write(backup=backup) + + if os.path.exists(f'{abs_filepath}.yml'): + metadata_yml = f'{rel_filepath}.yml' + else: + metadata_yml = '' + + collection_item = models.CollectionItemSchema( + path=rel_filepath, + description=item_resource.description if item_resource else '', + metadata=metadata_yml + ) + items.append(collection_item) total_bytes, last_modified, uid = _get_collection_size_time_uid(directory) @@ -621,8 +608,10 @@ def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max, ) # Check if there is existing metadata for the collection + if not target_filename: + target_filename = f'{os.path.basename(directory)}-metadata.yml' + metadata_path = os.path.join(directory, target_filename) try: - metadata_path = f'{directory}-metadata.yml' existing_metadata = models.CollectionResource.load(metadata_path) # Copy any existing item descriptions from existing yml to new metadata @@ -658,6 +647,7 @@ def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max, # Add profile metadata config = Config() resource = resource.replace(config.profile) + resource.metadata_path = metadata_path return resource @@ -673,7 +663,8 @@ def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max, 'archive': models.ArchiveResource, 'table': 
models.TableResource, 'vector': models.VectorResource, - 'raster': models.RasterResource + 'raster': models.RasterResource, + 'collection': models.CollectionResource } @@ -694,8 +685,11 @@ def describe(source_dataset_path, compute_stats=False): Returns: geometamaker.models.Resource: a metadata object - """ + Raises: + ValueError if the file type of the dataset is not supported. + FileNotFoundError if the path does not exist. + """ metadata_path = f'{source_dataset_path}.yml' if os.path.isdir(source_dataset_path): diff --git a/src/geometamaker/models.py b/src/geometamaker/models.py index 95deb5b..a421f24 100644 --- a/src/geometamaker/models.py +++ b/src/geometamaker/models.py @@ -751,7 +751,7 @@ def get_url(self): return self.url def write(self, workspace=None, backup=True): - """Write datapackage yaml to disk. + """Write metadata yaml to disk. This creates sidecar files with '.yml' appended to the full filename of the data source. For example, @@ -759,12 +759,15 @@ def write(self, workspace=None, backup=True): - 'myraster.tif' - 'myraster.tif.yml' + For a ``CollectionResource``, the name of the target YML document can + be specified with the ``target_filename`` argument of + ``describe_collection``. + Args: workspace (str): if ``None``, files write to the same location as the source data. If not ``None``, a path to a local directory - to write files. They will still be named to match the source - filename. Use this option if the source data is not on the local - filesystem. + to write the file. Use this option if the source data is not on + the local filesystem. backup (bool): whether to write a backup of a pre-existing metadata file before ovewriting it in cases where that file is not a valid geometamaker document. 
diff --git a/tests/test_geometamaker.py b/tests/test_geometamaker.py index 12e436e..1311b04 100644 --- a/tests/test_geometamaker.py +++ b/tests/test_geometamaker.py @@ -950,8 +950,12 @@ def test_describe_collection_spatial_single_crs(self): self.assertEqual(resource.spatial.crs_units, 'metre') self.assertEqual(resource.spatial.bounding_box.to_list(), [0, 0, 2, 2]) - def test_describe_collection_spatial_multiple_crs(self): - """Test describe_collection spatial section represents union.""" + def test_describe_collection_multiple_crs_and_formats(self): + """Test describe_collection: multiple file formats and spatial extents. + + Spatial section of the collection should represent the union + of the extents of the items. + """ import geometamaker collection_path = os.path.join(self.workspace_dir, "collection") @@ -963,12 +967,30 @@ def test_describe_collection_spatial_multiple_crs(self): raster2_path = os.path.join(collection_path, 'raster2.tif') create_raster(numpy.int16, raster2_path, projection_epsg=4326, origin=(2, 2)) + csv_path = os.path.join(collection_path, 'table.csv') + with open(csv_path, 'w') as file: + file.write('a,b,c') resource = geometamaker.describe_collection(collection_path) self.assertEqual(resource.spatial.crs, 'EPSG:4326') self.assertEqual(resource.spatial.crs_units, 'degree') self.assertEqual(resource.spatial.bounding_box.to_list(), [0, 0, 4, 4]) + def test_describe_collection_raster_dbf_tables(self): + """Test describe_collection: when raster has a DBF table.""" + import geometamaker + + collection_path = os.path.join(self.workspace_dir, "collection") + os.mkdir(collection_path) + + test_data_dir = os.path.join(os.path.dirname(__file__), 'data') + shutil.copy(os.path.join(test_data_dir, 'testrat.tif'), + os.path.join(collection_path, 'testrat.tif')) + shutil.copy(os.path.join(test_data_dir, 'testrat.tif.vat.dbf'), + os.path.join(collection_path, 'testrat.tif.vat.dbf')) + resource = geometamaker.describe_collection(collection_path) + 
self.assertEqual(len(resource.items), 1) + def test_describe_collection_spatial_no_crs(self): """Test describe_collection spatial section is None.""" import geometamaker @@ -1014,7 +1036,10 @@ def test_describe_collection_with_depth(self): metadata = geometamaker.describe_collection( collection_path, depth=1, exclude_regex="exclude_this*") metadata.write() - self.assertTrue(os.path.exists(collection_path+"-metadata.yml")) + default_target = os.path.join( + collection_path, + f'{os.path.basename(collection_path)}-metadata.yml') + self.assertTrue(os.path.exists(default_target)) # assert that with depth=1, items list only includes csv and # subdir and excludes exclude_this.csv self.assertEqual(len(metadata.items), 2) @@ -1022,13 +1047,13 @@ def test_describe_collection_with_depth(self): geometamaker.describe_collection( collection_path, depth=1, exclude_regex="exclude_this*", describe_files=True) - self.assertTrue(os.path.exists(csv_path+".yml")) - self.assertFalse(os.path.exists(raster_path+".yml")) - self.assertFalse(os.path.exists(csv_path_excluded+".yml")) + self.assertTrue(os.path.exists(f'{csv_path}.yml')) + self.assertFalse(os.path.exists(f'{raster_path}.yml')) + self.assertFalse(os.path.exists(f'{csv_path_excluded}.yml')) geometamaker.describe_collection(collection_path, depth=2, describe_files=True) - self.assertTrue(os.path.exists(raster_path+".yml")) + self.assertTrue(os.path.exists(f'{raster_path}.yml')) def test_describe_collection_existing_yml(self): """test `describe_collection` does not overwrite existing attributes""" @@ -1064,7 +1089,9 @@ def test_describe_collection_preexisting_invalid_yml(self): os.mkdir(collection_path) # Setup an incompatible yml file at the expected path - target_yml_path = f'{collection_path}-metadata.yml' + target_yml_path = os.path.join( + collection_path, + f'{os.path.basename(collection_path)}-metadata.yml') with open(target_yml_path, 'w') as file: file.write(yaml.dump({'foo': 'bar'})) @@ -1082,6 +1109,29 @@ def 
test_describe_collection_preexisting_invalid_yml(self): self.assertIn(msg1, actualMessages) self.assertIn(msg2, actualMessages) + def test_describe_collection_custom_target(self): + """test `describe_collection`: user-specified target filename.""" + import geometamaker + + # Create collection with 1 item + collection_path = os.path.join(self.workspace_dir, "collection") + os.mkdir(collection_path) + + csv_path = os.path.join(collection_path, 'table.csv') + with open(csv_path, 'w') as file: + file.write('a,b,c') + + target_filename = 'README.yml' + resource = geometamaker.describe_collection( + collection_path, target_filename=target_filename) + resource.write() + self.assertTrue( + os.path.exists(os.path.join(collection_path, target_filename))) + + resource.write(workspace=self.workspace_dir) + self.assertTrue( + os.path.exists(os.path.join(self.workspace_dir, target_filename))) + def test_describe_directory_error(self): """Test that `describing` a directory raises useful error""" import geometamaker @@ -1351,7 +1401,10 @@ def test_cli_describe_directory(self): self.assertEqual(result.exit_code, 0) self.assertEqual(result.output, '') self.assertTrue(os.path.exists(f'{datasource_path}.yml')) - self.assertTrue(os.path.exists(f'{self.workspace_dir}-metadata.yml')) + default_target = os.path.join( + self.workspace_dir, + f'{os.path.basename(self.workspace_dir)}-metadata.yml') + self.assertTrue(os.path.exists(default_target)) def test_cli_describe_directory_collection_options(self): """CLI: test describe with a directory with various options.""" @@ -1364,19 +1417,26 @@ def test_cli_describe_directory_collection_options(self): result = runner.invoke( cli.cli, ['describe', '--no-write', '--collection-only', self.workspace_dir]) + + default_target = os.path.join( + self.workspace_dir, + f'{os.path.basename(self.workspace_dir)}-metadata.yml') self.assertEqual(result.exit_code, 0) # one of many things expected to print to stdout: self.assertIn('last_modified', result.output) 
self.assertFalse(os.path.exists(f'{datasource_path}.yml')) - self.assertFalse(os.path.exists(f'{self.workspace_dir}-metadata.yml')) + self.assertFalse(os.path.exists(default_target)) + target_filename = 'README.yml' result = runner.invoke( cli.cli, - ['describe', '--collection-only', self.workspace_dir]) + ['describe', '--collection-only', self.workspace_dir, + '-o', target_filename]) self.assertEqual(result.exit_code, 0) self.assertEqual(result.output, '') self.assertFalse(os.path.exists(f'{datasource_path}.yml')) - self.assertTrue(os.path.exists(f'{self.workspace_dir}-metadata.yml')) + self.assertTrue( + os.path.exists(os.path.join(self.workspace_dir, target_filename))) def test_cli_validate_valid(self): """CLI: test validate emits stdout for valid document."""