Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,18 @@ Unreleased Changes
table will be included as band metadata under the 'raster_attribute_table'
key. It can be retrieved by the ``get_rat`` method of a ``RasterResource``
instance. https://github.com/natcap/geometamaker/issues/25
* Changed the default path to write a collection's metadata document. The
document will now be created within the directory being described, rather
than as a sibling of the directory.
https://github.com/natcap/geometamaker/issues/126
* Added a ``target_filename`` parameter to ``describe_collection`` and
the ``-o`` or ``--output`` option to ``geometamaker describe``. These
parameters are optional and allow users to specify the target filename
of the YML document that will be created for a collection.
https://github.com/natcap/geometamaker/issues/125
* Fixed a bug where ``describe_collection`` would improperly try to add
metadata to a DBF sidecar file as if it was a standalone dataset.
https://github.com/natcap/geometamaker/issues/124
* The Natural Capital Project changed its name to the Natural Capital Alliance.
References to the old name and website URL have been updated to reflect
this change. https://github.com/natcap/geometamaker/issues/115
Expand Down
16 changes: 13 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ or limiting the number of subdirectory levels to traverse using the
```python
import geometamaker

collection_path = 'invest/data/invest-sample-data'
collection_path = 'data/invest-sample-data'
metadata = geometamaker.describe_collection(collection_path,
depth=2,
exclude_regex=r'.*\.json$',
Expand All @@ -95,8 +95,18 @@ metadata.write()
```
geometamaker describe -d 2 --exclude .*\.json$ data/invest-sample-data
```
These examples will create `invest-sample-data-metadata.yml` as well as
create individual `.yml` documents for each dataset within the directory.
These examples will create `data/invest-sample-data/invest-sample-data-metadata.yml`
as well as create individual `.yml` documents for each dataset within the directory.

#### Override the default filename of the collection's YML document
```python
geometamaker.describe_collection(collection_path, target_filename='README.yml')
```
or
```
geometamaker describe data/invest-sample-data -o README.yml
```
These examples will create `data/invest-sample-data/README.yml`.

### Validating a metadata document:
If you have manually edited a `.yml` metadata document,
Expand Down
11 changes: 9 additions & 2 deletions src/geometamaker/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,14 @@ def convert(self, value, param, ctx):
help='If FILEPATH is a directory, do not write metadata documents'
' for all files in the directory. Only create a single'
' *-metadata.yml document for the collection')
@click.option('-o', '--output', 'target_filename',
default=None,
help='if FILEPATH is a directory, this is the filename of the'
' target YML document to be created within the directory.'
' If output is not specified, the filename will be'
' <directory_name>-metadata.yml.')
def describe(filepath, depth, exclude, all_files, no_write, stats,
collection_only):
collection_only, target_filename):
describing_single = True # if filepath is a file, or collection_only=True
if os.path.isdir(filepath):
resource = geometamaker.describe_collection(
Expand All @@ -116,7 +122,8 @@ def describe(filepath, depth, exclude, all_files, no_write, stats,
exclude_regex=exclude,
exclude_hidden=(not all_files),
describe_files=(not collection_only),
compute_stats=stats)
compute_stats=stats,
target_filename=target_filename)
describing_single = collection_only
else:
resource = geometamaker.describe(filepath, compute_stats=stats)
Expand Down
106 changes: 50 additions & 56 deletions src/geometamaker/geometamaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,19 +171,6 @@ def _list_files_with_depth(directory, depth, exclude_regex=None,
return sorted(file_list)


def _group_files_by_root(file_list):
"""Get set of files (roots) and extensions by filename"""
root_set = set()
root_ext_map = defaultdict(set)
for filepath in file_list:
root, ext = os.path.splitext(filepath)
# tracking which files share a root name
# so we can check if these comprise a shapefile
root_ext_map[root].add(ext)
root_set.add(root)
return root_ext_map, sorted(list(root_set))


def _get_collection_size_time_uid(directory):
"""Get size of directory (in bytes), when it was last modified, and uid"""
total_bytes = 0
Expand Down Expand Up @@ -491,7 +478,8 @@ def describe_table(source_dataset_path, scheme, **kwargs):

def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max,
exclude_regex=None, exclude_hidden=True,
describe_files=False, backup=True, **kwargs):
describe_files=False, backup=True, target_filename=None,
**kwargs):
"""Create a single metadata document to describe a collection of files.

Describe all the files within a directory as members of a "collection".
Expand Down Expand Up @@ -528,49 +516,48 @@ def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max,
file_list = _list_files_with_depth(directory, depth, exclude_regex,
exclude_hidden)

root_ext_map, root_list = _group_files_by_root(file_list)

items = []
collection_crs_set = set()
item_spatial_list = []

for root in root_list:
extensions = root_ext_map[root]
if '.shp' in extensions:
# if we're dealing with a shapefile, we do not want to describe any
# of these other files with the same root name
extensions.difference_update(['.shx', '.sbn', '.sbx', '.prj', '.dbf', '.cpg'])
# Only drop .yml if it's a sidecar file, i.e. the corresponding data file
# (root) exists on disk
if '.yml' in extensions and os.path.exists(root):
extensions.discard('.yml')
for ext in extensions:
filepath = os.path.join(directory, f'{root}{ext}')
try:
item_resource = describe(filepath, **kwargs)
if item_resource.spatial is not None:
collection_crs_set.add(item_resource.spatial.crs)
item_spatial_list.append(item_resource.spatial)

except ValueError:
# if file type isn't supported by geometamaker, e.g. pdf
# or if trying to describe a dir
item_resource = None

if describe_files and item_resource:
item_resource.write(backup=backup)

if ext and os.path.exists(filepath + '.yml'):
metadata_yml = f'{root}{ext}' + '.yml'
else:
metadata_yml = ''

collection_item = models.CollectionItemSchema(
path=f'{root}{ext}',
description=item_resource.description if item_resource else '',
metadata=metadata_yml
)
items.append(collection_item)
# These extensions almost always represent sidecar files that should
# not be described in isolation. Typically, these are components of a
# shapefile, but '.dbf' can also represent a raster attribute table.
# Theoretically a DBF can also be a standalone table, but that
# is not currently supported by this function.
skip_extensions = [
'.shx', '.sbn', '.sbx', '.prj', '.dbf', '.cpg', '.qix', '.xml', '.tfw',
'.qlr', '.lyr', '.qpj', '.yml']
for rel_filepath in file_list:
abs_filepath = os.path.join(directory, rel_filepath)
root, extension = os.path.splitext(abs_filepath)
if extension.lower() in skip_extensions:
continue
try:
item_resource = describe(abs_filepath, **kwargs)
if item_resource.spatial is not None:
collection_crs_set.add(item_resource.spatial.crs)
item_spatial_list.append(item_resource.spatial)

except ValueError:
# if file type isn't supported by geometamaker, e.g. pdf
# or if trying to describe a dir
item_resource = None

if describe_files and item_resource:
item_resource.write(backup=backup)

if os.path.exists(f'{abs_filepath}.yml'):
metadata_yml = f'{rel_filepath}.yml'
else:
metadata_yml = ''

collection_item = models.CollectionItemSchema(
path=rel_filepath,
description=item_resource.description if item_resource else '',
metadata=metadata_yml
)
items.append(collection_item)

total_bytes, last_modified, uid = _get_collection_size_time_uid(directory)

Expand Down Expand Up @@ -621,8 +608,10 @@ def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max,
)

# Check if there is existing metadata for the collection
if not target_filename:
target_filename = f'{os.path.basename(directory)}-metadata.yml'
metadata_path = os.path.join(directory, target_filename)
try:
metadata_path = f'{directory}-metadata.yml'
existing_metadata = models.CollectionResource.load(metadata_path)

# Copy any existing item descriptions from existing yml to new metadata
Expand Down Expand Up @@ -658,6 +647,7 @@ def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max,
# Add profile metadata
config = Config()
resource = resource.replace(config.profile)
resource.metadata_path = metadata_path

return resource

Expand All @@ -673,7 +663,8 @@ def describe_collection(directory, depth=numpy.iinfo(numpy.int16).max,
'archive': models.ArchiveResource,
'table': models.TableResource,
'vector': models.VectorResource,
'raster': models.RasterResource
'raster': models.RasterResource,
'collection': models.CollectionResource
}


Expand All @@ -694,8 +685,11 @@ def describe(source_dataset_path, compute_stats=False):
Returns:
geometamaker.models.Resource: a metadata object

"""
Raises:
ValueError if the file type of the dataset is not supported.
FileNotFoundError if the path does not exist.

"""
metadata_path = f'{source_dataset_path}.yml'

if os.path.isdir(source_dataset_path):
Expand Down
11 changes: 7 additions & 4 deletions src/geometamaker/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -751,20 +751,23 @@ def get_url(self):
return self.url

def write(self, workspace=None, backup=True):
"""Write datapackage yaml to disk.
"""Write metadata yaml to disk.

This creates sidecar files with '.yml'
appended to the full filename of the data source. For example,

- 'myraster.tif'
- 'myraster.tif.yml'

For a ``CollectionResource``, the name of the target YML document can
be specified with the ``target_filename`` argument of
``describe_collection``.

Args:
workspace (str): if ``None``, files write to the same location
as the source data. If not ``None``, a path to a local directory
to write files. They will still be named to match the source
filename. Use this option if the source data is not on the local
filesystem.
to write the file. Use this option if the source data is not on
the local filesystem.
backup (bool): whether to write a backup of a pre-existing metadata
file before overwriting it in cases where that file is not a valid
geometamaker document.
Expand Down
Loading
Loading