Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions autotest/ogr/ogr_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -982,3 +982,42 @@ def test_ogr_arrow_convert_from_geoarrow_wkb_with_extension_loaded(tmp_path):
with gdal.OpenEx(tmp_path / "out.feather") as ds:
lyr = ds.GetLayer(0)
assert lyr.GetFeatureCount() == 9


###############################################################################


def test_ogr_arrow_lists_as_string_json():
    """Open the Feather test file with LISTS_AS_STRING_JSON=YES and check that
    list fields are declared as String/JSON and serialized as JSON text."""

    ds = gdal.OpenEx(
        "data/arrow/test.feather", open_options=["LISTS_AS_STRING_JSON=YES"]
    )
    layer = ds.GetLayer(0)
    defn = layer.GetLayerDefn()

    # One variable-size list and one fixed-size list are checked for the
    # String type with the JSON subtype.
    for field_name in ("list_boolean", "fixed_size_list_float64"):
        assert defn.GetFieldDefn(defn.GetFieldIndex(field_name)).GetType() == (
            ogr.OFTString
        )
        assert defn.GetFieldDefn(defn.GetFieldIndex(field_name)).GetSubType() == (
            ogr.OFSTJSON
        )

    # Feature 4 holds lists whose first element is null; the JSON text is
    # expected to spell it out as "null".
    feature = layer.GetFeature(4)
    expected_values = (
        ("list_boolean", "[null,false,true,false]"),
        ("list_uint8", "[null,7,8,9]"),
        ("list_int64", "[null,7,8,9]"),
        ("list_float64", "[null,7.5,8.5,9.5]"),
        ("list_string", "[null]"),
        ("fixed_size_list_float64", "[8.0,9.0]"),
    )
    for field_name, json_text in expected_values:
        assert feature[field_name] == json_text
39 changes: 39 additions & 0 deletions autotest/ogr/ogr_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -4684,3 +4684,42 @@ def test_ogr_parquet_arrow_stream_list_of_struct_ignored_fields():
{"a": 1, "b": 2},
{"a": 1, "b": 2},
]


###############################################################################


def test_ogr_parquet_lists_as_string_json():
    """Open the Parquet test file with LISTS_AS_STRING_JSON=YES and check that
    list fields are declared as String/JSON and serialized as JSON text."""

    ds = gdal.OpenEx(
        "data/parquet/test.parquet", open_options=["LISTS_AS_STRING_JSON=YES"]
    )
    layer = ds.GetLayer(0)
    defn = layer.GetLayerDefn()

    # One variable-size list and one fixed-size list are checked for the
    # String type with the JSON subtype.
    for field_name in ("list_boolean", "fixed_size_list_float64"):
        assert defn.GetFieldDefn(defn.GetFieldIndex(field_name)).GetType() == (
            ogr.OFTString
        )
        assert defn.GetFieldDefn(defn.GetFieldIndex(field_name)).GetSubType() == (
            ogr.OFSTJSON
        )

    # Feature 4 holds lists whose first element is null; the JSON text is
    # expected to spell it out as "null".
    feature = layer.GetFeature(4)
    expected_values = (
        ("list_boolean", "[null,false,true,false]"),
        ("list_uint8", "[null,7,8,9]"),
        ("list_int64", "[null,7,8,9]"),
        ("list_float64", "[null,7.5,8.5,9.5]"),
        ("list_string", "[null]"),
        ("fixed_size_list_float64", "[8.0,9.0]"),
    )
    for field_name, json_text in expected_values:
        assert feature[field_name] == json_text
17 changes: 17 additions & 0 deletions doc/source/drivers/vector/arrow.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,23 @@ Driver capabilities

.. supports_virtualio::

Open options
------------

|about-open-options|
The following open options are supported:

- .. oo:: LISTS_AS_STRING_JSON
:choices: YES, NO
:default: NO
:since: 3.12.1

Whether lists of strings/integers/reals should be reported as String(JSON)
fields rather than String/Integer[64]/RealList.
Useful when null values in such lists must be preserved as nulls,
instead of being omitted (for lists of strings) or set to 0 (for lists of
booleans, integers or reals).

Creation issues
---------------

Expand Down
11 changes: 11 additions & 0 deletions doc/source/drivers/vector/parquet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,17 @@ The following open options are supported:
The string is typically formatted as CODE:AUTH (e.g "EPSG:4326"), or can
be a PROJ.4 or WKT CRS string.

- .. oo:: LISTS_AS_STRING_JSON
:choices: YES, NO
:default: NO
:since: 3.12.1

Whether lists of strings/integers/reals should be reported as String(JSON)
fields rather than String/Integer[64]/RealList.
Useful when null values in such lists must be preserved as nulls,
instead of being omitted (for lists of strings) or set to 0 (for lists of
booleans, integers or reals).

Creation issues
---------------

Expand Down
1 change: 1 addition & 0 deletions doc/source/spelling_wordlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2816,6 +2816,7 @@ ReadCompressedData
readdir
ReadDir
ReadRaster
RealList
realloc
rebased
Recherches
Expand Down
6 changes: 4 additions & 2 deletions ogr/ogrsf_frmts/arrow/ogr_feather.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,14 @@ class OGRFeatherLayer final : public OGRArrowLayer
public:
OGRFeatherLayer(OGRFeatherDataset *poDS, const char *pszLayerName,
std::shared_ptr<arrow::ipc::RecordBatchFileReader>
&poRecordBatchFileReader);
&poRecordBatchFileReader,
CSLConstList papszOpenOptions);
OGRFeatherLayer(OGRFeatherDataset *poDS, const char *pszLayerName,
std::shared_ptr<arrow::io::RandomAccessFile> poFile,
bool bSeekable, const arrow::ipc::IpcReadOptions &oOptions,
std::shared_ptr<arrow::ipc::RecordBatchStreamReader>
&poRecordBatchStreamReader);
&poRecordBatchStreamReader,
CSLConstList papszOpenOptions);

void ResetReading() override;
int TestCapability(const char *pszCap) const override;
Expand Down
4 changes: 2 additions & 2 deletions ogr/ogrsf_frmts/arrow/ogrfeatherdriver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ static GDALDataset *OGRFeatherDriverOpen(GDALOpenInfo *poOpenInfo)
osLayername = "layer";
auto poLayer = std::make_unique<OGRFeatherLayer>(
poDS.get(), osLayername.c_str(), infile, bSeekable, options,
poRecordBatchStreamReader);
poRecordBatchStreamReader, poOpenInfo->papszOpenOptions);
poDS->SetLayer(std::move(poLayer));

// Pre-load field domains, as this depends on the first record batch
Expand Down Expand Up @@ -270,7 +270,7 @@ static GDALDataset *OGRFeatherDriverOpen(GDALOpenInfo *poOpenInfo)
auto poRecordBatchReader = *result;
auto poLayer = std::make_unique<OGRFeatherLayer>(
poDS.get(), CPLGetBasenameSafe(poOpenInfo->pszFilename).c_str(),
poRecordBatchReader);
poRecordBatchReader, poOpenInfo->papszOpenOptions);
poDS->SetLayer(std::move(poLayer));
}
return poDS.release();
Expand Down
10 changes: 10 additions & 0 deletions ogr/ogrsf_frmts/arrow/ogrfeatherdrivercore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,16 @@ void OGRFeatherDriverSetCommonMetadata(GDALDriver *poDriver)
"WidthPrecision Nullable "
"Comment AlternativeName Domain");

poDriver->SetMetadataItem(
GDAL_DMD_OPENOPTIONLIST,
"<OpenOptionList>"
" <Option name='LISTS_AS_STRING_JSON' type='boolean' description='"
"Whether lists of strings/integers/reals should be reported as "
"String(JSON) fields rather than String/Integer[64]/RealList. Useful "
"when null values in such lists must be exactly mapped as such.' "
"default='NO'/>"
"</OpenOptionList>");

poDriver->pfnIdentify = OGRFeatherDriverIdentify;
poDriver->SetMetadataItem(GDAL_DCAP_OPEN, "YES");
poDriver->SetMetadataItem(GDAL_DCAP_CREATE, "YES");
Expand Down
20 changes: 13 additions & 7 deletions ogr/ogrsf_frmts/arrow/ogrfeatherlayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,12 @@

OGRFeatherLayer::OGRFeatherLayer(
OGRFeatherDataset *poDS, const char *pszLayerName,
std::shared_ptr<arrow::ipc::RecordBatchFileReader> &poRecordBatchFileReader)
: OGRArrowLayer(poDS, pszLayerName), m_poDS(poDS),
m_poRecordBatchFileReader(poRecordBatchFileReader)
std::shared_ptr<arrow::ipc::RecordBatchFileReader> &poRecordBatchFileReader,
CSLConstList papszOpenOptions)
: OGRArrowLayer(poDS, pszLayerName,
CPLTestBool(CSLFetchNameValueDef(
papszOpenOptions, "LISTS_AS_STRING_JSON", "NO"))),
m_poDS(poDS), m_poRecordBatchFileReader(poRecordBatchFileReader)
{
EstablishFeatureDefn();
CPLAssert(static_cast<int>(m_aeGeomEncoding.size()) ==
Expand All @@ -51,10 +54,13 @@ OGRFeatherLayer::OGRFeatherLayer(
std::shared_ptr<arrow::io::RandomAccessFile> poFile, bool bSeekable,
const arrow::ipc::IpcReadOptions &oOptions,
std::shared_ptr<arrow::ipc::RecordBatchStreamReader>
&poRecordBatchStreamReader)
: OGRArrowLayer(poDS, pszLayerName), m_poDS(poDS),
m_poFile(std::move(poFile)), m_bSeekable(bSeekable), m_oOptions(oOptions),
m_poRecordBatchReader(poRecordBatchStreamReader)
&poRecordBatchStreamReader,
CSLConstList papszOpenOptions)
: OGRArrowLayer(poDS, pszLayerName,
CPLTestBool(CSLFetchNameValueDef(
papszOpenOptions, "LISTS_AS_STRING_JSON", "NO"))),
m_poDS(poDS), m_poFile(std::move(poFile)), m_bSeekable(bSeekable),
m_oOptions(oOptions), m_poRecordBatchReader(poRecordBatchStreamReader)
{
EstablishFeatureDefn();
CPLAssert(static_cast<int>(m_aeGeomEncoding.size()) ==
Expand Down
4 changes: 3 additions & 1 deletion ogr/ogrsf_frmts/arrow_common/ogr_arrow.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ class OGRArrowLayer CPL_NON_FINAL

protected:
OGRArrowDataset *m_poArrowDS = nullptr;
const bool m_bListsAsStringJson;
arrow::MemoryPool *m_poMemoryPool = nullptr;
OGRFeatureDefn *m_poFeatureDefn = nullptr;
std::shared_ptr<arrow::Schema> m_poSchema{};
Expand Down Expand Up @@ -228,7 +229,8 @@ class OGRArrowLayer CPL_NON_FINAL

void LoadGDALMetadata(const arrow::KeyValueMetadata *kv_metadata);

OGRArrowLayer(OGRArrowDataset *poDS, const char *pszLayerName);
OGRArrowLayer(OGRArrowDataset *poDS, const char *pszLayerName,
bool bListsAsStringJson);

virtual std::string GetDriverUCName() const = 0;
static bool IsIntegerArrowType(arrow::Type::type typeId);
Expand Down
45 changes: 39 additions & 6 deletions ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,10 @@ inline IOGRArrowLayer::~IOGRArrowLayer() = default;
/************************************************************************/

inline OGRArrowLayer::OGRArrowLayer(OGRArrowDataset *poDS,
const char *pszLayerName)
: m_poArrowDS(poDS), m_poMemoryPool(poDS->GetMemoryPool())
const char *pszLayerName,
bool bListsAsStringJson)
: m_poArrowDS(poDS), m_bListsAsStringJson(bListsAsStringJson),
m_poMemoryPool(poDS->GetMemoryPool())
{
m_poFeatureDefn = new OGRFeatureDefn(pszLayerName);
m_poFeatureDefn->SetGeomType(wkbNone);
Expand Down Expand Up @@ -522,6 +524,13 @@ inline bool OGRArrowLayer::MapArrowTypeToOGR(
break;
}
}

if (bTypeOK && m_bListsAsStringJson)
{
eType = OFTString;
eSubType = OFSTJSON;
}

break;
}

Expand Down Expand Up @@ -2525,8 +2534,20 @@ inline OGRFeature *OGRArrowLayer::ReadFeature(
static_cast<const arrow::ListArray *>(array);
const auto listType = static_cast<const arrow::ListType *>(
array->data()->type.get());
ReadList(poFeature, i, nIdxInBatch, castArray,
listType->value_field()->type()->id());

if (m_bListsAsStringJson)
{
poFeature->SetField(
i, GetListAsJSON(castArray,
static_cast<size_t>(nIdxInBatch))
.Format(CPLJSONObject::PrettyFormat::Plain)
.c_str());
}
else
{
ReadList(poFeature, i, nIdxInBatch, castArray,
listType->value_field()->type()->id());
}
break;
}

Expand All @@ -2537,8 +2558,20 @@ inline OGRFeature *OGRArrowLayer::ReadFeature(
const auto listType =
static_cast<const arrow::FixedSizeListType *>(
array->data()->type.get());
ReadList(poFeature, i, nIdxInBatch, castArray,
listType->value_field()->type()->id());

if (m_bListsAsStringJson)
{
poFeature->SetField(
i, GetListAsJSON(castArray,
static_cast<size_t>(nIdxInBatch))
.Format(CPLJSONObject::PrettyFormat::Plain)
.c_str());
}
else
{
ReadList(poFeature, i, nIdxInBatch, castArray,
listType->value_field()->type()->id());
}
break;
}

Expand Down
5 changes: 5 additions & 0 deletions ogr/ogrsf_frmts/parquet/ogrparquetdrivercore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,11 @@ void OGRParquetDriverSetCommonMetadata(GDALDriver *poDriver)
" <Option name='CRS' type='string' "
"description='Set/override CRS, typically defined as AUTH:CODE "
"(e.g EPSG:4326), of geometry column(s)'/>"
" <Option name='LISTS_AS_STRING_JSON' type='boolean' description='"
"Whether lists of strings/integers/reals should be reported as "
"String(JSON) fields rather than String/Integer[64]/RealList. Useful "
"when null values in such lists must be exactly mapped as such.' "
"default='NO'/>"
"</OpenOptionList>");

poDriver->pfnIdentify = OGRParquetDriverIdentify;
Expand Down
5 changes: 4 additions & 1 deletion ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@
OGRParquetLayerBase::OGRParquetLayerBase(OGRParquetDataset *poDS,
const char *pszLayerName,
CSLConstList papszOpenOptions)
: OGRArrowLayer(poDS, pszLayerName), m_poDS(poDS),
: OGRArrowLayer(poDS, pszLayerName,
CPLTestBool(CSLFetchNameValueDef(
papszOpenOptions, "LISTS_AS_STRING_JSON", "NO"))),
m_poDS(poDS),
m_aosGeomPossibleNames(CSLTokenizeString2(
CSLFetchNameValueDef(papszOpenOptions, "GEOM_POSSIBLE_NAMES",
"geometry,wkb_geometry,wkt_geometry"),
Expand Down
Loading