Skip to content

Commit f6e7a4b

Browse files
committed
Arrow/Parquet: add a LISTS_AS_STRING_JSON=YES/NO open option
- .. oo:: LISTS_AS_STRING_JSON :choices: YES, NO :default: NO :since: 3.12.1 Whether lists of strings/booleans/integers/reals should be reported as String(JSON) fields rather than StringList/IntegerList/Integer64List/RealList fields. Useful when null values in such lists must be exactly mapped as such, instead of being omitted (for lists of strings), or set to 0 (for lists of boolean, integer or real). Fixes #13448
1 parent d9cd4aa commit f6e7a4b

File tree

12 files changed

+186
-19
lines changed

12 files changed

+186
-19
lines changed

autotest/ogr/ogr_arrow.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -982,3 +982,42 @@ def test_ogr_arrow_convert_from_geoarrow_wkb_with_extension_loaded(tmp_path):
982982
with gdal.OpenEx(tmp_path / "out.feather") as ds:
983983
lyr = ds.GetLayer(0)
984984
assert lyr.GetFeatureCount() == 9
985+
986+
987+
###############################################################################
988+
989+
990+
def test_ogr_arrow_lists_as_string_json():
991+
992+
ds = gdal.OpenEx(
993+
"data/arrow/test.feather", open_options=["LISTS_AS_STRING_JSON=YES"]
994+
)
995+
lyr = ds.GetLayer(0)
996+
lyr_defn = lyr.GetLayerDefn()
997+
assert (
998+
lyr_defn.GetFieldDefn(lyr_defn.GetFieldIndex("list_boolean")).GetType()
999+
== ogr.OFTString
1000+
)
1001+
assert (
1002+
lyr_defn.GetFieldDefn(lyr_defn.GetFieldIndex("list_boolean")).GetSubType()
1003+
== ogr.OFSTJSON
1004+
)
1005+
assert (
1006+
lyr_defn.GetFieldDefn(
1007+
lyr_defn.GetFieldIndex("fixed_size_list_float64")
1008+
).GetType()
1009+
== ogr.OFTString
1010+
)
1011+
assert (
1012+
lyr_defn.GetFieldDefn(
1013+
lyr_defn.GetFieldIndex("fixed_size_list_float64")
1014+
).GetSubType()
1015+
== ogr.OFSTJSON
1016+
)
1017+
f = lyr.GetFeature(4)
1018+
assert f["list_boolean"] == "[null,false,true,false]"
1019+
assert f["list_uint8"] == "[null,7,8,9]"
1020+
assert f["list_int64"] == "[null,7,8,9]"
1021+
assert f["list_float64"] == "[null,7.5,8.5,9.5]"
1022+
assert f["list_string"] == "[null]"
1023+
assert f["fixed_size_list_float64"] == "[8.0,9.0]"

autotest/ogr/ogr_parquet.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4684,3 +4684,42 @@ def test_ogr_parquet_arrow_stream_list_of_struct_ignored_fields():
46844684
{"a": 1, "b": 2},
46854685
{"a": 1, "b": 2},
46864686
]
4687+
4688+
4689+
###############################################################################
4690+
4691+
4692+
def test_ogr_parquet_lists_as_string_json():
4693+
4694+
ds = gdal.OpenEx(
4695+
"data/parquet/test.parquet", open_options=["LISTS_AS_STRING_JSON=YES"]
4696+
)
4697+
lyr = ds.GetLayer(0)
4698+
lyr_defn = lyr.GetLayerDefn()
4699+
assert (
4700+
lyr_defn.GetFieldDefn(lyr_defn.GetFieldIndex("list_boolean")).GetType()
4701+
== ogr.OFTString
4702+
)
4703+
assert (
4704+
lyr_defn.GetFieldDefn(lyr_defn.GetFieldIndex("list_boolean")).GetSubType()
4705+
== ogr.OFSTJSON
4706+
)
4707+
assert (
4708+
lyr_defn.GetFieldDefn(
4709+
lyr_defn.GetFieldIndex("fixed_size_list_float64")
4710+
).GetType()
4711+
== ogr.OFTString
4712+
)
4713+
assert (
4714+
lyr_defn.GetFieldDefn(
4715+
lyr_defn.GetFieldIndex("fixed_size_list_float64")
4716+
).GetSubType()
4717+
== ogr.OFSTJSON
4718+
)
4719+
f = lyr.GetFeature(4)
4720+
assert f["list_boolean"] == "[null,false,true,false]"
4721+
assert f["list_uint8"] == "[null,7,8,9]"
4722+
assert f["list_int64"] == "[null,7,8,9]"
4723+
assert f["list_float64"] == "[null,7.5,8.5,9.5]"
4724+
assert f["list_string"] == "[null]"
4725+
assert f["fixed_size_list_float64"] == "[8.0,9.0]"

doc/source/drivers/vector/arrow.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,23 @@ Driver capabilities
4949

5050
.. supports_virtualio::
5151

52+
Open options
53+
------------
54+
55+
|about-open-options|
56+
The following open options are supported:
57+
58+
- .. oo:: LISTS_AS_STRING_JSON
59+
:choices: YES, NO
60+
:default: NO
61+
:since: 3.12.1
62+
63+
Whether lists of strings/booleans/integers/reals should be reported as String(JSON)
64+
fields rather than StringList/IntegerList/Integer64List/RealList fields.
65+
Useful when null values in such lists must be exactly mapped as such,
66+
instead of being omitted (for lists of strings), or set to 0 (for lists of
67+
boolean, integer or real).
68+
5269
Creation issues
5370
---------------
5471

doc/source/drivers/vector/parquet.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,17 @@ The following open options are supported:
5353
The string is typically formatted as CODE:AUTH (e.g "EPSG:4326"), or can
5454
be a PROJ.4 or WKT CRS string.
5555

56+
- .. oo:: LISTS_AS_STRING_JSON
57+
:choices: YES, NO
58+
:default: NO
59+
:since: 3.12.1
60+
61+
Whether lists of strings/booleans/integers/reals should be reported as String(JSON)
62+
fields rather than StringList/IntegerList/Integer64List/RealList fields.
63+
Useful when null values in such lists must be exactly mapped as such,
64+
instead of being omitted (for lists of strings), or set to 0 (for lists of
65+
boolean, integer or real).
66+
5667
Creation issues
5768
---------------
5869

ogr/ogrsf_frmts/arrow/ogr_feather.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,12 +97,14 @@ class OGRFeatherLayer final : public OGRArrowLayer
9797
public:
9898
OGRFeatherLayer(OGRFeatherDataset *poDS, const char *pszLayerName,
9999
std::shared_ptr<arrow::ipc::RecordBatchFileReader>
100-
&poRecordBatchFileReader);
100+
&poRecordBatchFileReader,
101+
CSLConstList papszOpenOptions);
101102
OGRFeatherLayer(OGRFeatherDataset *poDS, const char *pszLayerName,
102103
std::shared_ptr<arrow::io::RandomAccessFile> poFile,
103104
bool bSeekable, const arrow::ipc::IpcReadOptions &oOptions,
104105
std::shared_ptr<arrow::ipc::RecordBatchStreamReader>
105-
&poRecordBatchStreamReader);
106+
&poRecordBatchStreamReader,
107+
CSLConstList papszOpenOptions);
106108

107109
void ResetReading() override;
108110
int TestCapability(const char *pszCap) const override;

ogr/ogrsf_frmts/arrow/ogrfeatherdriver.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ static GDALDataset *OGRFeatherDriverOpen(GDALOpenInfo *poOpenInfo)
234234
osLayername = "layer";
235235
auto poLayer = std::make_unique<OGRFeatherLayer>(
236236
poDS.get(), osLayername.c_str(), infile, bSeekable, options,
237-
poRecordBatchStreamReader);
237+
poRecordBatchStreamReader, poOpenInfo->papszOpenOptions);
238238
poDS->SetLayer(std::move(poLayer));
239239

240240
// Pre-load field domains, as this depends on the first record batch
@@ -270,7 +270,7 @@ static GDALDataset *OGRFeatherDriverOpen(GDALOpenInfo *poOpenInfo)
270270
auto poRecordBatchReader = *result;
271271
auto poLayer = std::make_unique<OGRFeatherLayer>(
272272
poDS.get(), CPLGetBasenameSafe(poOpenInfo->pszFilename).c_str(),
273-
poRecordBatchReader);
273+
poRecordBatchReader, poOpenInfo->papszOpenOptions);
274274
poDS->SetLayer(std::move(poLayer));
275275
}
276276
return poDS.release();

ogr/ogrsf_frmts/arrow/ogrfeatherdrivercore.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,16 @@ void OGRFeatherDriverSetCommonMetadata(GDALDriver *poDriver)
182182
"WidthPrecision Nullable "
183183
"Comment AlternativeName Domain");
184184

185+
poDriver->SetMetadataItem(
186+
GDAL_DMD_OPENOPTIONLIST,
187+
"<OpenOptionList>"
188+
" <Option name='LISTS_AS_STRING_JSON' type='boolean' description='"
189+
"Whether lists of strings/integers/reals should be reported as "
190+
"String(JSON) fields rather than String/Integer[64]/RealList. Useful "
191+
"when null values in such lists must be exactly mapped as such.' "
192+
"default='NO'/>"
193+
"</OpenOptionList>");
194+
185195
poDriver->pfnIdentify = OGRFeatherDriverIdentify;
186196
poDriver->SetMetadataItem(GDAL_DCAP_OPEN, "YES");
187197
poDriver->SetMetadataItem(GDAL_DCAP_CREATE, "YES");

ogr/ogrsf_frmts/arrow/ogrfeatherlayer.cpp

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,12 @@
3333

3434
OGRFeatherLayer::OGRFeatherLayer(
3535
OGRFeatherDataset *poDS, const char *pszLayerName,
36-
std::shared_ptr<arrow::ipc::RecordBatchFileReader> &poRecordBatchFileReader)
37-
: OGRArrowLayer(poDS, pszLayerName), m_poDS(poDS),
38-
m_poRecordBatchFileReader(poRecordBatchFileReader)
36+
std::shared_ptr<arrow::ipc::RecordBatchFileReader> &poRecordBatchFileReader,
37+
CSLConstList papszOpenOptions)
38+
: OGRArrowLayer(poDS, pszLayerName,
39+
CPLTestBool(CSLFetchNameValueDef(
40+
papszOpenOptions, "LISTS_AS_STRING_JSON", "NO"))),
41+
m_poDS(poDS), m_poRecordBatchFileReader(poRecordBatchFileReader)
3942
{
4043
EstablishFeatureDefn();
4144
CPLAssert(static_cast<int>(m_aeGeomEncoding.size()) ==
@@ -51,10 +54,13 @@ OGRFeatherLayer::OGRFeatherLayer(
5154
std::shared_ptr<arrow::io::RandomAccessFile> poFile, bool bSeekable,
5255
const arrow::ipc::IpcReadOptions &oOptions,
5356
std::shared_ptr<arrow::ipc::RecordBatchStreamReader>
54-
&poRecordBatchStreamReader)
55-
: OGRArrowLayer(poDS, pszLayerName), m_poDS(poDS),
56-
m_poFile(std::move(poFile)), m_bSeekable(bSeekable), m_oOptions(oOptions),
57-
m_poRecordBatchReader(poRecordBatchStreamReader)
57+
&poRecordBatchStreamReader,
58+
CSLConstList papszOpenOptions)
59+
: OGRArrowLayer(poDS, pszLayerName,
60+
CPLTestBool(CSLFetchNameValueDef(
61+
papszOpenOptions, "LISTS_AS_STRING_JSON", "NO"))),
62+
m_poDS(poDS), m_poFile(std::move(poFile)), m_bSeekable(bSeekable),
63+
m_oOptions(oOptions), m_poRecordBatchReader(poRecordBatchStreamReader)
5864
{
5965
EstablishFeatureDefn();
6066
CPLAssert(static_cast<int>(m_aeGeomEncoding.size()) ==

ogr/ogrsf_frmts/arrow_common/ogr_arrow.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ class OGRArrowLayer CPL_NON_FINAL
141141

142142
protected:
143143
OGRArrowDataset *m_poArrowDS = nullptr;
144+
const bool m_bListsAsStringJson;
144145
arrow::MemoryPool *m_poMemoryPool = nullptr;
145146
OGRFeatureDefn *m_poFeatureDefn = nullptr;
146147
std::shared_ptr<arrow::Schema> m_poSchema{};
@@ -228,7 +229,8 @@ class OGRArrowLayer CPL_NON_FINAL
228229

229230
void LoadGDALMetadata(const arrow::KeyValueMetadata *kv_metadata);
230231

231-
OGRArrowLayer(OGRArrowDataset *poDS, const char *pszLayerName);
232+
OGRArrowLayer(OGRArrowDataset *poDS, const char *pszLayerName,
233+
bool bListsAsStringJson);
232234

233235
virtual std::string GetDriverUCName() const = 0;
234236
static bool IsIntegerArrowType(arrow::Type::type typeId);

ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,10 @@ inline IOGRArrowLayer::~IOGRArrowLayer() = default;
4343
/************************************************************************/
4444

4545
inline OGRArrowLayer::OGRArrowLayer(OGRArrowDataset *poDS,
46-
const char *pszLayerName)
47-
: m_poArrowDS(poDS), m_poMemoryPool(poDS->GetMemoryPool())
46+
const char *pszLayerName,
47+
bool bListsAsStringJson)
48+
: m_poArrowDS(poDS), m_bListsAsStringJson(bListsAsStringJson),
49+
m_poMemoryPool(poDS->GetMemoryPool())
4850
{
4951
m_poFeatureDefn = new OGRFeatureDefn(pszLayerName);
5052
m_poFeatureDefn->SetGeomType(wkbNone);
@@ -522,6 +524,13 @@ inline bool OGRArrowLayer::MapArrowTypeToOGR(
522524
break;
523525
}
524526
}
527+
528+
if (bTypeOK && m_bListsAsStringJson)
529+
{
530+
eType = OFTString;
531+
eSubType = OFSTJSON;
532+
}
533+
525534
break;
526535
}
527536

@@ -2525,8 +2534,20 @@ inline OGRFeature *OGRArrowLayer::ReadFeature(
25252534
static_cast<const arrow::ListArray *>(array);
25262535
const auto listType = static_cast<const arrow::ListType *>(
25272536
array->data()->type.get());
2528-
ReadList(poFeature, i, nIdxInBatch, castArray,
2529-
listType->value_field()->type()->id());
2537+
2538+
if (m_bListsAsStringJson)
2539+
{
2540+
poFeature->SetField(
2541+
i, GetListAsJSON(castArray,
2542+
static_cast<size_t>(nIdxInBatch))
2543+
.Format(CPLJSONObject::PrettyFormat::Plain)
2544+
.c_str());
2545+
}
2546+
else
2547+
{
2548+
ReadList(poFeature, i, nIdxInBatch, castArray,
2549+
listType->value_field()->type()->id());
2550+
}
25302551
break;
25312552
}
25322553

@@ -2537,8 +2558,20 @@ inline OGRFeature *OGRArrowLayer::ReadFeature(
25372558
const auto listType =
25382559
static_cast<const arrow::FixedSizeListType *>(
25392560
array->data()->type.get());
2540-
ReadList(poFeature, i, nIdxInBatch, castArray,
2541-
listType->value_field()->type()->id());
2561+
2562+
if (m_bListsAsStringJson)
2563+
{
2564+
poFeature->SetField(
2565+
i, GetListAsJSON(castArray,
2566+
static_cast<size_t>(nIdxInBatch))
2567+
.Format(CPLJSONObject::PrettyFormat::Plain)
2568+
.c_str());
2569+
}
2570+
else
2571+
{
2572+
ReadList(poFeature, i, nIdxInBatch, castArray,
2573+
listType->value_field()->type()->id());
2574+
}
25422575
break;
25432576
}
25442577

0 commit comments

Comments
 (0)