del env py
This commit is contained in:
@@ -1,887 +0,0 @@
|
||||
"""Low level functions to read and write OGR data sources."""
|
||||
|
||||
import warnings
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
from pyogrio._compat import HAS_ARROW_API, HAS_ARROW_WRITE_API, HAS_PYARROW
|
||||
from pyogrio._env import GDALEnv
|
||||
from pyogrio.core import detect_write_driver
|
||||
from pyogrio.errors import DataSourceError
|
||||
from pyogrio.util import (
|
||||
_mask_to_wkb,
|
||||
_preprocess_options_key_value,
|
||||
get_vsi_path_or_buffer,
|
||||
vsi_path,
|
||||
)
|
||||
|
||||
with GDALEnv():
|
||||
from pyogrio._io import ogr_open_arrow, ogr_read, ogr_write, ogr_write_arrow
|
||||
from pyogrio._ogr import (
|
||||
_get_driver_metadata_item,
|
||||
get_gdal_version,
|
||||
get_gdal_version_string,
|
||||
ogr_driver_supports_vsi,
|
||||
ogr_driver_supports_write,
|
||||
)
|
||||
|
||||
|
||||
# Drivers for which `write` promotes geometries to their multi variant by
# default when the requested layer geometry type is a "Multi*" type, because
# they do not support mixing singular and multi geometry types in one layer.
DRIVERS_NO_MIXED_SINGLE_MULTI = {"FlatGeobuf", "GPKG"}

# NOTE(review): presumably drivers that cannot mix 2D and 3D geometries in a
# single layer; not referenced in the visible part of this module -- confirm
# against the call sites.
DRIVERS_NO_MIXED_DIMENSIONS = {"FlatGeobuf"}
|
||||
|
||||
|
||||
def read(
    path_or_buffer,
    /,
    layer=None,
    encoding=None,
    columns=None,
    read_geometry=True,
    force_2d=False,
    skip_features=0,
    max_features=None,
    where=None,
    bbox=None,
    mask=None,
    fids=None,
    sql=None,
    sql_dialect=None,
    return_fids=False,
    datetime_as_string=False,
    **kwargs,
):
    """Read an OGR data source into numpy arrays.

    IMPORTANT: non-linear geometry types (e.g., MultiSurface) are converted
    to their linear approximations.

    Parameters
    ----------
    path_or_buffer : pathlib.Path or str, or bytes buffer
        A dataset path or URI, raw buffer, or file-like object with a read
        method.
    layer : int or str, optional (default: first layer)
        An integer is interpreted as the index of the layer within the data
        source; a string must match the name of a layer in the data source.
        Defaults to the first layer in the data source.
    encoding : str, optional (default: None)
        Encoding used to read string values from the data source. By default
        the native encoding is detected automatically and values are decoded
        to ``UTF-8``.
    columns : list-like, optional (default: all columns)
        Names of the columns to import from the data source. Names must match
        the names in the data source exactly, and columns are returned in the
        order they occur in the data source. Pass an empty list-like to avoid
        reading any columns. If combined with the ``where`` parameter, it must
        include the columns referenced in the ``where`` expression, otherwise
        the data may not be read correctly: the data source may return empty
        results or raise an exception (behavior varies by driver).
    read_geometry : bool, optional (default: True)
        If True, geometry is read into WKB. If False, geometry will be None.
    force_2d : bool, optional (default: False)
        If the geometry has Z values, setting this to True causes them to be
        ignored so that 2D geometries are returned.
    skip_features : int, optional (default: 0)
        Number of features to skip from the beginning of the file before
        returning features. If greater than the number of available features,
        an empty result is returned. Using this parameter may incur
        significant overhead if the driver cannot randomly seek to a specific
        feature, because all prior features then have to be iterated over.
    max_features : int, optional (default: None)
        Number of features to read from the file.
    where : str, optional (default: None)
        Where clause to filter features in the layer by attribute values. If
        the data source natively supports SQL, its specific SQL dialect should
        be used (eg. SQLite and GeoPackage: `SQLITE`_, PostgreSQL). If it
        doesn't, the `OGRSQL WHERE`_ syntax should be used. Note that the SQL
        dialect cannot be overruled here; that is only possible through the
        ``sql`` parameter.
        Examples: ``"ISO_A3 = 'CAN'"``,
        ``"POP_EST > 10000000 AND POP_EST < 100000000"``
    bbox : tuple of (xmin, ymin, xmax, ymax), optional (default: None)
        If present, used to filter records whose geometry intersects this
        box. Must be in the same CRS as the dataset. If GEOS is present and
        used by GDAL, only geometries that intersect this bbox are returned;
        if GEOS is not available or not used by GDAL, all geometries with
        bounding boxes that intersect this bbox are returned.
        Cannot be combined with the ``mask`` keyword.
    mask : Shapely geometry, optional (default: None)
        If present, used to filter records whose geometry intersects this
        geometry. Must be in the same CRS as the dataset. If GEOS is present
        and used by GDAL, only geometries that intersect this geometry are
        returned; if GEOS is not available or not used by GDAL, all geometries
        with bounding boxes that intersect the bounding box of this geometry
        are returned. Requires Shapely >= 2.0.
        Cannot be combined with the ``bbox`` keyword.
    fids : array-like, optional (default: None)
        Array of integer feature id (FID) values to select. Cannot be combined
        with other keywords that select a subset (``skip_features``,
        ``max_features``, ``where``, ``bbox``, or ``mask``). Note that the
        starting index is driver and file specific (e.g. typically 0 for
        Shapefile and 1 for GeoPackage, but it can still depend on the
        specific file). The performance of reading a large number of features
        using FIDs is also driver specific.
    sql : str, optional (default: None)
        The SQL statement to execute. See the ``sql_dialect`` parameter for
        more information on the syntax to use for the query. When combined
        with other keywords like ``columns``, ``skip_features``,
        ``max_features``, ``where``, ``bbox``, or ``mask``, those are applied
        after the SQL query. Be aware that this can have an impact on
        performance (e.g. filtering with the ``bbox`` or ``mask`` keywords may
        not use spatial indexes).
        Cannot be combined with the ``layer`` or ``fids`` keywords.
    sql_dialect : str, optional (default: None)
        The SQL dialect the ``sql`` statement is written in. Possible values:

          - **None**: if the data source natively supports SQL, its specific
            SQL dialect is used by default (eg. SQLite and Geopackage:
            `SQLITE`_, PostgreSQL). If the data source doesn't natively
            support SQL, the `OGRSQL`_ dialect is the default.
          - '`OGRSQL`_': can be used on any data source. Performance can
            suffer when used on data sources with native support for SQL.
          - '`SQLITE`_': can be used on any data source. All spatialite_
            functions can be used. Performance can suffer on data sources
            with native support for SQL, except for Geopackage and SQLite as
            this is their native SQL dialect.

    return_fids : bool, optional (default: False)
        If True, the FIDs of the features that were read are returned.
    datetime_as_string : bool, optional (default: False)
        If True, datetime dtypes as detected by GDAL are returned as a string
        array (which can be used to extract timezone info) instead of a
        datetime64 array.

    **kwargs
        Additional driver-specific dataset open options passed to OGR. Invalid
        options will trigger a warning.

    Returns
    -------
    (dict, fids, geometry, data fields)
        A tuple of: meta information about the data source in a dict; an
        ndarray of FIDs corresponding to the features that were read, or None
        (if return_fids is False); an ndarray of geometry objects, or None (if
        the data source does not include geometry or read_geometry is False);
        and a tuple of ndarrays, one for each field in the data layer.

        Meta is: {
            "crs": "<crs>",
            "fields": <ndarray of field names>,
            "dtypes": <ndarray of numpy dtypes corresponding to fields>
            "encoding": "<encoding>",
            "geometry_type": "<geometry type>"
        }

    .. _OGRSQL:

        https://gdal.org/user/ogr_sql_dialect.html#ogr-sql-dialect

    .. _OGRSQL WHERE:

        https://gdal.org/user/ogr_sql_dialect.html#where

    .. _SQLITE:

        https://gdal.org/user/sql_sqlite_dialect.html#sql-sqlite-dialect

    .. _spatialite:

        https://www.gaia-gis.it/gaia-sins/spatialite-sql-latest.html

    """
    # only run the option preprocessing when open options were actually given
    if kwargs:
        dataset_kwargs = _preprocess_options_key_value(kwargs)
    else:
        dataset_kwargs = {}

    source = get_vsi_path_or_buffer(path_or_buffer)
    # ogr_read is always handed an integer: None is forwarded as 0
    feature_limit = max_features or 0
    mask_wkb = _mask_to_wkb(mask)

    return ogr_read(
        source,
        layer=layer,
        encoding=encoding,
        columns=columns,
        read_geometry=read_geometry,
        force_2d=force_2d,
        skip_features=skip_features,
        max_features=feature_limit,
        where=where,
        bbox=bbox,
        mask=mask_wkb,
        fids=fids,
        sql=sql,
        sql_dialect=sql_dialect,
        return_fids=return_fids,
        dataset_kwargs=dataset_kwargs,
        datetime_as_string=datetime_as_string,
    )
|
||||
|
||||
|
||||
def read_arrow(
    path_or_buffer,
    /,
    layer=None,
    encoding=None,
    columns=None,
    read_geometry=True,
    force_2d=False,
    skip_features=0,
    max_features=None,
    where=None,
    bbox=None,
    mask=None,
    fids=None,
    sql=None,
    sql_dialect=None,
    return_fids=False,
    **kwargs,
):
    """Read an OGR data source into a pyarrow Table.

    See the docstring of `read` for the parameters. A ``batch_size`` entry in
    ``kwargs`` is consumed here (default 65_536) and capped at
    ``max_features`` when that is smaller; the remaining keyword arguments are
    forwarded to `open_arrow`.

    Returns
    -------
    (dict, pyarrow.Table)

        A tuple of meta information about the data source in a dict, and a
        pyarrow Table with the data.

        Meta is: {
            "crs": "<crs>",
            "fields": <ndarray of field names>,
            "encoding": "<encoding>",
            "geometry_type": "<geometry_type>",
            "geometry_name": "<name of geometry column in arrow table>",
        }

    Raises
    ------
    RuntimeError
        If pyarrow is not available.
    ValueError
        If ``skip_features`` or ``max_features`` is negative.

    """
    if not HAS_PYARROW:
        raise RuntimeError(
            "pyarrow required to read using 'read_arrow'. You can use 'open_arrow' "
            "to read data with an alternative Arrow implementation"
        )

    from pyarrow import Table

    gdal_version = get_gdal_version()

    if skip_features < 0:
        raise ValueError("'skip_features' must be >= 0")

    if max_features is not None and max_features < 0:
        raise ValueError("'max_features' must be >= 0")

    # never request batches larger than the number of features wanted
    batch_size = kwargs.pop("batch_size", 65_536)
    if max_features is not None and max_features < batch_size:
        batch_size = max_features

    # GDAL >= 3.8.0 skips features itself inside open_arrow; for older
    # versions the skipping is done below by slicing the table
    if gdal_version >= (3, 8, 0):
        gdal_skip_features, skip_features = skip_features, 0
    else:
        gdal_skip_features = 0

    with open_arrow(
        path_or_buffer,
        layer=layer,
        encoding=encoding,
        columns=columns,
        read_geometry=read_geometry,
        force_2d=force_2d,
        where=where,
        bbox=bbox,
        mask=mask,
        fids=fids,
        sql=sql,
        sql_dialect=sql_dialect,
        return_fids=return_fids,
        skip_features=gdal_skip_features,
        batch_size=batch_size,
        use_pyarrow=True,
        **kwargs,
    ) as source:
        meta, reader = source

        if max_features is not None:
            rows_needed = skip_features + max_features
            collected = []
            rows_seen = 0
            # stop pulling batches as soon as enough rows are available
            for batch in reader:
                collected.append(batch)
                rows_seen += len(batch)
                if rows_seen >= rows_needed:
                    break

            # combine_chunks releases the original memory, which may have
            # held more features than requested
            oversized = Table.from_batches(collected, schema=reader.schema)
            table = oversized.slice(skip_features, max_features).combine_chunks()

        elif skip_features > 0:
            everything = reader.read_all()
            table = everything.slice(skip_features).combine_chunks()

        else:
            table = reader.read_all()

    return meta, table
|
||||
|
||||
|
||||
def open_arrow(
    path_or_buffer,
    /,
    layer=None,
    encoding=None,
    columns=None,
    read_geometry=True,
    force_2d=False,
    skip_features=0,
    max_features=None,
    where=None,
    bbox=None,
    mask=None,
    fids=None,
    sql=None,
    sql_dialect=None,
    return_fids=False,
    batch_size=65_536,
    use_pyarrow=False,
    **kwargs,
):
    """Open an OGR data source as a stream of Arrow record batches.

    See the docstring of `read` for the parameters.

    The returned object reads from a stream provided by OGR and must not be
    accessed after the OGR dataset has been closed, i.e. after the context
    manager has been closed.

    By default this function returns a generic stream object implementing
    the `Arrow PyCapsule Protocol`_ (i.e. having an ``__arrow_c_stream__``
    method). This object can then be consumed by any Arrow implementation
    that supports this protocol.
    Optionally, specify ``use_pyarrow=True`` to get the stream directly as a
    `pyarrow.RecordBatchReader`.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html

    Other Parameters
    ----------------
    batch_size : int (default: 65_536)
        Maximum number of features to retrieve in a batch.
    use_pyarrow : bool (default: False)
        If True, return a pyarrow RecordBatchReader instead of a generic
        ArrowStream object. In the default case, the stream object needs to
        be passed to another library supporting the Arrow PyCapsule Protocol
        to consume the stream of data.

    Examples
    --------
    >>> from pyogrio.raw import open_arrow
    >>> import pyarrow as pa
    >>> import shapely
    >>>
    >>> with open_arrow(path) as source:
    ...     meta, stream = source
    ...     # wrap the arrow stream object in a pyarrow RecordBatchReader
    ...     reader = pa.RecordBatchReader.from_stream(stream)
    ...     geom_col = meta["geometry_name"] or "wkb_geometry"
    ...     for batch in reader:
    ...         geometries = shapely.from_wkb(batch[geom_col])

    The returned `stream` object needs to be consumed by a library
    implementing the Arrow PyCapsule Protocol. In the example above, pyarrow
    is used through its RecordBatchReader. For this case, ``use_pyarrow=True``
    gives the same result as a short-cut:

    >>> with open_arrow(path, use_pyarrow=True) as source:
    ...     meta, reader = source
    ...     geom_col = meta["geometry_name"] or "wkb_geometry"
    ...     for batch in reader:
    ...         geometries = shapely.from_wkb(batch[geom_col])

    Returns
    -------
    (dict, pyarrow.RecordBatchReader or ArrowStream)

        A tuple of meta information about the data source in a dict, and a
        data stream object (a generic ArrowStream object, or a pyarrow
        RecordBatchReader if `use_pyarrow` is set to True).

        Meta is: {
            "crs": "<crs>",
            "fields": <ndarray of field names>,
            "encoding": "<encoding>",
            "geometry_type": "<geometry_type>",
            "geometry_name": "<name of geometry column in arrow table>",
        }

    Raises
    ------
    RuntimeError
        If the Arrow read API is not available (``HAS_ARROW_API`` is false).

    """
    if not HAS_ARROW_API:
        raise RuntimeError("GDAL>= 3.6 required to read using arrow")

    # only run the option preprocessing when open options were actually given
    if kwargs:
        dataset_kwargs = _preprocess_options_key_value(kwargs)
    else:
        dataset_kwargs = {}

    source = get_vsi_path_or_buffer(path_or_buffer)
    # ogr_open_arrow is always handed an integer: None is forwarded as 0
    feature_limit = max_features or 0
    mask_wkb = _mask_to_wkb(mask)

    return ogr_open_arrow(
        source,
        layer=layer,
        encoding=encoding,
        columns=columns,
        read_geometry=read_geometry,
        force_2d=force_2d,
        skip_features=skip_features,
        max_features=feature_limit,
        where=where,
        bbox=bbox,
        mask=mask_wkb,
        fids=fids,
        sql=sql,
        sql_dialect=sql_dialect,
        return_fids=return_fids,
        dataset_kwargs=dataset_kwargs,
        batch_size=batch_size,
        use_pyarrow=use_pyarrow,
    )
|
||||
|
||||
|
||||
def _parse_options_names(xml):
|
||||
"""Convert metadata xml to list of names."""
|
||||
# Based on Fiona's meta.py
|
||||
# (https://github.com/Toblerity/Fiona/blob/91c13ad8424641557a4e5f038f255f9b657b1bc5/fiona/meta.py)
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
options = []
|
||||
if xml:
|
||||
root = ET.fromstring(xml)
|
||||
for option in root.iter("Option"):
|
||||
# some options explicitly have scope='raster'
|
||||
if option.attrib.get("scope", "vector") != "raster":
|
||||
options.append(option.attrib["name"])
|
||||
|
||||
return options
|
||||
|
||||
|
||||
def _validate_metadata(dataset_metadata, layer_metadata, metadata):
|
||||
"""Validate the metadata."""
|
||||
if metadata is not None:
|
||||
if layer_metadata is not None:
|
||||
raise ValueError("Cannot pass both metadata and layer_metadata")
|
||||
layer_metadata = metadata
|
||||
|
||||
# validate metadata types
|
||||
for meta in [dataset_metadata, layer_metadata]:
|
||||
if meta is not None:
|
||||
for k, v in meta.items():
|
||||
if not isinstance(k, str):
|
||||
raise ValueError(f"metadata key {k} must be a string")
|
||||
|
||||
if not isinstance(v, str):
|
||||
raise ValueError(f"metadata value {v} must be a string")
|
||||
|
||||
return dataset_metadata, layer_metadata
|
||||
|
||||
|
||||
def _preprocess_options_kwargs(driver, dataset_options, layer_options, kwargs):
    """Preprocess options and split kwargs in dataset and layer creation options.

    The explicit ``dataset_options`` and ``layer_options`` are preprocessed
    as-is. Each entry of ``kwargs`` is assigned to the dataset options if its
    name is listed in the driver's ``DMD_CREATIONOPTIONLIST`` metadata, else
    to the layer options if it is listed in ``DS_LAYER_CREATIONOPTIONLIST``.

    Returns
    -------
    (dataset_kwargs, layer_kwargs)

    Raises
    ------
    ValueError
        If an entry of ``kwargs`` is neither a known dataset nor a known layer
        creation option for ``driver``.

    """
    dataset_kwargs = _preprocess_options_key_value(dataset_options or {})
    layer_kwargs = _preprocess_options_key_value(layer_options or {})

    if not kwargs:
        return dataset_kwargs, layer_kwargs

    extra_options = _preprocess_options_key_value(kwargs)
    known_dataset_names = _parse_options_names(
        _get_driver_metadata_item(driver, "DMD_CREATIONOPTIONLIST")
    )
    known_layer_names = _parse_options_names(
        _get_driver_metadata_item(driver, "DS_LAYER_CREATIONOPTIONLIST")
    )

    # a name known as both dataset and layer option goes to the dataset options
    for name, value in extra_options.items():
        if name in known_dataset_names:
            dataset_kwargs[name] = value
        elif name in known_layer_names:
            layer_kwargs[name] = value
        else:
            raise ValueError(f"unrecognized option '{name}' for driver '{driver}'")

    return dataset_kwargs, layer_kwargs
|
||||
|
||||
|
||||
def _get_write_path_driver(path, driver, append=False):
|
||||
"""Validate and return path and driver.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str or io.BytesIO
|
||||
path to output file on writeable file system or an io.BytesIO object to
|
||||
allow writing to memory. Will raise NotImplementedError if an open file
|
||||
handle is passed.
|
||||
driver : str, optional (default: None)
|
||||
The OGR format driver used to write the vector file. By default attempts
|
||||
to infer driver from path. Must be provided to write to a file-like
|
||||
object.
|
||||
append : bool, optional (default: False)
|
||||
True if path and driver is being tested for append support
|
||||
|
||||
Returns
|
||||
-------
|
||||
(path, driver)
|
||||
|
||||
"""
|
||||
if isinstance(path, BytesIO):
|
||||
if driver is None:
|
||||
raise ValueError("driver must be provided to write to in-memory file")
|
||||
|
||||
# blacklist certain drivers known not to work in current memory implementation
|
||||
# because they create multiple files
|
||||
if driver in {"ESRI Shapefile", "OpenFileGDB"}:
|
||||
raise ValueError(f"writing to in-memory file is not supported for {driver}")
|
||||
|
||||
# verify that driver supports VSI methods
|
||||
if not ogr_driver_supports_vsi(driver):
|
||||
raise DataSourceError(
|
||||
f"{driver} does not support ability to write in-memory in GDAL "
|
||||
f"{get_gdal_version_string()}"
|
||||
)
|
||||
|
||||
if append:
|
||||
raise NotImplementedError("append is not supported for in-memory files")
|
||||
|
||||
elif hasattr(path, "write") and not isinstance(path, Path):
|
||||
raise NotImplementedError(
|
||||
"writing to an open file handle is not yet supported; instead, write to a "
|
||||
"BytesIO instance and then read bytes from that to write to the file handle"
|
||||
)
|
||||
|
||||
else:
|
||||
path = vsi_path(path)
|
||||
|
||||
if driver is None:
|
||||
driver = detect_write_driver(path)
|
||||
|
||||
# verify that driver supports writing
|
||||
if not ogr_driver_supports_write(driver):
|
||||
raise DataSourceError(
|
||||
f"{driver} does not support write functionality in GDAL "
|
||||
f"{get_gdal_version_string()}"
|
||||
)
|
||||
|
||||
# prevent segfault from: https://github.com/OSGeo/gdal/issues/5739
|
||||
if append and driver == "FlatGeobuf" and get_gdal_version() <= (3, 5, 0):
|
||||
raise RuntimeError(
|
||||
"append to FlatGeobuf is not supported for GDAL <= 3.5.0 due to segfault"
|
||||
)
|
||||
|
||||
return path, driver
|
||||
|
||||
|
||||
def write(
    path,
    geometry,
    field_data,
    fields,
    field_mask=None,
    layer=None,
    driver=None,
    # derived from meta if roundtrip
    geometry_type=None,
    crs=None,
    encoding=None,
    promote_to_multi=None,
    nan_as_null=True,
    append=False,
    dataset_metadata=None,
    layer_metadata=None,
    metadata=None,
    dataset_options=None,
    layer_options=None,
    gdal_tz_offsets=None,
    **kwargs,
):
    """Write geometry and field data to an OGR file format.

    Parameters
    ----------
    path : str or io.BytesIO
        path to output file on writeable file system or an io.BytesIO object to
        allow writing to memory. Will raise NotImplementedError if an open file
        handle is passed; use BytesIO instead.
        NOTE: support for writing to memory is limited to specific drivers.
    geometry : ndarray of WKB encoded geometries or None
        If None, geometries will not be written to output file
    field_data : list-like of shape (num_fields, num_records)
        contains one record per field to be written in same order as fields
    fields : list-like
        contains field names
    field_mask : list-like of ndarrays or None, optional (default: None)
        contains mask arrays indicating null values of the field at the same
        position in the outer list, or None to indicate field does not have
        a mask array
    layer : str, optional (default: None)
        layer name to create. If writing to memory and layer name is not
        provided, the layer name will be set to a UUID4 value.
    driver : string, optional (default: None)
        The OGR format driver used to write the vector file. By default attempts
        to infer driver from path. Must be provided to write to memory.
    geometry_type : str, optional (default: None)
        Possible values are: "Unknown", "Point", "LineString", "Polygon",
        "MultiPoint", "MultiLineString", "MultiPolygon" or "GeometryCollection".

        This parameter does not modify the geometry, but it will try to force
        the layer type of the output file to this value. Use this parameter with
        caution because using a wrong layer geometry type may result in errors
        when writing the file, may be ignored by the driver, or may result in
        invalid files.
    crs : str, optional (default: None)
        WKT-encoded CRS of the geometries to be written. A warning is issued
        if geometries are written without a CRS.
    encoding : str, optional (default: None)
        If present, will be used as the encoding for writing string values to
        the file. Use with caution, only certain drivers support encodings
        other than UTF-8.
    promote_to_multi : bool, optional (default: None)
        If True, will convert singular geometry types in the data to their
        corresponding multi geometry type for writing. By default (None),
        promotion is enabled only when ``geometry_type`` is a "Multi*" type
        and the driver does not support mixed singular and multi geometry
        types (see ``DRIVERS_NO_MIXED_SINGLE_MULTI``); if ``geometry_type`` is
        not given, no promotion is done. If False, geometry types will not be
        promoted, which may result in errors or invalid files when attempting
        to write mixed singular and multi geometry types to drivers that do
        not support such combinations.
    nan_as_null : bool, default True
        For floating point columns (float32 / float64), whether NaN values are
        written as "null" (missing value). Defaults to True because in pandas
        NaNs are typically used as missing value. Note that when set to False,
        behaviour is format specific: some formats don't support NaNs by
        default (e.g. GeoJSON will skip this property) or might treat them as
        null anyway (e.g. GeoPackage).
    append : bool, optional (default: False)
        If True, the data source specified by path already exists, and the
        driver supports appending to an existing data source, will cause the
        data to be appended to the existing records in the data source. Not
        supported for writing to in-memory files.
        NOTE: append support is limited to specific drivers and GDAL versions.
    dataset_metadata : dict, optional (default: None)
        Metadata to be stored at the dataset level in the output file; limited
        to drivers that support writing metadata, such as GPKG, and silently
        ignored otherwise. Keys and values must be strings.
    layer_metadata : dict, optional (default: None)
        Metadata to be stored at the layer level in the output file; limited to
        drivers that support writing metadata, such as GPKG, and silently
        ignored otherwise. Keys and values must be strings.
    metadata : dict, optional (default: None)
        alias of layer_metadata
    dataset_options : dict, optional
        Dataset creation options (format specific) passed to OGR. Specify as
        a key-value dictionary.
    layer_options : dict, optional
        Layer creation options (format specific) passed to OGR. Specify as
        a key-value dictionary.
    gdal_tz_offsets : dict, optional (default: None)
        Used to handle GDAL timezone offsets for each field contained in dict.
    **kwargs
        Additional driver-specific dataset or layer creation options passed to
        OGR. Each keyword is passed as dataset or as layer creation option
        based on the known options of the driver; a keyword that is neither
        raises a ValueError. A ``dtypes`` keyword is accepted and ignored.

    Raises
    ------
    ValueError
        If both ``metadata`` and ``layer_metadata`` are given, if metadata
        keys or values are not strings, or if a keyword in ``kwargs`` is not a
        recognized creation option for the driver.

    """
    # if dtypes is given, remove it from kwargs (dtypes is included in meta returned by
    # read, and it is convenient to pass meta directly into write for round trip tests)
    kwargs.pop("dtypes", None)

    path, driver = _get_write_path_driver(path, driver, append=append)

    dataset_metadata, layer_metadata = _validate_metadata(
        dataset_metadata, layer_metadata, metadata
    )

    if geometry is not None and promote_to_multi is None:
        # geometry_type defaults to None; guard against it so that the default
        # call does not fail with an AttributeError on None.startswith
        promote_to_multi = (
            geometry_type is not None
            and geometry_type.startswith("Multi")
            and driver in DRIVERS_NO_MIXED_SINGLE_MULTI
        )

    if geometry is not None and crs is None:
        warnings.warn(
            "'crs' was not provided.  The output dataset will not have "
            "projection information defined and may not be usable in other "
            "systems.",
            stacklevel=2,
        )

    # preprocess kwargs and split in dataset and layer creation options
    dataset_kwargs, layer_kwargs = _preprocess_options_kwargs(
        driver, dataset_options, layer_options, kwargs
    )

    ogr_write(
        path,
        layer=layer,
        driver=driver,
        geometry=geometry,
        geometry_type=geometry_type,
        field_data=field_data,
        field_mask=field_mask,
        fields=fields,
        crs=crs,
        encoding=encoding,
        promote_to_multi=promote_to_multi,
        nan_as_null=nan_as_null,
        append=append,
        dataset_metadata=dataset_metadata,
        layer_metadata=layer_metadata,
        dataset_kwargs=dataset_kwargs,
        layer_kwargs=layer_kwargs,
        gdal_tz_offsets=gdal_tz_offsets,
    )
|
||||
|
||||
|
||||
def write_arrow(
    arrow_obj,
    path,
    layer=None,
    driver=None,
    geometry_name=None,
    geometry_type=None,
    crs=None,
    encoding=None,
    append=False,
    dataset_metadata=None,
    layer_metadata=None,
    metadata=None,
    dataset_options=None,
    layer_options=None,
    **kwargs,
):
    """Write an Arrow-compatible data source to an OGR file format.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html

    Parameters
    ----------
    arrow_obj
        The Arrow data to write. This can be any Arrow-compatible tabular data
        object that implements the `Arrow PyCapsule Protocol`_ (i.e. has an
        ``__arrow_c_stream__`` method), for example a pyarrow Table or
        RecordBatchReader.
    path : str or io.BytesIO
        path to output file on writeable file system or an io.BytesIO object to
        allow writing to memory
        NOTE: support for writing to memory is limited to specific drivers.
    layer : str, optional (default: None)
        layer name to create. If writing to memory and layer name is not
        provided, the layer name will be set to a UUID4 value.
    driver : string, optional (default: None)
        The OGR format driver used to write the vector file. By default attempts
        to infer driver from path. Must be provided to write to memory.
    geometry_name : str, optional (default: None)
        The name of the column in the input data that will be written as the
        geometry field. Will be inferred from the input data if the geometry
        column is annotated as a "geoarrow.wkb" or "ogc.wkb" extension type.
        Otherwise needs to be specified explicitly.
    geometry_type : str
        The geometry type of the written layer. Currently, this needs to be
        specified explicitly when creating a new layer with geometries.
        Possible values are: "Unknown", "Point", "LineString", "Polygon",
        "MultiPoint", "MultiLineString", "MultiPolygon" or "GeometryCollection".

        This parameter does not modify the geometry, but it will try to force
        the layer type of the output file to this value. Use this parameter
        with caution because using a wrong layer geometry type may result in
        errors when writing the file, may be ignored by the driver, or may
        result in invalid files.
    crs : str, optional (default: None)
        WKT-encoded CRS of the geometries to be written.
    encoding : str, optional (default: None)
        Only used for the .dbf file of ESRI Shapefiles. If not specified,
        uses the default locale.
    append : bool, optional (default: False)
        If True, the data source specified by path already exists, and the
        driver supports appending to an existing data source, will cause the
        data to be appended to the existing records in the data source. Not
        supported for writing to in-memory files.
        NOTE: append support is limited to specific drivers and GDAL versions.
    dataset_metadata : dict, optional (default: None)
        Metadata to be stored at the dataset level in the output file; limited
        to drivers that support writing metadata, such as GPKG, and silently
        ignored otherwise. Keys and values must be strings.
    layer_metadata : dict, optional (default: None)
        Metadata to be stored at the layer level in the output file; limited to
        drivers that support writing metadata, such as GPKG, and silently
        ignored otherwise. Keys and values must be strings.
    metadata : dict, optional (default: None)
        alias of layer_metadata
    dataset_options : dict, optional
        Dataset creation options (format specific) passed to OGR. Specify as
        a key-value dictionary.
    layer_options : dict, optional
        Layer creation options (format specific) passed to OGR. Specify as
        a key-value dictionary.
    **kwargs
        Additional driver-specific dataset or layer creation options passed
        to OGR. pyogrio will attempt to automatically pass those keywords
        either as dataset or as layer creation option based on the known
        options for the specific driver. Alternatively, you can use the
        explicit `dataset_options` or `layer_options` keywords to manually
        do this (for example if an option exists as both dataset and layer
        option).

    Raises
    ------
    RuntimeError
        If the Arrow write API is not available (``HAS_ARROW_WRITE_API`` is
        false).
    ValueError
        If ``arrow_obj`` has no ``__arrow_c_stream__`` method, if
        ``promote_to_multi`` is passed, or if ``geometry_name`` is given
        without ``geometry_type``.

    """
    if not HAS_ARROW_WRITE_API:
        raise RuntimeError("GDAL>=3.8 required to write using arrow")

    implements_capsule_protocol = hasattr(arrow_obj, "__arrow_c_stream__")
    if not implements_capsule_protocol:
        raise ValueError(
            "The provided data is not recognized as Arrow data. The object "
            "should implement the Arrow PyCapsule Protocol (i.e. have a "
            "'__arrow_c_stream__' method)."
        )

    path, driver = _get_write_path_driver(path, driver, append=append)

    if "promote_to_multi" in kwargs:
        raise ValueError(
            "The 'promote_to_multi' option is not supported when writing using Arrow"
        )

    # the geometry checks only apply when a geometry column is named explicitly
    has_geometry_column = geometry_name is not None

    if has_geometry_column and geometry_type is None:
        raise ValueError("'geometry_type' keyword is required")

    if has_geometry_column and crs is None:
        # TODO: does GDAL infer CRS automatically from geometry metadata?
        warnings.warn(
            "'crs' was not provided.  The output dataset will not have "
            "projection information defined and may not be usable in other "
            "systems.",
            stacklevel=2,
        )

    dataset_metadata, layer_metadata = _validate_metadata(
        dataset_metadata, layer_metadata, metadata
    )

    # split the extra keywords in dataset and layer creation options
    dataset_kwargs, layer_kwargs = _preprocess_options_kwargs(
        driver, dataset_options, layer_options, kwargs
    )

    ogr_write_arrow(
        path,
        layer=layer,
        driver=driver,
        arrow_obj=arrow_obj,
        geometry_type=geometry_type,
        geometry_name=geometry_name,
        crs=crs,
        encoding=encoding,
        append=append,
        dataset_metadata=dataset_metadata,
        layer_metadata=layer_metadata,
        dataset_kwargs=dataset_kwargs,
        layer_kwargs=layer_kwargs,
    )
|
||||
Reference in New Issue
Block a user