venv
This commit is contained in:
@@ -1,24 +1,24 @@
|
||||
"""Functions for reading and writing GeoPandas dataframes."""
|
||||
|
||||
import os
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pyogrio._compat import HAS_GEOPANDAS, PANDAS_GE_15, PANDAS_GE_20
|
||||
from pyogrio._compat import HAS_GEOPANDAS, PANDAS_GE_15, PANDAS_GE_20, PANDAS_GE_22
|
||||
from pyogrio.errors import DataSourceError
|
||||
from pyogrio.raw import (
|
||||
DRIVERS_NO_MIXED_SINGLE_MULTI,
|
||||
DRIVERS_NO_MIXED_DIMENSIONS,
|
||||
detect_write_driver,
|
||||
DRIVERS_NO_MIXED_SINGLE_MULTI,
|
||||
_get_write_path_driver,
|
||||
read,
|
||||
read_arrow,
|
||||
write,
|
||||
)
|
||||
from pyogrio.errors import DataSourceError
|
||||
import warnings
|
||||
|
||||
|
||||
def _stringify_path(path):
|
||||
"""
|
||||
Convert path-like to a string if possible, pass-through other objects
|
||||
"""
|
||||
"""Convert path-like to a string if possible, pass-through other objects."""
|
||||
if isinstance(path, str):
|
||||
return path
|
||||
|
||||
@@ -33,10 +33,12 @@ def _stringify_path(path):
|
||||
def _try_parse_datetime(ser):
|
||||
import pandas as pd # only called when pandas is known to be installed
|
||||
|
||||
if PANDAS_GE_20:
|
||||
datetime_kwargs = dict(format="ISO8601", errors="ignore")
|
||||
if PANDAS_GE_22:
|
||||
datetime_kwargs = {"format": "ISO8601"}
|
||||
elif PANDAS_GE_20:
|
||||
datetime_kwargs = {"format": "ISO8601", "errors": "ignore"}
|
||||
else:
|
||||
datetime_kwargs = dict(yearfirst=True)
|
||||
datetime_kwargs = {"yearfirst": True}
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings(
|
||||
"ignore",
|
||||
@@ -48,10 +50,13 @@ def _try_parse_datetime(ser):
|
||||
try:
|
||||
res = pd.to_datetime(ser, **datetime_kwargs)
|
||||
except Exception:
|
||||
pass
|
||||
res = ser
|
||||
# if object dtype, try parse as utc instead
|
||||
if res.dtype == "object":
|
||||
res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
|
||||
try:
|
||||
res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if res.dtype != "object":
|
||||
# GDAL only supports ms precision, convert outputs to match.
|
||||
@@ -82,10 +87,12 @@ def read_dataframe(
|
||||
sql_dialect=None,
|
||||
fid_as_index=False,
|
||||
use_arrow=None,
|
||||
on_invalid="raise",
|
||||
arrow_to_pandas_kwargs=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Read from an OGR data source to a GeoPandas GeoDataFrame or Pandas DataFrame.
|
||||
|
||||
If the data source does not have a geometry column or ``read_geometry`` is False,
|
||||
a DataFrame will be returned.
|
||||
|
||||
@@ -94,20 +101,23 @@ def read_dataframe(
|
||||
Parameters
|
||||
----------
|
||||
path_or_buffer : pathlib.Path or str, or bytes buffer
|
||||
A dataset path or URI, or raw buffer.
|
||||
A dataset path or URI, raw buffer, or file-like object with a read method.
|
||||
layer : int or str, optional (default: first layer)
|
||||
If an integer is provided, it corresponds to the index of the layer
|
||||
with the data source. If a string is provided, it must match the name
|
||||
of the layer in the data source. Defaults to first layer in data source.
|
||||
encoding : str, optional (default: None)
|
||||
If present, will be used as the encoding for reading string values from
|
||||
the data source, unless encoding can be inferred directly from the data
|
||||
source.
|
||||
the data source. By default will automatically try to detect the native
|
||||
encoding and decode to ``UTF-8``.
|
||||
columns : list-like, optional (default: all columns)
|
||||
List of column names to import from the data source. Column names must
|
||||
exactly match the names in the data source, and will be returned in
|
||||
the order they occur in the data source. To avoid reading any columns,
|
||||
pass an empty list-like.
|
||||
pass an empty list-like. If combined with ``where`` parameter, must
|
||||
include columns referenced in the ``where`` expression or the data may
|
||||
not be correctly read; the data source may return empty results or
|
||||
raise an exception (behavior varies by driver).
|
||||
read_geometry : bool, optional (default: True)
|
||||
If True, will read geometry into a GeoSeries. If False, a Pandas DataFrame
|
||||
will be returned instead.
|
||||
@@ -152,7 +162,12 @@ def read_dataframe(
|
||||
the starting index is driver and file specific (e.g. typically 0 for
|
||||
Shapefile and 1 for GeoPackage, but can still depend on the specific
|
||||
file). The performance of reading a large number of features using FIDs
|
||||
is also driver specific.
|
||||
is also driver specific and depends on the value of ``use_arrow``. The order
|
||||
of the rows returned is undefined. If you would like to sort based on FID, use
|
||||
``fid_as_index=True`` to have the index of the GeoDataFrame returned set to the
|
||||
FIDs of the features read. If ``use_arrow=True``, the number of FIDs is limited
|
||||
to 4997 for drivers with 'OGRSQL' as default SQL dialect. To read a larger
|
||||
number of FIDs, set ``use_arrow=False``.
|
||||
sql : str, optional (default: None)
|
||||
The SQL statement to execute. Look at the sql_dialect parameter for more
|
||||
information on the syntax to use for the query. When combined with other
|
||||
@@ -184,6 +199,17 @@ def read_dataframe(
|
||||
installed). When enabled, this provides a further speed-up.
|
||||
Defaults to False, but this default can also be globally overridden
|
||||
by setting the ``PYOGRIO_USE_ARROW=1`` environment variable.
|
||||
on_invalid : str, optional (default: "raise")
|
||||
The action to take when an invalid geometry is encountered. Possible
|
||||
values:
|
||||
|
||||
- **raise**: an exception will be raised if a WKB input geometry is
|
||||
invalid.
|
||||
- **warn**: invalid WKB geometries will be returned as ``None`` and a
|
||||
warning will be raised.
|
||||
- **ignore**: invalid WKB geometries will be returned as ``None``
|
||||
without a warning.
|
||||
|
||||
arrow_to_pandas_kwargs : dict, optional (default: None)
|
||||
When `use_arrow` is True, these kwargs will be passed to the `to_pandas`_
|
||||
call for the arrow to pandas conversion.
|
||||
@@ -215,13 +241,13 @@ def read_dataframe(
|
||||
|
||||
https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pandas
|
||||
|
||||
""" # noqa: E501
|
||||
"""
|
||||
if not HAS_GEOPANDAS:
|
||||
raise ImportError("geopandas is required to use pyogrio.read_dataframe()")
|
||||
|
||||
import pandas as pd
|
||||
import geopandas as gp
|
||||
from geopandas.array import from_wkb
|
||||
import pandas as pd
|
||||
|
||||
import shapely # if geopandas is present, shapely is expected to be present
|
||||
|
||||
path_or_buffer = _stringify_path(path_or_buffer)
|
||||
@@ -279,10 +305,10 @@ def read_dataframe(
|
||||
if PANDAS_GE_15 and wkb_values.dtype != object:
|
||||
# for example ArrowDtype will otherwise create numpy array with pd.NA
|
||||
wkb_values = wkb_values.to_numpy(na_value=None)
|
||||
df["geometry"] = from_wkb(wkb_values, crs=meta["crs"])
|
||||
df["geometry"] = shapely.from_wkb(wkb_values, on_invalid=on_invalid)
|
||||
if force_2d:
|
||||
df["geometry"] = shapely.force_2d(df["geometry"])
|
||||
return gp.GeoDataFrame(df, geometry="geometry")
|
||||
return gp.GeoDataFrame(df, geometry="geometry", crs=meta["crs"])
|
||||
else:
|
||||
return df
|
||||
|
||||
@@ -302,9 +328,9 @@ def read_dataframe(
|
||||
if geometry is None or not read_geometry:
|
||||
return df
|
||||
|
||||
geometry = from_wkb(geometry, crs=meta["crs"])
|
||||
geometry = shapely.from_wkb(geometry, on_invalid=on_invalid)
|
||||
|
||||
return gp.GeoDataFrame(df, geometry=geometry)
|
||||
return gp.GeoDataFrame(df, geometry=geometry, crs=meta["crs"])
|
||||
|
||||
|
||||
# TODO: handle index properly
|
||||
@@ -318,6 +344,7 @@ def write_dataframe(
|
||||
promote_to_multi=None,
|
||||
nan_as_null=True,
|
||||
append=False,
|
||||
use_arrow=None,
|
||||
dataset_metadata=None,
|
||||
layer_metadata=None,
|
||||
metadata=None,
|
||||
@@ -325,8 +352,7 @@ def write_dataframe(
|
||||
layer_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Write GeoPandas GeoDataFrame to an OGR file format.
|
||||
"""Write GeoPandas GeoDataFrame to an OGR file format.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -335,16 +361,21 @@ def write_dataframe(
|
||||
all values will be converted to strings to be written to the
|
||||
output file, except None and np.nan, which will be set to NULL
|
||||
in the output file.
|
||||
path : str
|
||||
path to file
|
||||
layer :str, optional (default: None)
|
||||
layer name
|
||||
path : str or io.BytesIO
|
||||
path to output file on writeable file system or an io.BytesIO object to
|
||||
allow writing to memory. Will raise NotImplementedError if an open file
|
||||
handle is passed; use BytesIO instead.
|
||||
NOTE: support for writing to memory is limited to specific drivers.
|
||||
layer : str, optional (default: None)
|
||||
layer name to create. If writing to memory and layer name is not
|
||||
provided, the layer name will be set to a UUID4 value.
|
||||
driver : string, optional (default: None)
|
||||
The OGR format driver used to write the vector file. By default write_dataframe
|
||||
attempts to infer driver from path.
|
||||
The OGR format driver used to write the vector file. By default attempts
|
||||
to infer driver from path. Must be provided to write to memory.
|
||||
encoding : str, optional (default: None)
|
||||
If present, will be used as the encoding for writing string values to
|
||||
the file.
|
||||
the file. Use with caution, only certain drivers support encodings
|
||||
other than UTF-8.
|
||||
geometry_type : string, optional (default: None)
|
||||
By default, the geometry type of the layer will be inferred from the
|
||||
data, after applying the promote_to_multi logic. If the data only contains a
|
||||
@@ -376,8 +407,17 @@ def write_dataframe(
|
||||
append : bool, optional (default: False)
|
||||
If True, the data source specified by path already exists, and the
|
||||
driver supports appending to an existing data source, will cause the
|
||||
data to be appended to the existing records in the data source.
|
||||
data to be appended to the existing records in the data source. Not
|
||||
supported for writing to in-memory files.
|
||||
NOTE: append support is limited to specific drivers and GDAL versions.
|
||||
use_arrow : bool, optional (default: False)
|
||||
Whether to use Arrow as the transfer mechanism of the data to write
|
||||
from Python to GDAL (requires GDAL >= 3.8 and `pyarrow` to be
|
||||
installed). When enabled, this provides a further speed-up.
|
||||
Defaults to False, but this default can also be globally overridden
|
||||
by setting the ``PYOGRIO_USE_ARROW=1`` environment variable.
|
||||
Using Arrow does not support writing an object-dtype column with
|
||||
mixed types.
|
||||
dataset_metadata : dict, optional (default: None)
|
||||
Metadata to be stored at the dataset level in the output file; limited
|
||||
to drivers that support writing metadata, such as GPKG, and silently
|
||||
@@ -389,10 +429,10 @@ def write_dataframe(
|
||||
metadata : dict, optional (default: None)
|
||||
alias of layer_metadata
|
||||
dataset_options : dict, optional
|
||||
Dataset creation option (format specific) passed to OGR. Specify as
|
||||
Dataset creation options (format specific) passed to OGR. Specify as
|
||||
a key-value dictionary.
|
||||
layer_options : dict, optional
|
||||
Layer creation option (format specific) passed to OGR. Specify as
|
||||
Layer creation options (format specific) passed to OGR. Specify as
|
||||
a key-value dictionary.
|
||||
**kwargs
|
||||
Additional driver-specific dataset or layer creation options passed
|
||||
@@ -402,23 +442,22 @@ def write_dataframe(
|
||||
explicit `dataset_options` or `layer_options` keywords to manually
|
||||
do this (for example if an option exists as both dataset and layer
|
||||
option).
|
||||
|
||||
"""
|
||||
# TODO: add examples to the docstring (e.g. OGR kwargs)
|
||||
|
||||
if not HAS_GEOPANDAS:
|
||||
raise ImportError("geopandas is required to use pyogrio.write_dataframe()")
|
||||
|
||||
from geopandas.array import to_wkb
|
||||
import pandas as pd
|
||||
from pyproj.enums import WktVersion # if geopandas is available so is pyproj
|
||||
|
||||
path = str(path)
|
||||
from geopandas.array import to_wkb
|
||||
|
||||
if not isinstance(df, pd.DataFrame):
|
||||
raise ValueError("'df' must be a DataFrame or GeoDataFrame")
|
||||
|
||||
if driver is None:
|
||||
driver = detect_write_driver(path)
|
||||
if use_arrow is None:
|
||||
use_arrow = bool(int(os.environ.get("PYOGRIO_USE_ARROW", "0")))
|
||||
path, driver = _get_write_path_driver(path, driver, append=append)
|
||||
|
||||
geometry_columns = df.columns[df.dtypes == "geometry"]
|
||||
if len(geometry_columns) > 1:
|
||||
@@ -456,11 +495,11 @@ def write_dataframe(
|
||||
# https://gdal.org/development/rfc/rfc56_millisecond_precision.html#core-changes
|
||||
# Convert each row offset to a signed multiple of 15m and add to GMT value
|
||||
gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100
|
||||
gdal_tz_offsets[name] = gdal_offset_representation
|
||||
gdal_tz_offsets[name] = gdal_offset_representation.values
|
||||
else:
|
||||
values = col.values
|
||||
if isinstance(values, pd.api.extensions.ExtensionArray):
|
||||
from pandas.arrays import IntegerArray, FloatingArray, BooleanArray
|
||||
from pandas.arrays import BooleanArray, FloatingArray, IntegerArray
|
||||
|
||||
if isinstance(values, (IntegerArray, FloatingArray, BooleanArray)):
|
||||
field_data.append(values._data)
|
||||
@@ -473,6 +512,9 @@ def write_dataframe(
|
||||
field_mask.append(None)
|
||||
|
||||
# Determine geometry_type and/or promote_to_multi
|
||||
if geometry_column is not None:
|
||||
geometry_types_all = geometry.geom_type
|
||||
|
||||
if geometry_column is not None and (
|
||||
geometry_type is None or promote_to_multi is None
|
||||
):
|
||||
@@ -482,9 +524,10 @@ def write_dataframe(
|
||||
# If there is data, infer layer geometry type + promote_to_multi
|
||||
if not df.empty:
|
||||
# None/Empty geometries sometimes report as Z incorrectly, so ignore them
|
||||
has_z_arr = geometry[
|
||||
(geometry != np.array(None)) & (~geometry.is_empty)
|
||||
].has_z
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", r"GeoSeries\.notna", UserWarning)
|
||||
geometry_notna = geometry.notna()
|
||||
has_z_arr = geometry[geometry_notna & (~geometry.is_empty)].has_z
|
||||
has_z = has_z_arr.any()
|
||||
all_z = has_z_arr.all()
|
||||
|
||||
@@ -493,7 +536,7 @@ def write_dataframe(
|
||||
f"Mixed 2D and 3D coordinates are not supported by {driver}"
|
||||
)
|
||||
|
||||
geometry_types = pd.Series(geometry.type.unique()).dropna().values
|
||||
geometry_types = pd.Series(geometry_types_all.unique()).dropna().values
|
||||
if len(geometry_types) == 1:
|
||||
tmp_geometry_type = geometry_types[0]
|
||||
if promote_to_multi and tmp_geometry_type in (
|
||||
@@ -539,7 +582,78 @@ def write_dataframe(
|
||||
if epsg:
|
||||
crs = f"EPSG:{epsg}"
|
||||
else:
|
||||
crs = geometry.crs.to_wkt(WktVersion.WKT1_GDAL)
|
||||
crs = geometry.crs.to_wkt("WKT1_GDAL")
|
||||
|
||||
if use_arrow:
|
||||
import pyarrow as pa
|
||||
|
||||
from pyogrio.raw import write_arrow
|
||||
|
||||
if geometry_column is not None:
|
||||
# Convert to multi type
|
||||
if promote_to_multi:
|
||||
import shapely
|
||||
|
||||
mask_points = geometry_types_all == "Point"
|
||||
mask_linestrings = geometry_types_all == "LineString"
|
||||
mask_polygons = geometry_types_all == "Polygon"
|
||||
|
||||
if mask_points.any():
|
||||
geometry[mask_points] = shapely.multipoints(
|
||||
np.atleast_2d(geometry[mask_points]), axis=0
|
||||
)
|
||||
|
||||
if mask_linestrings.any():
|
||||
geometry[mask_linestrings] = shapely.multilinestrings(
|
||||
np.atleast_2d(geometry[mask_linestrings]), axis=0
|
||||
)
|
||||
|
||||
if mask_polygons.any():
|
||||
geometry[mask_polygons] = shapely.multipolygons(
|
||||
np.atleast_2d(geometry[mask_polygons]), axis=0
|
||||
)
|
||||
|
||||
geometry = to_wkb(geometry.values)
|
||||
df = df.copy(deep=False)
|
||||
# convert to plain DataFrame to avoid warning from geopandas about
|
||||
# writing non-geometries to the geometry column
|
||||
df = pd.DataFrame(df, copy=False)
|
||||
df[geometry_column] = geometry
|
||||
|
||||
table = pa.Table.from_pandas(df, preserve_index=False)
|
||||
|
||||
if geometry_column is not None:
|
||||
# ensure that the geometry column is binary (for all-null geometries,
|
||||
# this could be a wrong type)
|
||||
geom_field = table.schema.field(geometry_column)
|
||||
if not (
|
||||
pa.types.is_binary(geom_field.type)
|
||||
or pa.types.is_large_binary(geom_field.type)
|
||||
):
|
||||
table = table.set_column(
|
||||
table.schema.get_field_index(geometry_column),
|
||||
geom_field.with_type(pa.binary()),
|
||||
table[geometry_column].cast(pa.binary()),
|
||||
)
|
||||
|
||||
write_arrow(
|
||||
table,
|
||||
path,
|
||||
layer=layer,
|
||||
driver=driver,
|
||||
geometry_name=geometry_column,
|
||||
geometry_type=geometry_type,
|
||||
crs=crs,
|
||||
encoding=encoding,
|
||||
append=append,
|
||||
dataset_metadata=dataset_metadata,
|
||||
layer_metadata=layer_metadata,
|
||||
metadata=metadata,
|
||||
dataset_options=dataset_options,
|
||||
layer_options=layer_options,
|
||||
**kwargs,
|
||||
)
|
||||
return
|
||||
|
||||
# If there is geometry data, prepare it to be written
|
||||
if geometry_column is not None:
|
||||
|
||||
Reference in New Issue
Block a user