2025-01-26 19:24:23 -08:00
parent 32cd60e92b
commit d1dde0dbc6
4155 changed files with 29170 additions and 216373 deletions


@@ -2,7 +2,6 @@ from packaging.version import Version
import pyarrow
_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}


@@ -1,19 +1,31 @@
from packaging.version import Version
import json
import warnings
from packaging.version import Version
import numpy as np
from pandas import DataFrame, Series
import geopandas._compat as compat
from geopandas._compat import import_optional_dependency
from geopandas.array import from_wkb
from geopandas import GeoDataFrame
import shapely
import geopandas
from geopandas import GeoDataFrame
from geopandas._compat import import_optional_dependency
from geopandas.array import from_shapely, from_wkb
from .file import _expand_user
METADATA_VERSION = "1.0.0"
SUPPORTED_VERSIONS = ["0.1.0", "0.4.0", "1.0.0-beta.1", "1.0.0"]
SUPPORTED_VERSIONS = ["0.1.0", "0.4.0", "1.0.0-beta.1", "1.0.0", "1.1.0"]
GEOARROW_ENCODINGS = [
"point",
"linestring",
"polygon",
"multipoint",
"multilinestring",
"multipolygon",
]
SUPPORTED_ENCODINGS = ["WKB"] + GEOARROW_ENCODINGS
# reference: https://github.com/opengeospatial/geoparquet
# Metadata structure:
@@ -68,7 +80,40 @@ def _remove_id_from_member_of_ensembles(json_dict):
member.pop("id", None)
def _create_metadata(df, schema_version=None):
# type ids 0 to 7
_geometry_type_names = [
"Point",
"LineString",
"LineString",
"Polygon",
"MultiPoint",
"MultiLineString",
"MultiPolygon",
"GeometryCollection",
]
_geometry_type_names += [geom_type + " Z" for geom_type in _geometry_type_names]
def _get_geometry_types(series):
"""
Get unique geometry types from a GeoSeries.
"""
arr_geometry_types = shapely.get_type_id(series.array._data)
# ensure to include "... Z" for 3D geometries
has_z = shapely.has_z(series.array._data)
arr_geometry_types[has_z] += 8
geometry_types = Series(arr_geometry_types).unique().tolist()
# drop missing values (shapely.get_type_id returns -1 for those)
if -1 in geometry_types:
geometry_types.remove(-1)
return sorted([_geometry_type_names[idx] for idx in geometry_types])
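
A minimal sketch of the type-id arithmetic above, assuming shapely 2.x (get_type_id() yields ids 0-7, and the +8 offset for 3D geometries lands in the "... Z" half of _geometry_type_names):

import numpy as np
import shapely

geoms = np.array(
    [shapely.Point(0, 0), shapely.Point(0, 0, 1), shapely.LineString([(0, 0), (1, 1)])]
)
ids = shapely.get_type_id(geoms)  # array([0, 0, 1]): Point, Point, LineString
ids[shapely.has_z(geoms)] += 8    # the 3D Point becomes 8, i.e. "Point Z"
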
def _create_metadata(
df, schema_version=None, geometry_encoding=None, write_covering_bbox=False
):
"""Create and encode geo metadata dict.
Parameters
@@ -77,13 +122,22 @@ def _create_metadata(df, schema_version=None):
schema_version : {'0.1.0', '0.4.0', '1.0.0-beta.1', '1.0.0', None}
GeoParquet specification version; if not provided will default to
latest supported version.
write_covering_bbox : bool, default False
Writes the bounding box column for each row entry with column
name 'bbox'. Writing a bbox column can be computationally
expensive, hence the default setting is False.
Returns
-------
dict
"""
schema_version = schema_version or METADATA_VERSION
if schema_version is None:
if geometry_encoding and any(
encoding != "WKB" for encoding in geometry_encoding.values()
):
schema_version = "1.1.0"
else:
schema_version = METADATA_VERSION
if schema_version not in SUPPORTED_VERSIONS:
raise ValueError(
@@ -94,7 +148,8 @@ def _create_metadata(df, schema_version=None):
column_metadata = {}
for col in df.columns[df.dtypes == "geometry"]:
series = df[col]
geometry_types = sorted(Series(series.geom_type.unique()).dropna())
geometry_types = _get_geometry_types(series)
if schema_version[0] == "0":
geometry_types_name = "geometry_type"
if len(geometry_types) == 1:
@@ -111,7 +166,7 @@ def _create_metadata(df, schema_version=None):
_remove_id_from_member_of_ensembles(crs)
column_metadata[col] = {
"encoding": "WKB",
"encoding": geometry_encoding[col],
"crs": crs,
geometry_types_name: geometry_types,
}
@@ -121,10 +176,20 @@ def _create_metadata(df, schema_version=None):
# don't add bbox with NaNs for empty / all-NA geometry column
column_metadata[col]["bbox"] = bbox
if write_covering_bbox:
column_metadata[col]["covering"] = {
"bbox": {
"xmin": ["bbox", "xmin"],
"ymin": ["bbox", "ymin"],
"xmax": ["bbox", "xmax"],
"ymax": ["bbox", "ymax"],
},
}
return {
"primary_column": df._geometry_column_name,
"columns": column_metadata,
"version": schema_version or METADATA_VERSION,
"version": schema_version,
"creator": {"library": "geopandas", "version": geopandas.__version__},
}
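
For illustration, a sketch of the dict this returns for one WKB-encoded column written with write_covering_bbox=True (bbox values and versions are illustrative; 'crs' holds a PROJJSON dict in practice, and schema versions 0.x use the key 'geometry_type' instead of 'geometry_types'):

{
    "primary_column": "geometry",
    "columns": {
        "geometry": {
            "encoding": "WKB",
            "crs": None,  # None is interpreted as the default OGC:CRS84
            "geometry_types": ["Point"],
            "bbox": [0.0, 0.0, 1.0, 1.0],
            "covering": {
                "bbox": {
                    "xmin": ["bbox", "xmin"],
                    "ymin": ["bbox", "ymin"],
                    "xmax": ["bbox", "xmax"],
                    "ymax": ["bbox", "ymax"],
                }
            },
        }
    },
    "version": "1.1.0",
    "creator": {"library": "geopandas", "version": "1.0.0"},
}
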
@@ -188,7 +253,7 @@ def _validate_dataframe(df):
raise ValueError("Index level names must be strings")
def _validate_metadata(metadata):
def _validate_geo_metadata(metadata):
"""Validate geo metadata.
Must not be empty, and must contain the structure specified above.
@@ -232,8 +297,12 @@ def _validate_metadata(metadata):
"'{key}' for column '{col}'".format(key=key, col=col)
)
if column_metadata["encoding"] != "WKB":
raise ValueError("Only WKB geometry encoding is supported")
if column_metadata["encoding"] not in SUPPORTED_ENCODINGS:
raise ValueError(
"Only WKB geometry encoding or one of the native encodings "
f"({GEOARROW_ENCODINGS!r}) are supported, "
f"got: {column_metadata['encoding']}"
)
if column_metadata.get("edges", "planar") == "spherical":
warnings.warn(
@@ -245,37 +314,59 @@ def _validate_metadata(metadata):
stacklevel=4,
)
if "covering" in column_metadata:
covering = column_metadata["covering"]
if "bbox" in covering:
bbox = covering["bbox"]
for var in ["xmin", "ymin", "xmax", "ymax"]:
if var not in bbox.keys():
raise ValueError("Metadata for bbox column is malformed.")
def _geopandas_to_arrow(df, index=None, schema_version=None):
def _geopandas_to_arrow(
df,
index=None,
geometry_encoding="WKB",
schema_version=None,
write_covering_bbox=None,
):
"""
Helper function with main, shared logic for to_parquet/to_feather.
"""
from pyarrow import Table
from pyarrow import StructArray
from geopandas.io._geoarrow import geopandas_to_arrow
_validate_dataframe(df)
# create geo metadata before altering incoming data frame
geo_metadata = _create_metadata(df, schema_version=schema_version)
if schema_version is not None:
if geometry_encoding != "WKB" and schema_version != "1.1.0":
raise ValueError(
"'geoarrow' encoding is only supported with schema version >= 1.1.0"
)
kwargs = {}
if compat.USE_SHAPELY_20:
kwargs = {"flavor": "iso"}
else:
for col in df.columns[df.dtypes == "geometry"]:
series = df[col]
if series.has_z.any():
warnings.warn(
"The GeoDataFrame contains 3D geometries, and when using "
"shapely < 2.0, such geometries will be written not exactly "
"following to the GeoParquet spec (not using ISO WKB). For "
"most use cases this should not be a problem (GeoPandas can "
"read such files fine).",
stacklevel=2,
)
break
df = df.to_wkb(**kwargs)
table, geometry_encoding_dict = geopandas_to_arrow(
df, geometry_encoding=geometry_encoding, index=index, interleaved=False
)
geo_metadata = _create_metadata(
df,
schema_version=schema_version,
geometry_encoding=geometry_encoding_dict,
write_covering_bbox=write_covering_bbox,
)
table = Table.from_pandas(df, preserve_index=index)
if write_covering_bbox:
if "bbox" in df.columns:
raise ValueError(
"An existing column 'bbox' already exists in the dataframe. "
"Please rename to write covering bbox."
)
bounds = df.bounds
bbox_array = StructArray.from_arrays(
[bounds["minx"], bounds["miny"], bounds["maxx"], bounds["maxy"]],
names=["xmin", "ymin", "xmax", "ymax"],
)
table = table.append_column("bbox", bbox_array)
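
A self-contained sketch of the struct column built above (literal values stand in for the df.bounds columns that geopandas passes):

import pyarrow as pa

bbox_array = pa.StructArray.from_arrays(
    [pa.array([0.0]), pa.array([0.0]), pa.array([1.0]), pa.array([1.0])],
    names=["xmin", "ymin", "xmax", "ymax"],
)
# one struct per row: {'xmin': 0.0, 'ymin': 0.0, 'xmax': 1.0, 'ymax': 1.0}
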
# Store geopandas specific file-level metadata
# This must be done AFTER creating the table or it is not persisted
@@ -286,7 +377,14 @@ def _geopandas_to_arrow(df, index=None, schema_version=None):
def _to_parquet(
df, path, index=None, compression="snappy", schema_version=None, **kwargs
df,
path,
index=None,
compression="snappy",
geometry_encoding="WKB",
schema_version=None,
write_covering_bbox=False,
**kwargs,
):
"""
Write a GeoDataFrame to the Parquet format.
@@ -312,9 +410,17 @@ def _to_parquet(
output except `RangeIndex` which is stored as metadata only.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
geometry_encoding : {'WKB', 'geoarrow'}, default 'WKB'
The encoding to use for the geometry columns. Defaults to "WKB"
for maximum interoperability. Specify "geoarrow" to use one of the
native GeoArrow-based single-geometry type encodings.
schema_version : {'0.1.0', '0.4.0', '1.0.0', None}
GeoParquet specification version; if not provided will default to
latest supported version.
write_covering_bbox : bool, default False
Writes the bounding box column for each row entry with column
name 'bbox'. Writing a bbox column can be computationally
expensive, hence the default setting is False.
**kwargs
Additional keyword arguments passed to pyarrow.parquet.write_table().
"""
@@ -322,19 +428,14 @@ def _to_parquet(
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
)
if kwargs and "version" in kwargs and kwargs["version"] is not None:
if schema_version is None and kwargs["version"] in SUPPORTED_VERSIONS:
warnings.warn(
"the `version` parameter has been replaced with `schema_version`. "
"`version` will instead be passed directly to the underlying "
"parquet writer unless `version` is 0.1.0 or 0.4.0.",
FutureWarning,
stacklevel=2,
)
schema_version = kwargs.pop("version")
path = _expand_user(path)
table = _geopandas_to_arrow(df, index=index, schema_version=schema_version)
table = _geopandas_to_arrow(
df,
index=index,
geometry_encoding=geometry_encoding,
schema_version=schema_version,
write_covering_bbox=write_covering_bbox,
)
parquet.write_table(table, path, compression=compression, **kwargs)
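
Writer-side usage sketch of the two new keywords (file name hypothetical, gdf an arbitrary GeoDataFrame):

gdf.to_parquet("cities.parquet", geometry_encoding="geoarrow", write_covering_bbox=True)
# -> GeoParquet 1.1.0 metadata, native GeoArrow geometry encoding, and a
#    'bbox' struct column that readers can use for predicate pushdown
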
@@ -379,47 +480,26 @@ def _to_feather(df, path, index=None, compression=None, schema_version=None, **k
if Version(pyarrow.__version__) < Version("0.17.0"):
raise ImportError("pyarrow >= 0.17 required for Feather support")
if kwargs and "version" in kwargs and kwargs["version"] is not None:
if schema_version is None and kwargs["version"] in SUPPORTED_VERSIONS:
warnings.warn(
"the `version` parameter has been replaced with `schema_version`. "
"`version` will instead be passed directly to the underlying "
"feather writer unless `version` is 0.1.0 or 0.4.0.",
FutureWarning,
stacklevel=2,
)
schema_version = kwargs.pop("version")
path = _expand_user(path)
table = _geopandas_to_arrow(df, index=index, schema_version=schema_version)
feather.write_feather(table, path, compression=compression, **kwargs)
def _arrow_to_geopandas(table, metadata=None):
def _arrow_to_geopandas(table, geo_metadata=None):
"""
Helper function with main, shared logic for read_parquet/read_feather.
"""
df = table.to_pandas()
metadata = metadata or table.schema.metadata
if metadata is None or b"geo" not in metadata:
raise ValueError(
"""Missing geo metadata in Parquet/Feather file.
Use pandas.read_parquet/read_feather() instead."""
)
try:
metadata = _decode_metadata(metadata.get(b"geo", b""))
except (TypeError, json.decoder.JSONDecodeError):
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
_validate_metadata(metadata)
if geo_metadata is None:
# Note: this path of not passing metadata is also used by dask-geopandas
geo_metadata = _validate_and_decode_metadata(table.schema.metadata)
# Find all geometry columns that were read from the file. May
# be a subset if 'columns' parameter is used.
geometry_columns = df.columns.intersection(metadata["columns"])
geometry_columns = [
col for col in geo_metadata["columns"] if col in table.column_names
]
result_column_names = list(table.slice(0, 0).to_pandas().columns)
geometry_columns.sort(key=result_column_names.index)
if not len(geometry_columns):
raise ValueError(
@@ -428,7 +508,7 @@ def _arrow_to_geopandas(table, metadata=None):
use pandas.read_parquet/read_feather() instead."""
)
geometry = metadata["primary_column"]
geometry = geo_metadata["primary_column"]
# Missing geometry likely indicates a subset of columns was read;
# promote the first available geometry to the primary geometry.
@@ -443,9 +523,12 @@ def _arrow_to_geopandas(table, metadata=None):
stacklevel=3,
)
table_attr = table.drop(geometry_columns)
df = table_attr.to_pandas()
# Convert the WKB columns that are present back to geometry.
for col in geometry_columns:
col_metadata = metadata["columns"][col]
col_metadata = geo_metadata["columns"][col]
if "crs" in col_metadata:
crs = col_metadata["crs"]
if isinstance(crs, dict):
@@ -455,7 +538,19 @@ def _arrow_to_geopandas(table, metadata=None):
# OGC:CRS84
crs = "OGC:CRS84"
df[col] = from_wkb(df[col].values, crs=crs)
if col_metadata["encoding"] == "WKB":
geom_arr = from_wkb(np.array(table[col]), crs=crs)
else:
from geopandas.io._geoarrow import construct_shapely_array
geom_arr = from_shapely(
construct_shapely_array(
table[col].combine_chunks(), "geoarrow." + col_metadata["encoding"]
),
crs=crs,
)
df.insert(result_column_names.index(col), col, geom_arr)
return GeoDataFrame(df, geometry=geometry)
@@ -521,7 +616,59 @@ def _ensure_arrow_fs(filesystem):
return filesystem
def _read_parquet(path, columns=None, storage_options=None, **kwargs):
def _validate_and_decode_metadata(metadata):
if metadata is None or b"geo" not in metadata:
raise ValueError(
"""Missing geo metadata in Parquet/Feather file.
Use pandas.read_parquet/read_feather() instead."""
)
# check for malformed metadata
try:
decoded_geo_metadata = _decode_metadata(metadata.get(b"geo", b""))
except (TypeError, json.decoder.JSONDecodeError):
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
_validate_geo_metadata(decoded_geo_metadata)
return decoded_geo_metadata
def _read_parquet_schema_and_metadata(path, filesystem):
"""
Opening the Parquet file/dataset a first time to get the schema and metadata.
TODO: we should look into how we can reuse opened dataset for reading the
actual data, to avoid discovering the dataset twice (problem right now is
that the ParquetDataset interface doesn't allow passing the filters on read)
"""
import pyarrow
from pyarrow import parquet
kwargs = {}
if Version(pyarrow.__version__) < Version("15.0.0"):
kwargs = dict(use_legacy_dataset=False)
try:
schema = parquet.ParquetDataset(path, filesystem=filesystem, **kwargs).schema
except Exception:
schema = parquet.read_schema(path, filesystem=filesystem)
metadata = schema.metadata
# read metadata separately to get the raw Parquet FileMetaData metadata
# (pyarrow doesn't properly expose those in schema.metadata for files
# created by GDAL - https://issues.apache.org/jira/browse/ARROW-16688)
if metadata is None or b"geo" not in metadata:
try:
metadata = parquet.read_metadata(path, filesystem=filesystem).metadata
except Exception:
pass
return schema, metadata
def _read_parquet(path, columns=None, storage_options=None, bbox=None, **kwargs):
"""
Load a Parquet object from the file path, returning a GeoDataFrame.
@@ -565,8 +712,13 @@ def _read_parquet(path, columns=None, storage_options=None, **kwargs):
both ``pyarrow.fs`` and ``fsspec`` (e.g. "s3://") then the ``pyarrow.fs``
filesystem is preferred. Provide the instantiated fsspec filesystem using
the ``filesystem`` keyword if you wish to use its implementation.
bbox : tuple, optional
Bounding box to be used to filter selection from geoparquet data. This
is only usable if the data was saved with the bbox covering metadata,
or if the primary geometry column uses the 'point' encoding.
Input is of the tuple format (xmin, ymin, xmax, ymax).
**kwargs
Any additional kwargs passed to pyarrow.parquet.read_table().
Any additional kwargs passed to :func:`pyarrow.parquet.read_table`.
Returns
-------
@@ -595,29 +747,36 @@ def _read_parquet(path, columns=None, storage_options=None, **kwargs):
filesystem, path = _get_filesystem_path(
path, filesystem=filesystem, storage_options=storage_options
)
path = _expand_user(path)
schema, metadata = _read_parquet_schema_and_metadata(path, filesystem)
geo_metadata = _validate_and_decode_metadata(metadata)
bbox_filter = (
_get_parquet_bbox_filter(geo_metadata, bbox) if bbox is not None else None
)
if_bbox_column_exists = _check_if_covering_in_geo_metadata(geo_metadata)
# by default, bbox column is not read in, so must specify which
# columns are read in if it exists.
if not columns and if_bbox_column_exists:
columns = _get_non_bbox_columns(schema, geo_metadata)
# if both bbox and filters kwargs are used, must splice together.
if "filters" in kwargs:
filters_kwarg = kwargs.pop("filters")
filters = _splice_bbox_and_filters(filters_kwarg, bbox_filter)
else:
filters = bbox_filter
kwargs["use_pandas_metadata"] = True
table = parquet.read_table(path, columns=columns, filesystem=filesystem, **kwargs)
# read metadata separately to get the raw Parquet FileMetaData metadata
# (pyarrow doesn't properly expose those in schema.metadata for files
# created by GDAL - https://issues.apache.org/jira/browse/ARROW-16688)
metadata = None
if table.schema.metadata is None or b"geo" not in table.schema.metadata:
try:
# read_metadata does not accept a filesystem keyword, so need to
# handle this manually (https://issues.apache.org/jira/browse/ARROW-16719)
if filesystem is not None:
pa_filesystem = _ensure_arrow_fs(filesystem)
with pa_filesystem.open_input_file(path) as source:
metadata = parquet.read_metadata(source).metadata
else:
metadata = parquet.read_metadata(path).metadata
except Exception:
pass
table = parquet.read_table(
path, columns=columns, filesystem=filesystem, filters=filters, **kwargs
)
return _arrow_to_geopandas(table, metadata)
return _arrow_to_geopandas(table, geo_metadata)
def _read_feather(path, columns=None, **kwargs):
@@ -677,11 +836,78 @@ def _read_feather(path, columns=None, **kwargs):
)
# TODO move this into `import_optional_dependency`
import pyarrow
import geopandas.io._pyarrow_hotfix # noqa: F401
if Version(pyarrow.__version__) < Version("0.17.0"):
raise ImportError("pyarrow >= 0.17 required for Feather support")
path = _expand_user(path)
table = feather.read_table(path, columns=columns, **kwargs)
return _arrow_to_geopandas(table)
def _get_parquet_bbox_filter(geo_metadata, bbox):
primary_column = geo_metadata["primary_column"]
if _check_if_covering_in_geo_metadata(geo_metadata):
bbox_column_name = _get_bbox_encoding_column_name(geo_metadata)
return _convert_bbox_to_parquet_filter(bbox, bbox_column_name)
elif geo_metadata["columns"][primary_column]["encoding"] == "point":
import pyarrow.compute as pc
return (
(pc.field((primary_column, "x")) >= bbox[0])
& (pc.field((primary_column, "x")) <= bbox[2])
& (pc.field((primary_column, "y")) >= bbox[1])
& (pc.field((primary_column, "y")) <= bbox[3])
)
else:
raise ValueError(
"Specifying 'bbox' not supported for this Parquet file (it should either "
"have a bbox covering column or use 'point' encoding)."
)
def _convert_bbox_to_parquet_filter(bbox, bbox_column_name):
import pyarrow.compute as pc
return ~(
(pc.field((bbox_column_name, "xmin")) > bbox[2])
| (pc.field((bbox_column_name, "ymin")) > bbox[3])
| (pc.field((bbox_column_name, "xmax")) < bbox[0])
| (pc.field((bbox_column_name, "ymax")) < bbox[1])
)
def _check_if_covering_in_geo_metadata(geo_metadata):
primary_column = geo_metadata["primary_column"]
return "covering" in geo_metadata["columns"][primary_column].keys()
def _get_bbox_encoding_column_name(geo_metadata):
primary_column = geo_metadata["primary_column"]
return geo_metadata["columns"][primary_column]["covering"]["bbox"]["xmin"][0]
def _get_non_bbox_columns(schema, geo_metadata):
bbox_column_name = _get_bbox_encoding_column_name(geo_metadata)
columns = schema.names
if bbox_column_name in columns:
columns.remove(bbox_column_name)
return columns
def _splice_bbox_and_filters(kwarg_filters, bbox_filter):
parquet = import_optional_dependency(
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
)
if bbox_filter is None:
return kwarg_filters
filters_expression = parquet.filters_to_expression(kwarg_filters)
return bbox_filter & filters_expression
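
Reader-side usage sketch (file and column names hypothetical); the bbox keyword needs a covering bbox column or 'point' encoding, and any additional filters are combined with the bbox expression by _splice_bbox_and_filters:

import geopandas

gdf = geopandas.read_parquet(
    "cities.parquet",
    bbox=(-74.3, 40.5, -73.7, 40.9),        # (xmin, ymin, xmax, ymax)
    filters=[("population", ">", 100_000)],
)
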


@@ -1,30 +1,33 @@
from __future__ import annotations
import os
import urllib.request
import warnings
from io import IOBase
from packaging.version import Version
from pathlib import Path
import warnings
# Adapted from pandas.io.common
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_netloc, uses_params, uses_relative
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype
import pyproj
import shapely
from shapely.geometry import mapping
from shapely.geometry.base import BaseGeometry
from geopandas import GeoDataFrame, GeoSeries
# Adapted from pandas.io.common
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_netloc, uses_params, uses_relative
import urllib.request
from geopandas._compat import HAS_PYPROJ, PANDAS_GE_20
from geopandas.io.util import vsi_path
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("")
# file:// URIs are supported by fiona/pyogrio -> don't already open + read the file here
_VALID_URLS.discard("file")
fiona = None
fiona_env = None
fiona_import_error = None
@@ -55,6 +58,7 @@ def _import_fiona():
FIONA_GE_19 = Version(Version(fiona.__version__).base_version) >= Version(
"1.9.0"
)
except ImportError as err:
fiona = False
fiona_import_error = str(err)
@@ -71,13 +75,14 @@ def _import_pyogrio():
if pyogrio is None:
try:
import pyogrio
except ImportError as err:
pyogrio = False
pyogrio_import_error = str(err)
def _check_fiona(func):
if fiona is None:
if not fiona:
raise ImportError(
f"the {func} requires the 'fiona' package, but it is not installed or does "
f"not import correctly.\nImporting fiona resulted in: {fiona_import_error}"
@@ -85,7 +90,7 @@ def _check_fiona(func):
def _check_pyogrio(func):
if pyogrio is None:
if not pyogrio:
raise ImportError(
f"the {func} requires the 'pyogrio' package, but it is not installed "
"or does not import correctly."
@@ -93,35 +98,49 @@ def _check_pyogrio(func):
)
def _check_metadata_supported(metadata: str | None, engine: str, driver: str) -> None:
if metadata is None:
return
if driver != "GPKG":
raise NotImplementedError(
"The 'metadata' keyword is only supported for the GPKG driver."
)
if engine == "fiona" and not FIONA_GE_19:
raise NotImplementedError(
"The 'metadata' keyword is only supported for Fiona >= 1.9."
)
def _check_engine(engine, func):
# if not specified through keyword or option, then default to "fiona" if
# installed, otherwise try pyogrio
# if not specified through keyword or option, then default to "pyogrio" if
# installed, otherwise try fiona
if engine is None:
import geopandas
engine = geopandas.options.io_engine
if engine is None:
_import_fiona()
if fiona:
engine = "fiona"
_import_pyogrio()
if pyogrio:
engine = "pyogrio"
else:
_import_pyogrio()
if pyogrio:
engine = "pyogrio"
_import_fiona()
if fiona:
engine = "fiona"
if engine == "fiona":
_import_fiona()
_check_fiona(func)
elif engine == "pyogrio":
if engine == "pyogrio":
_import_pyogrio()
_check_pyogrio(func)
elif engine == "fiona":
_import_fiona()
_check_fiona(func)
elif engine is None:
raise ImportError(
f"The {func} requires the 'pyogrio' or 'fiona' package, "
"but neither is installed or imports correctly."
f"\nImporting fiona resulted in: {fiona_import_error}"
f"\nImporting pyogrio resulted in: {pyogrio_import_error}"
f"\nImporting fiona resulted in: {fiona_import_error}"
)
return engine
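
The resulting engine resolution in a usage sketch (file name hypothetical):

import geopandas

geopandas.options.io_engine = "pyogrio"  # set the global default explicitly
gdf = geopandas.read_file("data.gpkg")   # resolved to pyogrio
gdf = geopandas.read_file("data.gpkg", engine="fiona")  # per-call override
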
@@ -168,31 +187,12 @@ def _is_url(url):
return False
def _is_zip(path):
"""Check if a given path is a zipfile"""
parsed = fiona.path.ParsedPath.from_uri(path)
return (
parsed.archive.endswith(".zip")
if parsed.archive
else parsed.path.endswith(".zip")
)
def _read_file(filename, bbox=None, mask=None, rows=None, engine=None, **kwargs):
def _read_file(
filename, bbox=None, mask=None, columns=None, rows=None, engine=None, **kwargs
):
"""
Returns a GeoDataFrame from a file or URL.
.. note::
GeoPandas currently defaults to use Fiona as the engine in ``read_file``.
However, GeoPandas 1.0 will switch to use pyogrio as the default engine, since
pyogrio can provide a significant speedup compared to Fiona. We recommend to
already install pyogrio and specify the engine by using the ``engine`` keyword
(``geopandas.read_file(..., engine="pyogrio")``), or by setting the default for
the ``engine`` keyword globally with::
geopandas.options.io_engine = "pyogrio"
Parameters
----------
filename : str, path object or file-like object
@@ -209,21 +209,28 @@ def _read_file(filename, bbox=None, mask=None, rows=None, engine=None, **kwargs)
Filter for features that intersect with the given dict-like geojson
geometry, GeoSeries, GeoDataFrame or shapely geometry.
CRS mis-matches are resolved if given a GeoSeries or GeoDataFrame.
Cannot be used with bbox.
Cannot be used with bbox. If multiple geometries are passed, this will
first union all geometries, which may be computationally expensive.
columns : list, optional
List of column names to import from the data source. Column names
must exactly match the names in the data source. To avoid reading
any columns (besides the geometry column), pass an empty list-like.
By default reads all columns.
rows : int or slice, default None
Load in specific rows by passing an integer (first `n` rows) or a
slice() object.
engine : str, "fiona" or "pyogrio"
engine : str, "pyogrio" or "fiona"
The underlying library that is used to read the file. Currently, the
supported options are "fiona" and "pyogrio". Defaults to "fiona" if
installed, otherwise tries "pyogrio".
supported options are "pyogrio" and "fiona". Defaults to "pyogrio" if
installed, otherwise tries "fiona". Engine can also be set globally
with the ``geopandas.options.io_engine`` option.
**kwargs :
Keyword args to be passed to the engine. In case of the "fiona" engine,
the keyword arguments are passed to :func:`fiona.open` or
:class:`fiona.collection.BytesCollection` when opening the file.
For more information on possible keywords, type:
``import fiona; help(fiona.open)``. In case of the "pyogrio" engine,
the keyword arguments are passed to :func:`pyogrio.read_dataframe`.
Keyword args to be passed to the engine, and can be used to access
multi-layer data, data stored within archives (zip files), etc.
In case of the "pyogrio" engine, the keyword arguments are passed to
:func:`pyogrio.read_dataframe`. In case of the "fiona" engine, the keyword
arguments are passed to :func:`fiona.open`. For more information on possible
keywords, type: ``import pyogrio; help(pyogrio.read_dataframe)``.
Examples
@@ -284,7 +291,9 @@ def _read_file(filename, bbox=None, mask=None, rows=None, engine=None, **kwargs)
from_bytes = True
if engine == "pyogrio":
return _read_file_pyogrio(filename, bbox=bbox, mask=mask, rows=rows, **kwargs)
return _read_file_pyogrio(
filename, bbox=bbox, mask=mask, columns=columns, rows=rows, **kwargs
)
elif engine == "fiona":
if pd.api.types.is_file_like(filename):
@@ -295,7 +304,13 @@ def _read_file(filename, bbox=None, mask=None, rows=None, engine=None, **kwargs)
path_or_bytes = filename
return _read_file_fiona(
path_or_bytes, from_bytes, bbox=bbox, mask=mask, rows=rows, **kwargs
path_or_bytes,
from_bytes,
bbox=bbox,
mask=mask,
columns=columns,
rows=rows,
**kwargs,
)
else:
@@ -303,31 +318,36 @@ def _read_file(filename, bbox=None, mask=None, rows=None, engine=None, **kwargs)
def _read_file_fiona(
path_or_bytes, from_bytes, bbox=None, mask=None, rows=None, where=None, **kwargs
path_or_bytes,
from_bytes,
bbox=None,
mask=None,
columns=None,
rows=None,
where=None,
**kwargs,
):
if where is not None and not FIONA_GE_19:
raise NotImplementedError("where requires fiona 1.9+")
if columns is not None:
if "include_fields" in kwargs:
raise ValueError(
"Cannot specify both 'include_fields' and 'columns' keywords"
)
if not FIONA_GE_19:
raise NotImplementedError("'columns' keyword requires fiona 1.9+")
kwargs["include_fields"] = columns
elif "include_fields" in kwargs:
# alias to columns, as this variable is used below to specify column order
# in the dataframe creation
columns = kwargs["include_fields"]
if not from_bytes:
# Opening a file via URL or file-like-object above automatically detects a
# zipped file. In order to match that behavior, attempt to add a zip scheme
# if missing.
if _is_zip(str(path_or_bytes)):
parsed = fiona.parse_path(str(path_or_bytes))
if isinstance(parsed, fiona.path.ParsedPath):
# If fiona is able to parse the path, we can safely look at the scheme
# and update it to have a zip scheme if necessary.
schemes = (parsed.scheme or "").split("+")
if "zip" not in schemes:
parsed.scheme = "+".join(["zip"] + schemes)
path_or_bytes = parsed.name
elif isinstance(parsed, fiona.path.UnparsedPath) and not str(
path_or_bytes
).startswith("/vsi"):
# If fiona is unable to parse the path, it might have a Windows drive
# scheme. Try adding zip:// to the front. If the path starts with "/vsi"
# it is a legacy GDAL path type, so let it pass unmodified.
path_or_bytes = "zip://" + parsed.name
path_or_bytes = vsi_path(str(path_or_bytes))
if from_bytes:
reader = fiona.BytesCollection
@@ -359,7 +379,7 @@ def _read_file_fiona(
assert len(bbox) == 4
# handle loading the mask
elif isinstance(mask, (GeoDataFrame, GeoSeries)):
mask = mapping(mask.to_crs(crs).unary_union)
mask = mapping(mask.to_crs(crs).union_all())
elif isinstance(mask, BaseGeometry):
mask = mapping(mask)
@@ -383,11 +403,14 @@ def _read_file_fiona(
else:
f_filt = features
# get list of columns
columns = list(features.schema["properties"])
columns = columns or list(features.schema["properties"])
datetime_fields = [
k for (k, v) in features.schema["properties"].items() if v == "datetime"
]
if kwargs.get("ignore_geometry", False):
if (
kwargs.get("ignore_geometry", False)
or features.schema["geometry"] == "None"
):
df = pd.DataFrame(
[record["properties"] for record in f_filt], columns=columns
)
@@ -396,16 +419,39 @@ def _read_file_fiona(
f_filt, crs=crs, columns=columns + ["geometry"]
)
for k in datetime_fields:
as_dt = pd.to_datetime(df[k], errors="ignore")
# if to_datetime failed, try again for mixed timezone offsets
if as_dt.dtype == "object":
as_dt = None
# plain try catch for when pandas will raise in the future
# TODO we can tighten the exception type in future when it does
try:
with warnings.catch_warnings():
# pandas 2.x does not yet enforce this behaviour but raises a
# warning -> we want to suppress this warning for our users,
# and do this by turning it into an error so we take the
# `except` code path to try again with utc=True
warnings.filterwarnings(
"error",
"In a future version of pandas, parsing datetimes with "
"mixed time zones will raise an error",
FutureWarning,
)
as_dt = pd.to_datetime(df[k])
except Exception:
pass
if as_dt is None or as_dt.dtype == "object":
# if to_datetime failed, try again for mixed timezone offsets
# This can still fail if there are invalid datetimes
as_dt = pd.to_datetime(df[k], errors="ignore", utc=True)
try:
as_dt = pd.to_datetime(df[k], utc=True)
except Exception:
pass
# if to_datetime succeeded, round datetimes as
# fiona only supports up to ms precision (any microseconds are
# floating point rounding error)
if not (as_dt.dtype == "object"):
df[k] = as_dt.dt.round(freq="ms")
if as_dt is not None and not (as_dt.dtype == "object"):
if PANDAS_GE_20:
df[k] = as_dt.dt.as_unit("ms")
else:
df[k] = as_dt.dt.round(freq="ms")
return df
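
A minimal sketch of the pandas behaviour the try/except above guards against (on pandas 2.x, mixed UTC offsets produce a FutureWarning and an object-dtype result unless utc=True is passed):

import pandas as pd

s = pd.Series(["2020-01-01T00:00:00+01:00", "2020-06-01T00:00:00+05:00"])
pd.to_datetime(s)            # FutureWarning, object dtype on pandas 2.x
pd.to_datetime(s, utc=True)  # datetime64[ns, UTC]
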
@@ -428,48 +474,79 @@ def _read_file_pyogrio(path_or_bytes, bbox=None, mask=None, rows=None, **kwargs)
raise ValueError("slice with step is not supported")
else:
raise TypeError("'rows' must be an integer or a slice.")
if bbox is not None and mask is not None:
# match error message from Fiona
raise ValueError("mask and bbox can not be set together")
if bbox is not None:
if isinstance(bbox, (GeoDataFrame, GeoSeries)):
bbox = tuple(bbox.total_bounds)
crs = pyogrio.read_info(path_or_bytes).get("crs")
if isinstance(path_or_bytes, IOBase):
path_or_bytes.seek(0)
bbox = tuple(bbox.to_crs(crs).total_bounds)
elif isinstance(bbox, BaseGeometry):
bbox = bbox.bounds
if len(bbox) != 4:
raise ValueError("'bbox' should be a length-4 tuple.")
if mask is not None:
raise ValueError(
"The 'mask' keyword is not supported with the 'pyogrio' engine. "
"You can use 'bbox' instead."
)
# NOTE: mask cannot be used at the same time as the bbox keyword
if isinstance(mask, (GeoDataFrame, GeoSeries)):
crs = pyogrio.read_info(path_or_bytes).get("crs")
if isinstance(path_or_bytes, IOBase):
path_or_bytes.seek(0)
mask = shapely.unary_union(mask.to_crs(crs).geometry.values)
elif isinstance(mask, BaseGeometry):
mask = shapely.unary_union(mask)
elif isinstance(mask, dict) or hasattr(mask, "__geo_interface__"):
# convert GeoJSON to shapely geometry
mask = shapely.geometry.shape(mask)
kwargs["mask"] = mask
if kwargs.pop("ignore_geometry", False):
kwargs["read_geometry"] = False
# TODO: if bbox is not None, check its CRS vs the CRS of the file
# translate `ignore_fields`/`include_fields` keyword for back compat with fiona
if "ignore_fields" in kwargs and "include_fields" in kwargs:
raise ValueError("Cannot specify both 'ignore_fields' and 'include_fields'")
elif "ignore_fields" in kwargs:
if kwargs.get("columns", None) is not None:
raise ValueError(
"Cannot specify both 'columns' and 'ignore_fields' keywords"
)
warnings.warn(
"The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
"will be removed in a future release. You can use the 'columns' keyword "
"instead to select which columns to read.",
DeprecationWarning,
stacklevel=3,
)
ignore_fields = kwargs.pop("ignore_fields")
fields = pyogrio.read_info(path_or_bytes)["fields"]
include_fields = [col for col in fields if col not in ignore_fields]
kwargs["columns"] = include_fields
elif "include_fields" in kwargs:
# translate `include_fields` keyword for back compat with fiona engine
if kwargs.get("columns", None) is not None:
raise ValueError(
"Cannot specify both 'columns' and 'include_fields' keywords"
)
warnings.warn(
"The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
"will be removed in a future release. You can use the 'columns' keyword "
"instead to select which columns to read.",
DeprecationWarning,
stacklevel=3,
)
kwargs["columns"] = kwargs.pop("include_fields")
return pyogrio.read_dataframe(path_or_bytes, bbox=bbox, **kwargs)
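
Usage sketch for the keywords handled above (path and column names hypothetical; bbox and mask remain mutually exclusive, and mask support needs a recent pyogrio):

import geopandas
from shapely.geometry import box

gdf = geopandas.read_file("roads.gpkg", columns=["name", "type"], rows=100)
gdf = geopandas.read_file("roads.gpkg", bbox=(0, 0, 10, 10))
gdf = geopandas.read_file("roads.gpkg", mask=box(0, 0, 10, 10))
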
def read_file(*args, **kwargs):
warnings.warn(
"geopandas.io.file.read_file() is intended for internal "
"use only, and will be deprecated. Use geopandas.read_file() instead.",
FutureWarning,
stacklevel=2,
)
return _read_file(*args, **kwargs)
def to_file(*args, **kwargs):
warnings.warn(
"geopandas.io.file.to_file() is intended for internal "
"use only, and will be deprecated. Use GeoDataFrame.to_file() "
"or GeoSeries.to_file() instead.",
FutureWarning,
stacklevel=2,
)
return _to_file(*args, **kwargs)
def _detect_driver(path):
"""
Attempt to auto-detect driver based on the extension
@@ -497,25 +574,16 @@ def _to_file(
mode="w",
crs=None,
engine=None,
metadata=None,
**kwargs,
):
"""
Write this GeoDataFrame to an OGR data source
A dictionary of supported OGR providers is available via:
>>> import fiona
>>> fiona.supported_drivers # doctest: +SKIP
.. note::
GeoPandas currently defaults to use Fiona as the engine in ``to_file``.
However, GeoPandas 1.0 will switch to use pyogrio as the default engine, since
pyogrio can provide a significant speedup compared to Fiona. We recommend to
already install pyogrio and specify the engine by using the ``engine`` keyword
(``df.to_file(..., engine="pyogrio")``), or by setting the default for
the ``engine`` keyword globally with::
geopandas.options.io_engine = "pyogrio"
>>> import pyogrio
>>> pyogrio.list_drivers() # doctest: +SKIP
Parameters
----------
@@ -557,10 +625,15 @@ def _to_file(
The value can be anything accepted
by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
such as an authority string (eg "EPSG:4326") or a WKT string.
engine : str, "fiona" or "pyogrio"
The underlying library that is used to write the file. Currently, the
supported options are "fiona" and "pyogrio". Defaults to "fiona" if
installed, otherwise tries "pyogrio".
engine : str, "pyogrio" or "fiona"
The underlying library that is used to write the file. Currently, the
supported options are "pyogrio" and "fiona". Defaults to "pyogrio" if
installed, otherwise tries "fiona". Engine can also be set globally
with the ``geopandas.options.io_engine`` option.
metadata : dict[str, str], default None
Optional metadata to be stored in the file. Keys and values must be
strings. Only supported for the "GPKG" driver
(requires Fiona >= 1.9 or pyogrio >= 0.6).
**kwargs :
Keyword args to be passed to the engine, and can be used to write
to multi-layer data, store data within archives (zip files), etc.
@@ -604,44 +677,57 @@ def _to_file(
"to a supported format like a well-known text (WKT) using "
"`GeoSeries.to_wkt()`.",
)
_check_metadata_supported(metadata, engine, driver)
if mode not in ("w", "a"):
raise ValueError(f"'mode' should be one of 'w' or 'a', got '{mode}' instead")
if engine == "fiona":
_to_file_fiona(df, filename, driver, schema, crs, mode, **kwargs)
elif engine == "pyogrio":
_to_file_pyogrio(df, filename, driver, schema, crs, mode, **kwargs)
if engine == "pyogrio":
_to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs)
elif engine == "fiona":
_to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs)
else:
raise ValueError(f"unknown engine '{engine}'")
def _to_file_fiona(df, filename, driver, schema, crs, mode, **kwargs):
def _to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs):
if not HAS_PYPROJ and crs:
raise ImportError(
"The 'pyproj' package is required to write a file with a CRS, but it is not"
" installed or does not import correctly."
)
if schema is None:
schema = infer_schema(df)
if crs:
crs = pyproj.CRS.from_user_input(crs)
from pyproj import CRS
crs = CRS.from_user_input(crs)
else:
crs = df.crs
with fiona_env():
crs_wkt = None
try:
gdal_version = fiona.env.get_gdal_release_name()
except AttributeError:
gdal_version = "2.0.0" # just assume it is not the latest
if Version(gdal_version) >= Version("3.0.0") and crs:
gdal_version = Version(
fiona.env.get_gdal_release_name().strip("e")
) # GH3147
except (AttributeError, ValueError):
gdal_version = Version("2.0.0") # just assume it is not the latest
if gdal_version >= Version("3.0.0") and crs:
crs_wkt = crs.to_wkt()
elif crs:
crs_wkt = crs.to_wkt("WKT1_GDAL")
with fiona.open(
filename, mode=mode, driver=driver, crs_wkt=crs_wkt, schema=schema, **kwargs
) as colxn:
if metadata is not None:
colxn.update_tags(metadata)
colxn.writerecords(df.iterfeatures())
def _to_file_pyogrio(df, filename, driver, schema, crs, mode, **kwargs):
def _to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs):
import pyogrio
if schema is not None:
@@ -653,13 +739,13 @@ def _to_file_pyogrio(df, filename, driver, schema, crs, mode, **kwargs):
kwargs["append"] = True
if crs is not None:
raise ValueError("Passing 'crs' it not supported with the 'pyogrio' engine.")
raise ValueError("Passing 'crs' is not supported with the 'pyogrio' engine.")
# for the fiona engine, this check is done in gdf.iterfeatures()
if not df.columns.is_unique:
raise ValueError("GeoDataFrame cannot contain duplicated column names.")
pyogrio.write_dataframe(df, filename, driver=driver, **kwargs)
pyogrio.write_dataframe(df, filename, driver=driver, metadata=metadata, **kwargs)
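
Usage sketch for the new metadata keyword (GPKG only, per _check_metadata_supported; file name hypothetical):

gdf.to_file("boroughs.gpkg", driver="GPKG", metadata={"title": "NYC boroughs"})
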
def infer_schema(df):
@@ -732,3 +818,34 @@ def _geometry_types(df):
geom_types = geom_types[0]
return geom_types
def _list_layers(filename) -> pd.DataFrame:
"""List layers available in a file.
Provides an overview of layers available in a file or URL together with their
geometry types. When supported by the data source, this includes both spatial and
non-spatial layers. Non-spatial layers are indicated by the ``"geometry_type"``
column being ``None``. GeoPandas will not read such layers but they can be read into
a pd.DataFrame using :func:`pyogrio.read_dataframe`.
Parameters
----------
filename : str, path object or file-like object
Either the absolute or relative path to the file or URL to
be opened, or any object with a read() method (such as an open file
or StringIO)
Returns
-------
pandas.DataFrame
A DataFrame with columns "name" and "geometry_type" and one row per layer.
"""
_import_pyogrio()
_check_pyogrio("list_layers")
import pyogrio
return pd.DataFrame(
pyogrio.list_layers(filename), columns=["name", "geometry_type"]
)
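
Usage sketch (file and layer names hypothetical); non-spatial layers report a geometry_type of None:

import geopandas

geopandas.list_layers("data.gpkg")
#           name geometry_type
# 0       roads    LineString
# 1  attributes          None
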


@@ -1,5 +1,6 @@
import warnings
from contextlib import contextmanager
from functools import lru_cache
import pandas as pd
@@ -8,8 +9,6 @@ import shapely.wkb
from geopandas import GeoDataFrame
from geopandas import _compat as compat
@contextmanager
def _get_conn(conn_or_engine):
@@ -28,7 +27,7 @@ def _get_conn(conn_or_engine):
-------
Connection
"""
from sqlalchemy.engine.base import Engine, Connection
from sqlalchemy.engine.base import Connection, Engine
if isinstance(conn_or_engine, Connection):
if not conn_or_engine.in_transaction():
@@ -43,7 +42,7 @@ def _get_conn(conn_or_engine):
raise ValueError(f"Unknown Connectable: {conn_or_engine}")
def _df_to_geodf(df, geom_col="geom", crs=None):
def _df_to_geodf(df, geom_col="geom", crs=None, con=None):
"""
Transforms a pandas DataFrame into a GeoDataFrame.
The column 'geom_col' must be a geometry column in WKB representation.
@@ -60,6 +59,8 @@ def _df_to_geodf(df, geom_col="geom", crs=None):
such as an authority string (eg "EPSG:4326") or a WKT string.
If not set, tries to determine CRS from the SRID associated with the
first geometry in the database, and assigns that to all geometries.
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
Active connection to the database to query.
Returns
-------
GeoDataFrame
@@ -80,10 +81,6 @@ def _df_to_geodf(df, geom_col="geom", crs=None):
load_geom_bytes = shapely.wkb.loads
"""Load from Python 3 binary."""
def load_geom_buffer(x):
"""Load from Python 2 binary."""
return shapely.wkb.loads(str(x))
def load_geom_text(x):
"""Load from binary encoded as text."""
return shapely.wkb.loads(str(x), hex=True)
@@ -95,13 +92,31 @@ def _df_to_geodf(df, geom_col="geom", crs=None):
df[geom_col] = geoms = geoms.apply(load_geom)
if crs is None:
if compat.SHAPELY_GE_20:
srid = shapely.get_srid(geoms.iat[0])
else:
srid = shapely.geos.lgeos.GEOSGetSRID(geoms.iat[0]._geom)
srid = shapely.get_srid(geoms.iat[0])
# if no defined SRID in geodatabase, returns SRID of 0
if srid != 0:
crs = "epsg:{}".format(srid)
try:
spatial_ref_sys_df = _get_spatial_ref_sys_df(con, srid)
except pd.errors.DatabaseError:
warning_msg = (
f"Could not find the spatial reference system table "
f"(spatial_ref_sys) in PostGIS."
f"Trying epsg:{srid} as a fallback."
)
warnings.warn(warning_msg, UserWarning, stacklevel=3)
crs = "epsg:{}".format(srid)
else:
if not spatial_ref_sys_df.empty:
auth_name = spatial_ref_sys_df["auth_name"].item()
crs = f"{auth_name}:{srid}"
else:
warning_msg = (
f"Could not find srid {srid} in the "
f"spatial_ref_sys table. "
f"Trying epsg:{srid} as a fallback."
)
warnings.warn(warning_msg, UserWarning, stacklevel=3)
crs = "epsg:{}".format(srid)
return GeoDataFrame(df, crs=crs, geometry=geom_col)
@@ -176,7 +191,7 @@ def _read_postgis(
params=params,
chunksize=chunksize,
)
return _df_to_geodf(df, geom_col=geom_col, crs=crs)
return _df_to_geodf(df, geom_col=geom_col, crs=crs, con=con)
else:
# read data in chunks and return a generator
@@ -189,20 +204,9 @@ def _read_postgis(
params=params,
chunksize=chunksize,
)
return (_df_to_geodf(df, geom_col=geom_col, crs=crs) for df in df_generator)
def read_postgis(*args, **kwargs):
import warnings
warnings.warn(
"geopandas.io.sql.read_postgis() is intended for internal "
"use only, and will be deprecated. Use geopandas.read_postgis() instead.",
FutureWarning,
stacklevel=2,
)
return _read_postgis(*args, **kwargs)
return (
_df_to_geodf(df, geom_col=geom_col, crs=crs, con=con) for df in df_generator
)
def _get_geometry_type(gdf):
@@ -253,7 +257,7 @@ def _get_geometry_type(gdf):
def _get_srid_from_crs(gdf):
"""
Get EPSG code from CRS if available. If not, return -1.
Get EPSG code from CRS if available. If not, return 0.
"""
# Use geoalchemy2 default for srid
@@ -279,7 +283,7 @@ def _get_srid_from_crs(gdf):
warnings.warn(warning_msg, UserWarning, stacklevel=2)
if srid is None:
srid = -1
srid = 0
warnings.warn(warning_msg, UserWarning, stacklevel=2)
return srid
@@ -288,8 +292,8 @@ def _get_srid_from_crs(gdf):
def _convert_linearring_to_linestring(gdf, geom_name):
from shapely.geometry import LineString
# Todo: Use Pygeos function once it's implemented:
# https://github.com/pygeos/pygeos/issues/76
# Todo: Use shapely function once it's implemented:
# https://github.com/shapely/shapely/issues/1617
mask = gdf.geom_type == "LinearRing"
gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(
@@ -300,26 +304,11 @@ def _convert_linearring_to_linestring(gdf, geom_name):
def _convert_to_ewkb(gdf, geom_name, srid):
"""Convert geometries to ewkb."""
if compat.USE_SHAPELY_20:
geoms = shapely.to_wkb(
shapely.set_srid(gdf[geom_name].values._data, srid=srid),
hex=True,
include_srid=True,
)
elif compat.USE_PYGEOS:
from pygeos import set_srid, to_wkb
geoms = to_wkb(
set_srid(gdf[geom_name].values._data, srid=srid),
hex=True,
include_srid=True,
)
else:
from shapely.wkb import dumps
geoms = [dumps(geom, srid=srid, hex=True) for geom in gdf[geom_name]]
geoms = shapely.to_wkb(
shapely.set_srid(gdf[geom_name].values._data, srid=srid),
hex=True,
include_srid=True,
)
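
A minimal sketch of the shapely 2.x EWKB path that replaces the removed pygeos/shapely-1.x branches:

import shapely

geom = shapely.set_srid(shapely.Point(1.0, 2.0), 4326)
ewkb = shapely.to_wkb(geom, hex=True, include_srid=True)
# '0101000020E6100000...' -- the 0x20 type flag marks the embedded SRID (4326)
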
# The gdf will warn that the geometry column doesn't hold in-memory geometries
# now that they are EWKB, so convert back to a regular dataframe to avoid warning
@@ -330,8 +319,8 @@ def _convert_to_ewkb(gdf, geom_name, srid):
def _psql_insert_copy(tbl, conn, keys, data_iter):
import io
import csv
import io
s_buf = io.StringIO()
writer = csv.writer(s_buf)
@@ -341,11 +330,16 @@ def _psql_insert_copy(tbl, conn, keys, data_iter):
columns = ", ".join('"{}"'.format(k) for k in keys)
dbapi_conn = conn.connection
sql = 'COPY "{}"."{}" ({}) FROM STDIN WITH CSV'.format(
tbl.table.schema, tbl.table.name, columns
)
with dbapi_conn.cursor() as cur:
sql = 'COPY "{}"."{}" ({}) FROM STDIN WITH CSV'.format(
tbl.table.schema, tbl.table.name, columns
)
cur.copy_expert(sql=sql, file=s_buf)
# Use psycopg method if it's available
if hasattr(cur, "copy") and callable(cur.copy):
with cur.copy(sql) as copy:
copy.write(s_buf.read())
else: # otherwise use psycopg2 method
cur.copy_expert(sql, s_buf)
def _write_postgis(
@@ -469,3 +463,11 @@ def _write_postgis(
dtype=dtype,
method=_psql_insert_copy,
)
@lru_cache
def _get_spatial_ref_sys_df(con, srid):
spatial_ref_sys_sql = (
f"SELECT srid, auth_name FROM spatial_ref_sys WHERE srid = {srid}"
)
return pd.read_sql(spatial_ref_sys_sql, con)
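
A sketch of the lookup this enables (con assumed to be an open SQLAlchemy connection/engine; srid 102008 is a hypothetical non-EPSG code, which is exactly the case the auth_name column handles):

import pandas as pd

srid = 102008  # e.g. an ESRI code registered in spatial_ref_sys
ref = pd.read_sql(f"SELECT srid, auth_name FROM spatial_ref_sys WHERE srid = {srid}", con)
crs = f"{ref['auth_name'].item()}:{srid}" if not ref.empty else f"epsg:{srid}"
# -> "ESRI:102008" instead of the incorrect "epsg:102008"
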


@@ -19,6 +19,7 @@ pickles and test versus the current data that is generated
(with master). These are then compared.
"""
import os
import pickle
import platform
@@ -26,9 +27,10 @@ import sys
import pandas as pd
import geopandas
from shapely.geometry import Point
import geopandas
def create_pickle_data():
"""create the pickle data"""


@@ -1,33 +1,41 @@
import datetime
import io
import json
import os
import pathlib
import shutil
import tempfile
from collections import OrderedDict
from packaging.version import Version
import numpy as np
import pandas as pd
import pytest
import pytz
from packaging.version import Version
from pandas.api.types import is_datetime64_any_dtype
from pandas.testing import assert_series_equal
from shapely.geometry import Point, Polygon, box
from shapely.geometry import Point, Polygon, box, mapping
import geopandas
from geopandas import GeoDataFrame, read_file
from geopandas._compat import PANDAS_GE_20
from geopandas.io.file import _detect_driver, _EXTENSION_TO_DRIVER
from geopandas._compat import HAS_PYPROJ, PANDAS_GE_20, PANDAS_GE_30
from geopandas.io.file import _EXTENSION_TO_DRIVER, _detect_driver
import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
from geopandas.tests.util import PACKAGE_DIR, validate_boro_df
from pandas.testing import assert_frame_equal, assert_series_equal
try:
import pyogrio
PYOGRIO_GE_07 = Version(pyogrio.__version__) > Version("0.6.0")
# those version checks have to be defined here instead of imported from
# geopandas.io.file (those are only initialized lazily on first usage)
PYOGRIO_GE_090 = Version(Version(pyogrio.__version__).base_version) >= Version(
"0.9.0"
)
except ImportError:
pyogrio = False
PYOGRIO_GE_07 = False
PYOGRIO_GE_090 = False
try:
@@ -46,6 +54,9 @@ FIONA_MARK = pytest.mark.skipif(not fiona, reason="fiona not installed")
_CRS = "epsg:4326"
pytestmark = pytest.mark.filterwarnings("ignore:Value:RuntimeWarning:pyogrio")
@pytest.fixture(
params=[
pytest.param("fiona", marks=FIONA_MARK),
@@ -62,9 +73,8 @@ def skip_pyogrio_not_supported(engine):
@pytest.fixture
def df_nybb(engine):
nybb_path = geopandas.datasets.get_path("nybb")
df = read_file(nybb_path, engine=engine)
def df_nybb(engine, nybb_filename):
df = read_file(nybb_filename, engine=engine)
return df
@@ -130,7 +140,7 @@ def test_to_file(tmpdir, df_nybb, df_null, driver, ext, engine):
df = GeoDataFrame.from_file(tempfilename, engine=engine)
assert "geometry" in df
assert len(df) == 5
assert np.alltrue(df["BoroName"].values == df_nybb["BoroName"])
assert np.all(df["BoroName"].values == df_nybb["BoroName"])
# Write layer with null geometry out to file
tempfilename = os.path.join(str(tmpdir), "null_geom" + ext)
@@ -139,7 +149,7 @@ def test_to_file(tmpdir, df_nybb, df_null, driver, ext, engine):
df = GeoDataFrame.from_file(tempfilename, engine=engine)
assert "geometry" in df
assert len(df) == 2
assert np.alltrue(df["Name"].values == df_null["Name"])
assert np.all(df["Name"].values == df_null["Name"])
# check the expected driver
assert_correct_driver(tempfilename, ext, engine)
@@ -153,7 +163,7 @@ def test_to_file_pathlib(tmpdir, df_nybb, driver, ext, engine):
df = GeoDataFrame.from_file(temppath, engine=engine)
assert "geometry" in df
assert len(df) == 5
assert np.alltrue(df["BoroName"].values == df_nybb["BoroName"])
assert np.all(df["BoroName"].values == df_nybb["BoroName"])
# check the expected driver
assert_correct_driver(temppath, ext, engine)
@@ -174,9 +184,10 @@ def test_to_file_bool(tmpdir, driver, ext, engine):
result = read_file(tempfilename, engine=engine)
if ext in (".shp", ""):
# Shapefile does not support boolean, so is read back as int
if engine == "fiona":
# but since GDAL 3.9 supports boolean fields in SHP
if engine == "fiona" and fiona.gdal_version.minor < 9:
df["col"] = df["col"].astype("int64")
else:
elif engine == "pyogrio" and pyogrio.__gdal_version__ < (3, 9):
df["col"] = df["col"].astype("int32")
assert_geodataframe_equal(result, df)
# check the expected driver
@@ -189,15 +200,15 @@ eastern = pytz.timezone("America/New_York")
datetime_type_tests = (TEST_DATE, eastern.localize(TEST_DATE))
@pytest.mark.filterwarnings(
"ignore:Non-conformant content for record 1 in column b:RuntimeWarning"
) # for GPKG, GDAL writes the tz data but warns on reading (see DATETIME_FORMAT option)
@pytest.mark.parametrize(
"time", datetime_type_tests, ids=("naive_datetime", "datetime_with_timezone")
)
@pytest.mark.parametrize("driver,ext", driver_ext_pairs)
def test_to_file_datetime(tmpdir, driver, ext, time, engine):
"""Test writing a data file with the datetime column type"""
if engine == "pyogrio" and time.tzinfo is not None:
# TODO
pytest.skip("pyogrio doesn't yet support timezones")
if ext in (".shp", ""):
pytest.skip(f"Driver corresponding to ext {ext} doesn't support dt fields")
@@ -207,23 +218,25 @@ def test_to_file_datetime(tmpdir, driver, ext, time, engine):
df = GeoDataFrame(
{"a": [1.0, 2.0], "b": [time, time]}, geometry=[point, point], crs=4326
)
fiona_precision_limit = "ms"
df["b"] = df["b"].dt.round(freq=fiona_precision_limit)
df["b"] = df["b"].dt.round(freq="ms")
df.to_file(tempfilename, driver=driver, engine=engine)
df_read = read_file(tempfilename, engine=engine)
assert_geodataframe_equal(df.drop(columns=["b"]), df_read.drop(columns=["b"]))
# Check datetime column
expected = df["b"]
if PANDAS_GE_20:
expected = df["b"].dt.as_unit("ms")
actual = df_read["b"]
if df["b"].dt.tz is not None:
# US/Eastern becomes pytz.FixedOffset(-300) when read from file
# so compare fairly in terms of UTC
assert_series_equal(
df["b"].dt.tz_convert(pytz.utc), df_read["b"].dt.tz_convert(pytz.utc)
)
else:
if engine == "pyogrio" and PANDAS_GE_20:
df["b"] = df["b"].astype("datetime64[ms]")
assert_series_equal(df["b"], df_read["b"])
# as GDAL only models offsets, not timezones.
# Compare fairly in terms of UTC instead
expected = expected.dt.tz_convert(pytz.utc)
actual = actual.dt.tz_convert(pytz.utc)
assert_series_equal(expected, actual)
dt_exts = ["gpkg", "geojson"]
@@ -239,7 +252,7 @@ def write_invalid_date_file(date_str, tmpdir, ext, engine):
)
# Schema not required for GeoJSON since not typed, but needed for GPKG
if ext == "geojson":
df.to_file(tempfilename)
df.to_file(tempfilename, engine=engine)
else:
schema = {"geometry": "Point", "properties": {"date": "datetime"}}
if engine == "pyogrio" and not fiona:
@@ -254,7 +267,7 @@ def test_read_file_datetime_invalid(tmpdir, ext, engine):
# https://github.com/geopandas/geopandas/issues/2502
date_str = "9999-99-99T00:00:00" # invalid date handled by GDAL
tempfilename = write_invalid_date_file(date_str, tmpdir, ext, engine)
res = read_file(tempfilename)
res = read_file(tempfilename, engine=engine)
if ext == "gpkg":
assert is_datetime64_any_dtype(res["date"])
assert pd.isna(res["date"].iloc[-1])
@@ -265,16 +278,19 @@ def test_read_file_datetime_invalid(tmpdir, ext, engine):
@pytest.mark.parametrize("ext", dt_exts)
def test_read_file_datetime_out_of_bounds_ns(tmpdir, ext, engine):
if engine == "pyogrio" and not PANDAS_GE_20:
pytest.skip("with pyogrio requires pandas >= 2.0 to pass")
# https://github.com/geopandas/geopandas/issues/2502
if ext == "geojson":
skip_pyogrio_not_supported(engine)
date_str = "9999-12-31T00:00:00" # valid to GDAL, not to [ns] format
tempfilename = write_invalid_date_file(date_str, tmpdir, ext, engine)
res = read_file(tempfilename)
# Pandas invalid datetimes are read in as object dtype (strings)
assert res["date"].dtype == "object"
assert isinstance(res["date"].iloc[0], str)
res = read_file(tempfilename, engine=engine)
if PANDAS_GE_30:
assert res["date"].dtype == "datetime64[ms]"
assert res["date"].iloc[-1] == pd.Timestamp("9999-12-31 00:00:00")
else:
# Pandas invalid datetimes are read in as object dtype (strings)
assert res["date"].dtype == "object"
assert isinstance(res["date"].iloc[0], str)
def test_read_file_datetime_mixed_offsets(tmpdir):
@@ -292,17 +308,13 @@ def test_read_file_datetime_mixed_offsets(tmpdir):
df.to_file(tempfilename)
# check mixed tz don't crash GH2478
res = read_file(tempfilename)
if engine == "fiona":
# Convert mixed timezones to UTC equivalent
assert is_datetime64_any_dtype(res["date"])
if not PANDAS_GE_20:
utc = pytz.utc
else:
utc = datetime.timezone.utc
assert res["date"].dt.tz == utc
# Convert mixed timezones to UTC equivalent
assert is_datetime64_any_dtype(res["date"])
if not PANDAS_GE_20:
utc = pytz.utc
else:
# old fiona and pyogrio ignore timezones and read as datetimes successfully
assert is_datetime64_any_dtype(res["date"])
utc = datetime.timezone.utc
assert res["date"].dt.tz == utc
@pytest.mark.parametrize("driver,ext", driver_ext_pairs)
@@ -365,14 +377,21 @@ def test_to_file_int32(tmpdir, df_points, engine, driver, ext):
df = GeoDataFrame(geometry=geometry)
df["data"] = pd.array([1, np.nan] * 5, dtype=pd.Int32Dtype())
df.to_file(tempfilename, driver=driver, engine=engine)
df_read = GeoDataFrame.from_file(tempfilename, driver=driver, engine=engine)
assert_geodataframe_equal(df_read, df, check_dtype=False, check_like=True)
df_read = GeoDataFrame.from_file(tempfilename, engine=engine)
# the int column with missing values comes back as float
expected = df.copy()
expected["data"] = expected["data"].astype("float64")
assert_geodataframe_equal(df_read, expected, check_like=True)
tempfilename2 = os.path.join(str(tmpdir), f"int32_2.{ext}")
df2 = df.dropna()
df2.to_file(tempfilename2, driver=driver, engine=engine)
df2_read = GeoDataFrame.from_file(tempfilename2, engine=engine)
if engine == "pyogrio":
tempfilename2 = os.path.join(str(tmpdir), f"int32_2.{ext}")
df2 = df.dropna()
df2.to_file(tempfilename2, driver=driver, engine=engine)
df2_read = GeoDataFrame.from_file(tempfilename2, driver=driver, engine=engine)
assert df2_read["data"].dtype == "int32"
else:
# with the fiona engine the 32 bitwidth is not preserved
assert df2_read["data"].dtype == "int64"
@pytest.mark.parametrize("driver,ext", driver_ext_pairs)
@@ -382,8 +401,11 @@ def test_to_file_int64(tmpdir, df_points, engine, driver, ext):
df = GeoDataFrame(geometry=geometry)
df["data"] = pd.array([1, np.nan] * 5, dtype=pd.Int64Dtype())
df.to_file(tempfilename, driver=driver, engine=engine)
df_read = GeoDataFrame.from_file(tempfilename, driver=driver, engine=engine)
assert_geodataframe_equal(df_read, df, check_dtype=False, check_like=True)
df_read = GeoDataFrame.from_file(tempfilename, engine=engine)
# the int column with missing values comes back as float
expected = df.copy()
expected["data"] = expected["data"].astype("float64")
assert_geodataframe_equal(df_read, expected, check_like=True)
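A short sketch (not part of the diff) of why the `expected` frames above cast to float64: OGR stores NULL for the missing entries, and a plain numpy-backed read has no integer NA, so nullable integer columns round-trip as floats:

import numpy as np
import pandas as pd

s = pd.Series(pd.array([1, np.nan] * 5, dtype=pd.Int64Dtype()))
print(s.dtype)                    # Int64 (nullable extension dtype)
print(s.astype("float64").dtype)  # float64; <NA> becomes NaN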
def test_to_file_empty(tmpdir, engine):
@@ -393,12 +415,6 @@ def test_to_file_empty(tmpdir, engine):
input_empty_df.to_file(tempfilename, engine=engine)
def test_to_file_privacy(tmpdir, df_nybb):
tempfilename = os.path.join(str(tmpdir), "test.shp")
with pytest.warns(FutureWarning):
geopandas.io.file.to_file(df_nybb, tempfilename)
def test_to_file_schema(tmpdir, df_nybb, engine):
"""
Ensure that the file is written according to the schema
@@ -431,12 +447,13 @@ def test_to_file_schema(tmpdir, df_nybb, engine):
assert result_schema == schema
def test_to_file_crs(tmpdir, engine):
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
def test_to_file_crs(tmpdir, engine, nybb_filename):
"""
Ensure that the file is written according to the crs
if it is specified
"""
df = read_file(geopandas.datasets.get_path("nybb"), engine=engine)
df = read_file(nybb_filename, engine=engine)
tempfilename = os.path.join(str(tmpdir), "crs.shp")
# save correct CRS
@@ -445,7 +462,7 @@ def test_to_file_crs(tmpdir, engine):
assert result.crs == df.crs
if engine == "pyogrio":
with pytest.raises(ValueError, match="Passing 'crs' it not supported"):
with pytest.raises(ValueError, match="Passing 'crs' is not supported"):
df.to_file(tempfilename, crs=3857, engine=engine)
return
@@ -455,8 +472,7 @@ def test_to_file_crs(tmpdir, engine):
assert result.crs == "epsg:3857"
# specify CRS for gdf without one
df2 = df.copy()
df2.crs = None
df2 = df.set_crs(None, allow_override=True)
df2.to_file(tempfilename, crs=2263, engine=engine)
df = GeoDataFrame.from_file(tempfilename, engine=engine)
assert df.crs == "epsg:2263"
@@ -529,6 +545,7 @@ def test_mode_unsupported(tmpdir, df_nybb, engine):
df_nybb.to_file(tempfilename, mode="r", engine=engine)
@pytest.mark.filterwarnings("ignore:'crs' was not provided:UserWarning:pyogrio")
@pytest.mark.parametrize("driver,ext", driver_ext_pairs)
def test_empty_crs(tmpdir, driver, ext, engine):
"""Test handling of undefined CRS with GPKG driver (GH #1975)."""
@@ -548,7 +565,7 @@ def test_empty_crs(tmpdir, driver, ext, engine):
if ext == ".geojson":
# geojson by default assumes epsg:4326
df.crs = "EPSG:4326"
df.geometry.array.crs = "EPSG:4326"
assert_geodataframe_equal(result, df)
@@ -561,10 +578,11 @@ def test_empty_crs(tmpdir, driver, ext, engine):
NYBB_CRS = "epsg:2263"
def test_read_file(engine):
df = read_file(geopandas.datasets.get_path("nybb"), engine=engine)
def test_read_file(engine, nybb_filename):
df = read_file(nybb_filename, engine=engine)
validate_boro_df(df)
assert df.crs == NYBB_CRS
if HAS_PYPROJ:
assert df.crs == NYBB_CRS
expected_columns = ["BoroCode", "BoroName", "Shape_Leng", "Shape_Area"]
assert (df.columns[:-1] == expected_columns).all()
@@ -578,7 +596,7 @@ def test_read_file(engine):
"main/geopandas/tests/data/null_geom.geojson",
# url to zip file
"https://raw.githubusercontent.com/geopandas/geopandas/"
"main/geopandas/datasets/nybb_16a.zip",
"main/geopandas/tests/data/nybb_16a.zip",
# url to zipfile without extension
"https://geonode.goosocean.org/download/480",
# url to web service
@@ -596,6 +614,25 @@ def test_read_file_local_uri(file_path, engine):
assert isinstance(gdf, geopandas.GeoDataFrame)
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
def test_read_file_geojson_string_path(engine):
if engine == "pyogrio" and not PYOGRIO_GE_090:
pytest.skip("fixed in pyogrio 0.9.0")
expected = GeoDataFrame({"val_with_hash": ["row # 0"], "geometry": [Point(0, 1)]})
features = {
"type": "FeatureCollection",
"features": [
{
"type": "Feature",
"properties": {"val_with_hash": "row # 0"},
"geometry": {"type": "Point", "coordinates": [0.0, 1.0]},
}
],
}
df_read = read_file(json.dumps(features))
assert_geodataframe_equal(expected.set_crs("EPSG:4326"), df_read)
def test_read_file_textio(file_path, engine):
file_text_stream = open(file_path)
file_stringio = io.StringIO(open(file_path).read())
@@ -648,11 +685,11 @@ def test_read_file_tempfile(engine):
temp.close()
def test_read_binary_file_fsspec(engine):
def test_read_binary_file_fsspec(engine, nybb_filename):
fsspec = pytest.importorskip("fsspec")
# Remove the zip scheme so fsspec doesn't open as a zipped file,
# instead we want to read as bytes and let fiona decode it.
path = geopandas.datasets.get_path("nybb")[6:]
path = nybb_filename[6:]
with fsspec.open(path, "rb") as f:
gdf = read_file(f, engine=engine)
assert isinstance(gdf, geopandas.GeoDataFrame)
@@ -665,10 +702,10 @@ def test_read_text_file_fsspec(file_path, engine):
assert isinstance(gdf, geopandas.GeoDataFrame)
def test_infer_zipped_file(engine):
def test_infer_zipped_file(engine, nybb_filename):
# Remove the zip scheme so that the test for a zipped file can
# check it and add it back.
path = geopandas.datasets.get_path("nybb")[6:]
path = nybb_filename[6:]
gdf = read_file(path, engine=engine)
assert isinstance(gdf, geopandas.GeoDataFrame)
@@ -683,15 +720,24 @@ def test_infer_zipped_file(engine):
assert isinstance(gdf, geopandas.GeoDataFrame)
def test_allow_legacy_gdal_path(engine):
def test_allow_legacy_gdal_path(engine, nybb_filename):
# Construct a GDAL-style zip path.
path = "/vsizip/" + geopandas.datasets.get_path("nybb")[6:]
path = "/vsizip/" + nybb_filename[6:]
gdf = read_file(path, engine=engine)
assert isinstance(gdf, geopandas.GeoDataFrame)
def test_read_file_filtered__bbox(df_nybb, engine):
nybb_filename = geopandas.datasets.get_path("nybb")
@pytest.mark.skipif(not PYOGRIO_GE_090, reason="bug fixed in pyogrio 0.9.0")
def test_read_file_with_hash_in_path(engine, nybb_filename, tmp_path):
folder_with_hash = tmp_path / "path with # present"
folder_with_hash.mkdir(exist_ok=True, parents=True)
read_path = folder_with_hash / "nybb.zip"
shutil.copy(nybb_filename[6:], read_path)
gdf = read_file(read_path, engine=engine)
assert isinstance(gdf, geopandas.GeoDataFrame)
def test_read_file_bbox_tuple(df_nybb, engine, nybb_filename):
bbox = (
1031051.7879884212,
224272.49231459625,
@@ -703,8 +749,7 @@ def test_read_file_filtered__bbox(df_nybb, engine):
assert_geodataframe_equal(filtered_df, expected.reset_index(drop=True))
def test_read_file_filtered__bbox__polygon(df_nybb, engine):
nybb_filename = geopandas.datasets.get_path("nybb")
def test_read_file_bbox_polygon(df_nybb, engine, nybb_filename):
bbox = box(
1031051.7879884212, 224272.49231459625, 1047224.3104931959, 244317.30894023244
)
@@ -713,14 +758,12 @@ def test_read_file_filtered__bbox__polygon(df_nybb, engine):
assert_geodataframe_equal(filtered_df, expected.reset_index(drop=True))
def test_read_file_filtered__rows(df_nybb, engine):
nybb_filename = geopandas.datasets.get_path("nybb")
def test_read_file_filtered__rows(df_nybb, engine, nybb_filename):
filtered_df = read_file(nybb_filename, rows=1, engine=engine)
assert_geodataframe_equal(filtered_df, df_nybb.iloc[[0], :])
def test_read_file_filtered__rows_slice(df_nybb, engine):
nybb_filename = geopandas.datasets.get_path("nybb")
def test_read_file_filtered__rows_slice(df_nybb, engine, nybb_filename):
filtered_df = read_file(nybb_filename, rows=slice(1, 3), engine=engine)
assert_geodataframe_equal(filtered_df, df_nybb.iloc[1:3, :].reset_index(drop=True))
@@ -728,21 +771,14 @@ def test_read_file_filtered__rows_slice(df_nybb, engine):
@pytest.mark.filterwarnings(
"ignore:Layer does not support OLC_FASTFEATURECOUNT:RuntimeWarning"
) # for the slice with -1
def test_read_file_filtered__rows_bbox(df_nybb, engine):
nybb_filename = geopandas.datasets.get_path("nybb")
def test_read_file_filtered__rows_bbox(df_nybb, engine, nybb_filename):
bbox = (
1031051.7879884212,
224272.49231459625,
1047224.3104931959,
244317.30894023244,
)
if engine == "pyogrio" and not PYOGRIO_GE_07:
with pytest.raises(ValueError, match="'skip_features' must be between 0 and 1"):
# combination bbox and rows (rows slice applied after bbox filtering!)
filtered_df = read_file(
nybb_filename, bbox=bbox, rows=slice(4, None), engine=engine
)
else: # fiona
if engine == "fiona":
# combination bbox and rows (rows slice applied after bbox filtering!)
filtered_df = read_file(
nybb_filename, bbox=bbox, rows=slice(4, None), engine=engine
@@ -768,16 +804,14 @@ def test_read_file_filtered__rows_bbox(df_nybb, engine):
)
def test_read_file_filtered_rows_invalid(engine):
def test_read_file_filtered_rows_invalid(engine, nybb_filename):
with pytest.raises(TypeError):
read_file(
geopandas.datasets.get_path("nybb"), rows="not_a_slice", engine=engine
)
read_file(nybb_filename, rows="not_a_slice", engine=engine)
def test_read_file__ignore_geometry(engine):
def test_read_file__ignore_geometry(engine, naturalearth_lowres):
pdf = geopandas.read_file(
geopandas.datasets.get_path("naturalearth_lowres"),
naturalearth_lowres,
ignore_geometry=True,
engine=engine,
)
@@ -785,20 +819,73 @@ def test_read_file__ignore_geometry(engine):
assert isinstance(pdf, pd.DataFrame) and not isinstance(pdf, geopandas.GeoDataFrame)
def test_read_file__ignore_all_fields(engine):
skip_pyogrio_not_supported(engine) # pyogrio has "columns" keyword instead
@pytest.mark.filterwarnings(
"ignore:The 'include_fields' and 'ignore_fields' keywords:DeprecationWarning"
)
def test_read_file__ignore_fields(engine, naturalearth_lowres):
gdf = geopandas.read_file(
geopandas.datasets.get_path("naturalearth_lowres"),
naturalearth_lowres,
ignore_fields=["pop_est", "continent", "iso_a3", "gdp_md_est"],
engine=engine,
)
assert gdf.columns.tolist() == ["name", "geometry"]
@pytest.mark.filterwarnings(
"ignore:The 'include_fields' and 'ignore_fields' keywords:DeprecationWarning"
)
def test_read_file__ignore_all_fields(engine, naturalearth_lowres):
gdf = geopandas.read_file(
naturalearth_lowres,
ignore_fields=["pop_est", "continent", "name", "iso_a3", "gdp_md_est"],
engine="fiona",
engine=engine,
)
assert gdf.columns.tolist() == ["geometry"]
def test_read_file__where_filter(engine):
def test_read_file_missing_geometry(tmpdir, engine):
filename = str(tmpdir / "test.csv")
expected = pd.DataFrame(
{"col1": np.array([1, 2, 3], dtype="int64"), "col2": ["a", "b", "c"]}
)
expected.to_csv(filename, index=False)
df = geopandas.read_file(filename, engine=engine)
# both engines read integers as strings; force back to original type
df["col1"] = df["col1"].astype("int64")
assert isinstance(df, pd.DataFrame)
assert not isinstance(df, geopandas.GeoDataFrame)
assert_frame_equal(df, expected)
def test_read_file_None_attribute(tmp_path, engine):
# Test added in context of https://github.com/geopandas/geopandas/issues/2901
test_path = tmp_path / "test.gpkg"
gdf = GeoDataFrame(
{"a": [None, None]}, geometry=[Point(1, 2), Point(3, 4)], crs=4326
)
gdf.to_file(test_path, engine=engine)
read_gdf = read_file(test_path, engine=engine)
assert_geodataframe_equal(gdf, read_gdf)
def test_read_csv_dtype(tmpdir, df_nybb):
filename = str(tmpdir / "test.csv")
df_nybb.to_csv(filename, index=False)
pdf = pd.read_csv(filename, dtype={"geometry": "geometry"})
assert pdf.geometry.dtype == "geometry"
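A hedged aside (not part of the diff): the test above works because geopandas registers "geometry" as a pandas extension dtype, so WKT strings can be parsed into geometries, for example via GeoSeries.from_wkt:

import geopandas

s = geopandas.GeoSeries.from_wkt(["POINT (1 2)", "POINT (3 4)"])
print(s.dtype)  # geometry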
def test_read_file__where_filter(engine, naturalearth_lowres):
if FIONA_GE_19 or engine == "pyogrio":
gdf = geopandas.read_file(
geopandas.datasets.get_path("naturalearth_lowres"),
naturalearth_lowres,
where="continent='Africa'",
engine=engine,
)
@@ -806,26 +893,75 @@ def test_read_file__where_filter(engine):
else:
with pytest.raises(NotImplementedError):
geopandas.read_file(
geopandas.datasets.get_path("naturalearth_lowres"),
naturalearth_lowres,
where="continent='Africa'",
engine="fiona",
)
@PYOGRIO_MARK
def test_read_file__columns():
# TODO: this is only supported for pyogrio, but we could mimic it for fiona as well
def test_read_file__columns(engine, naturalearth_lowres):
if engine == "fiona" and not FIONA_GE_19:
pytest.skip("columns requires fiona 1.9+")
gdf = geopandas.read_file(
geopandas.datasets.get_path("naturalearth_lowres"),
columns=["name", "pop_est"],
engine="pyogrio",
naturalearth_lowres, columns=["name", "pop_est"], engine=engine
)
assert gdf.columns.tolist() == ["name", "pop_est", "geometry"]
def test_read_file_filtered_with_gdf_boundary(df_nybb, engine):
def test_read_file__columns_empty(engine, naturalearth_lowres):
if engine == "fiona" and not FIONA_GE_19:
pytest.skip("columns requires fiona 1.9+")
gdf = geopandas.read_file(naturalearth_lowres, columns=[], engine=engine)
assert gdf.columns.tolist() == ["geometry"]
@pytest.mark.skipif(FIONA_GE_19 or not fiona, reason="test for fiona < 1.9")
def test_read_file__columns_old_fiona(naturalearth_lowres):
with pytest.raises(NotImplementedError):
geopandas.read_file(
naturalearth_lowres, columns=["name", "pop_est"], engine="fiona"
)
@pytest.mark.filterwarnings(
"ignore:The 'include_fields' and 'ignore_fields' keywords:DeprecationWarning"
)
def test_read_file__include_fields(engine, naturalearth_lowres):
if engine == "fiona" and not FIONA_GE_19:
pytest.skip("columns requires fiona 1.9+")
gdf = geopandas.read_file(
naturalearth_lowres, include_fields=["name", "pop_est"], engine=engine
)
assert gdf.columns.tolist() == ["name", "pop_est", "geometry"]
@pytest.mark.skipif(not FIONA_GE_19, reason="columns requires fiona 1.9+")
def test_read_file__columns_conflicting_keywords(engine, naturalearth_lowres):
path = naturalearth_lowres
with pytest.raises(ValueError, match="Cannot specify both"):
geopandas.read_file(
path, include_fields=["name"], ignore_fields=["pop_est"], engine=engine
)
with pytest.raises(ValueError, match="Cannot specify both"):
geopandas.read_file(
path, columns=["name"], include_fields=["pop_est"], engine=engine
)
with pytest.raises(ValueError, match="Cannot specify both"):
geopandas.read_file(
path, columns=["name"], ignore_fields=["pop_est"], engine=engine
)
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
@pytest.mark.parametrize("file_like", [False, True])
def test_read_file_bbox_gdf(df_nybb, engine, nybb_filename, file_like):
full_df_shape = df_nybb.shape
nybb_filename = geopandas.datasets.get_path("nybb")
bbox = geopandas.GeoDataFrame(
geometry=[
box(
@@ -837,28 +973,41 @@ def test_read_file_filtered_with_gdf_boundary(df_nybb, engine):
],
crs=NYBB_CRS,
)
filtered_df = read_file(nybb_filename, bbox=bbox, engine=engine)
infile = (
open(nybb_filename.replace("zip://", ""), "rb") if file_like else nybb_filename
)
filtered_df = read_file(infile, bbox=bbox, engine=engine)
filtered_df_shape = filtered_df.shape
assert full_df_shape != filtered_df_shape
assert filtered_df_shape == (2, 5)
def test_read_file_filtered_with_gdf_boundary__mask(df_nybb, engine):
skip_pyogrio_not_supported(engine)
gdf_mask = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
gdf = geopandas.read_file(
geopandas.datasets.get_path("naturalearth_cities"),
mask=gdf_mask[gdf_mask.continent == "Africa"],
engine=engine,
)
filtered_df_shape = gdf.shape
assert filtered_df_shape == (57, 2)
def test_read_file_filtered_with_gdf_boundary__mask__polygon(df_nybb, engine):
skip_pyogrio_not_supported(engine)
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
@pytest.mark.parametrize("file_like", [False, True])
def test_read_file_mask_gdf(df_nybb, engine, nybb_filename, file_like):
full_df_shape = df_nybb.shape
mask = geopandas.GeoDataFrame(
geometry=[
box(
1031051.7879884212,
224272.49231459625,
1047224.3104931959,
244317.30894023244,
)
],
crs=NYBB_CRS,
)
infile = (
open(nybb_filename.replace("zip://", ""), "rb") if file_like else nybb_filename
)
filtered_df = read_file(infile, mask=mask, engine=engine)
filtered_df_shape = filtered_df.shape
assert full_df_shape != filtered_df_shape
assert filtered_df_shape == (2, 5)
def test_read_file_mask_polygon(df_nybb, engine, nybb_filename):
full_df_shape = df_nybb.shape
nybb_filename = geopandas.datasets.get_path("nybb")
mask = box(
1031051.7879884212, 224272.49231459625, 1047224.3104931959, 244317.30894023244
)
@@ -868,10 +1017,25 @@ def test_read_file_filtered_with_gdf_boundary__mask__polygon(df_nybb, engine):
assert filtered_df_shape == (2, 5)
def test_read_file_filtered_with_gdf_boundary_mismatched_crs(df_nybb, engine):
skip_pyogrio_not_supported(engine)
def test_read_file_mask_geojson(df_nybb, nybb_filename, engine):
full_df_shape = df_nybb.shape
mask = mapping(
box(
1031051.7879884212,
224272.49231459625,
1047224.3104931959,
244317.30894023244,
)
)
filtered_df = read_file(nybb_filename, mask=mask, engine=engine)
filtered_df_shape = filtered_df.shape
assert full_df_shape != filtered_df_shape
assert filtered_df_shape == (2, 5)
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
def test_read_file_bbox_gdf_mismatched_crs(df_nybb, engine, nybb_filename):
full_df_shape = df_nybb.shape
nybb_filename = geopandas.datasets.get_path("nybb")
bbox = geopandas.GeoDataFrame(
geometry=[
box(
@@ -890,10 +1054,9 @@ def test_read_file_filtered_with_gdf_boundary_mismatched_crs(df_nybb, engine):
assert filtered_df_shape == (2, 5)
def test_read_file_filtered_with_gdf_boundary_mismatched_crs__mask(df_nybb, engine):
skip_pyogrio_not_supported(engine)
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
def test_read_file_mask_gdf_mismatched_crs(df_nybb, engine, nybb_filename):
full_df_shape = df_nybb.shape
nybb_filename = geopandas.datasets.get_path("nybb")
mask = geopandas.GeoDataFrame(
geometry=[
box(
@@ -912,6 +1075,20 @@ def test_read_file_filtered_with_gdf_boundary_mismatched_crs__mask(df_nybb, engi
assert filtered_df_shape == (2, 5)
def test_read_file_bbox_mask_not_allowed(engine, nybb_filename):
bbox = (
1031051.7879884212,
224272.49231459625,
1047224.3104931959,
244317.30894023244,
)
mask = box(*bbox)
with pytest.raises(ValueError, match="mask and bbox can not be set together"):
read_file(nybb_filename, bbox=bbox, mask=mask)
@pytest.mark.filterwarnings(
"ignore:Layer 'b'test_empty'' does not have any features:UserWarning"
)
@@ -942,11 +1119,6 @@ def test_read_file_empty_shapefile(tmpdir, engine):
assert all(empty.columns == ["A", "Z", "geometry"])
def test_read_file_privacy(tmpdir, df_nybb):
with pytest.warns(FutureWarning):
geopandas.io.file.read_file(geopandas.datasets.get_path("nybb"))
class FileNumber(object):
def __init__(self, tmpdir, base, ext):
self.tmpdir = str(tmpdir)
@@ -1113,7 +1285,7 @@ def test_write_index_to_file(tmpdir, df_points, driver, ext, engine):
# index as string
df_p = df_points.copy()
df = GeoDataFrame(df_p["value1"], geometry=df_p.geometry)
df.index = pd.TimedeltaIndex(range(len(df)), "days")
df.index = pd.to_timedelta(range(len(df)), unit="days")
# TODO: TimedeltaIndex is an invalid field type
df.index = df.index.astype(str)
do_checks(df, index_is_used=True)
@@ -1121,7 +1293,7 @@ def test_write_index_to_file(tmpdir, df_points, driver, ext, engine):
# unnamed DatetimeIndex
df_p = df_points.copy()
df = GeoDataFrame(df_p["value1"], geometry=df_p.geometry)
df.index = pd.TimedeltaIndex(range(len(df)), "days") + pd.DatetimeIndex(
df.index = pd.to_timedelta(range(len(df)), unit="days") + pd.to_datetime(
["1999-12-27"] * len(df)
)
if driver == "ESRI Shapefile":
@@ -1152,6 +1324,54 @@ def test_write_read_file(test_file, engine):
os.remove(os.path.expanduser(test_file))
@pytest.mark.skipif(fiona is False, reason="Fiona not available")
@pytest.mark.skipif(FIONA_GE_19, reason="Fiona >= 1.9 supports metadata")
def test_to_file_metadata_unsupported_fiona_version(tmp_path, df_points):
metadata = {"title": "test"}
tmp_file = tmp_path / "test.gpkg"
match = "'metadata' keyword is only supported for Fiona >= 1.9"
with pytest.raises(NotImplementedError, match=match):
df_points.to_file(tmp_file, driver="GPKG", engine="fiona", metadata=metadata)
@pytest.mark.skipif(not FIONA_GE_19, reason="only Fiona >= 1.9 supports metadata")
def test_to_file_metadata_supported_fiona_version(tmp_path, df_points):
metadata = {"title": "test"}
tmp_file = tmp_path / "test.gpkg"
df_points.to_file(tmp_file, driver="GPKG", engine="fiona", metadata=metadata)
# Check that metadata is written to the file
with fiona.open(tmp_file) as src:
tags = src.tags()
assert tags == metadata
@pytest.mark.skipif(pyogrio is False, reason="Pyogrio not available")
def test_to_file_metadata_pyogrio(tmp_path, df_points):
metadata = {"title": "test"}
tmp_file = tmp_path / "test.gpkg"
df_points.to_file(tmp_file, driver="GPKG", engine="pyogrio", metadata=metadata)
# Check that metadata is written to the file
info = pyogrio.read_info(tmp_file)
layer_metadata = info["layer_metadata"]
assert layer_metadata == metadata
@pytest.mark.parametrize(
"driver, ext", [("ESRI Shapefile", ".shp"), ("GeoJSON", ".geojson")]
)
def test_to_file_metadata_unsupported_driver(driver, ext, tmpdir, df_points, engine):
metadata = {"title": "Test"}
tempfilename = os.path.join(str(tmpdir), "test" + ext)
with pytest.raises(
NotImplementedError, match="'metadata' keyword is only supported for"
):
df_points.to_file(tempfilename, driver=driver, metadata=metadata)
def test_multiple_geom_cols_error(tmpdir, df_nybb):
df_nybb["geom2"] = df_nybb.geometry
with pytest.raises(ValueError, match="GeoDataFrame contains multiple geometry"):
@@ -1160,7 +1380,7 @@ def test_multiple_geom_cols_error(tmpdir, df_nybb):
@PYOGRIO_MARK
@FIONA_MARK
def test_option_io_engine():
def test_option_io_engine(nybb_filename):
try:
geopandas.options.io_engine = "pyogrio"
@@ -1171,8 +1391,48 @@ def test_option_io_engine():
orig = fiona.supported_drivers["ESRI Shapefile"]
fiona.supported_drivers["ESRI Shapefile"] = "w"
nybb_filename = geopandas.datasets.get_path("nybb")
_ = geopandas.read_file(nybb_filename)
finally:
fiona.supported_drivers["ESRI Shapefile"] = orig
geopandas.options.io_engine = None
@pytest.mark.skipif(pyogrio, reason="test for pyogrio not installed")
def test_error_engine_unavailable_pyogrio(tmp_path, df_points, file_path):
with pytest.raises(ImportError, match="the 'read_file' function requires"):
geopandas.read_file(file_path, engine="pyogrio")
with pytest.raises(ImportError, match="the 'to_file' method requires"):
df_points.to_file(tmp_path / "test.gpkg", engine="pyogrio")
@pytest.mark.skipif(fiona, reason="test for fiona not installed")
def test_error_engine_unavailable_fiona(tmp_path, df_points, file_path):
with pytest.raises(ImportError, match="the 'read_file' function requires"):
geopandas.read_file(file_path, engine="fiona")
with pytest.raises(ImportError, match="the 'to_file' method requires"):
df_points.to_file(tmp_path / "test.gpkg", engine="fiona")
@PYOGRIO_MARK
def test_list_layers(df_points, tmpdir):
tempfilename = os.path.join(str(tmpdir), "dataset.gpkg")
df_points.to_file(tempfilename, layer="original")
df_points.set_geometry(df_points.buffer(1)).to_file(tempfilename, layer="buffered")
df_points.set_geometry(df_points.buffer(2).boundary).to_file(
tempfilename, layer="boundary"
)
pyogrio.write_dataframe(
df_points[["value1", "value2"]], tempfilename, layer="non-spatial"
)
layers = geopandas.list_layers(tempfilename)
expected = pd.DataFrame(
{
"name": ["original", "buffered", "boundary", "non-spatial"],
"geometry_type": ["Point", "Polygon", "LineString", None],
}
)
assert_frame_equal(layers, expected)

View File

@@ -12,11 +12,10 @@ from shapely.geometry import (
import geopandas
from geopandas import GeoDataFrame
from geopandas.testing import assert_geodataframe_equal
import pytest
from .test_file import FIONA_MARK, PYOGRIO_MARK
import pytest
from geopandas.testing import assert_geodataframe_equal
# Credit: Polygons below come from Montreal city Open Data portal
# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
@@ -244,7 +243,14 @@ def geodataframe(request):
return request.param
@pytest.fixture(params=["GeoJSON", "ESRI Shapefile", "GPKG", "SQLite"])
@pytest.fixture(
params=[
("GeoJSON", ".geojson"),
("ESRI Shapefile", ".shp"),
("GPKG", ".gpkg"),
("SQLite", ".sqlite"),
]
)
def ogr_driver(request):
return request.param
@@ -260,16 +266,18 @@ def engine(request):
def test_to_file_roundtrip(tmpdir, geodataframe, ogr_driver, engine):
output_file = os.path.join(str(tmpdir), "output_file")
driver, ext = ogr_driver
output_file = os.path.join(str(tmpdir), "output_file" + ext)
write_kwargs = {}
if ogr_driver == "SQLite":
if driver == "SQLite":
write_kwargs["spatialite"] = True
# This if statement can be removed once the minimal fiona version is >= 1.8.20
if engine == "fiona":
import fiona
from packaging.version import Version
import fiona
if Version(fiona.__version__) < Version("1.8.20"):
pytest.skip("SQLite driver only available from version 1.8.20")
@@ -285,22 +293,35 @@ def test_to_file_roundtrip(tmpdir, geodataframe, ogr_driver, engine):
):
write_kwargs["geometry_type"] = "Point Z"
expected_error = _expected_error_on(geodataframe, ogr_driver)
expected_error = _expected_error_on(geodataframe, driver)
if expected_error:
with pytest.raises(
RuntimeError, match="Failed to write record|Could not add feature to layer"
):
geodataframe.to_file(
output_file, driver=ogr_driver, engine=engine, **write_kwargs
output_file, driver=driver, engine=engine, **write_kwargs
)
else:
geodataframe.to_file(
output_file, driver=ogr_driver, engine=engine, **write_kwargs
)
if driver == "SQLite" and engine == "pyogrio":
try:
geodataframe.to_file(
output_file, driver=driver, engine=engine, **write_kwargs
)
except ValueError as e:
if "unrecognized option 'SPATIALITE'" in str(e):
pytest.xfail(
"pyogrio wheels from PyPI do not come with SpatiaLite support. "
f"Error: {e}"
)
raise
else:
geodataframe.to_file(
output_file, driver=driver, engine=engine, **write_kwargs
)
reloaded = geopandas.read_file(output_file, engine=engine)
if ogr_driver == "GeoJSON" and engine == "pyogrio":
if driver == "GeoJSON" and engine == "pyogrio":
# For GeoJSON files, the int64 column comes back as int32
reloaded["a"] = reloaded["a"].astype("int64")

View File

@@ -1,5 +1,8 @@
from collections import OrderedDict
import numpy as np
import pandas as pd
from shapely.geometry import (
LineString,
MultiLineString,
@@ -9,12 +12,11 @@ from shapely.geometry import (
Polygon,
)
import pandas as pd
import pytest
import numpy as np
from geopandas import GeoDataFrame
from geopandas.io.file import infer_schema
import pytest
# Credit: Polygons below come from Montreal city Open Data portal
# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
city_hall_boundaries = Polygon(

View File

@@ -2,7 +2,7 @@
See generate_legacy_storage_files.py for the creation of the legacy files.
"""
from contextlib import contextmanager
import glob
import os
import pathlib
@@ -11,9 +11,6 @@ import pandas as pd
import pytest
from geopandas.testing import assert_geodataframe_equal
from geopandas import _compat as compat
import geopandas
from shapely.geometry import Point
DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
@@ -34,18 +31,7 @@ def legacy_pickle(request):
return request.param
@contextmanager
def with_use_pygeos(option):
orig = geopandas.options.use_pygeos
geopandas.options.use_pygeos = option
try:
yield
finally:
geopandas.options.use_pygeos = orig
@pytest.mark.skipif(
compat.USE_SHAPELY_20 or compat.USE_PYGEOS,
@pytest.mark.skip(
reason=(
"shapely 2.0/pygeos-based unpickling currently only works for "
"shapely-2.0/pygeos-written files"
@@ -68,43 +54,3 @@ def test_round_trip_current(tmpdir, current_pickle_data):
result = pd.read_pickle(path)
assert_geodataframe_equal(result, value)
assert isinstance(result.has_sindex, bool)
def _create_gdf():
return geopandas.GeoDataFrame(
{"a": [0.1, 0.2, 0.3], "geometry": [Point(1, 1), Point(2, 2), Point(3, 3)]},
crs="EPSG:4326",
)
@pytest.mark.skipif(not compat.HAS_PYGEOS, reason="requires pygeos to test #1745")
def test_pygeos_switch(tmpdir):
# writing and reading with pygeos disabled
with with_use_pygeos(False):
gdf = _create_gdf()
path = str(tmpdir / "gdf_crs1.pickle")
gdf.to_pickle(path)
result = pd.read_pickle(path)
assert_geodataframe_equal(result, gdf)
# writing without pygeos, reading with pygeos
with with_use_pygeos(False):
gdf = _create_gdf()
path = str(tmpdir / "gdf_crs1.pickle")
gdf.to_pickle(path)
with with_use_pygeos(True):
result = pd.read_pickle(path)
gdf = _create_gdf()
assert_geodataframe_equal(result, gdf)
# writing with pygeos, reading without pygeos
with with_use_pygeos(True):
gdf = _create_gdf()
path = str(tmpdir / "gdf_crs1.pickle")
gdf.to_pickle(path)
with with_use_pygeos(False):
result = pd.read_pickle(path)
gdf = _create_gdf()
assert_geodataframe_equal(result, gdf)

View File

@@ -4,18 +4,27 @@ The spatial database tests may not work without additional system
configuration. postGIS tests require a test database to have been setup;
see geopandas.tests.util for more information.
"""
import os
import warnings
from importlib.util import find_spec
import pandas as pd
import geopandas
from geopandas import GeoDataFrame, read_file, read_postgis
import geopandas._compat as compat
from geopandas.io.sql import _get_conn as get_conn, _write_postgis as write_postgis
from geopandas.tests.util import create_postgis, create_spatialite, validate_boro_df
from geopandas import GeoDataFrame, read_file, read_postgis
from geopandas._compat import HAS_PYPROJ
from geopandas.io.sql import _get_conn as get_conn
from geopandas.io.sql import _write_postgis as write_postgis
import pytest
from geopandas.tests.util import (
create_postgis,
create_spatialite,
mock,
validate_boro_df,
)
try:
from sqlalchemy import text
@@ -26,31 +35,48 @@ except ImportError:
@pytest.fixture
def df_nybb():
nybb_path = geopandas.datasets.get_path("nybb")
df = read_file(nybb_path)
def df_nybb(nybb_filename):
df = read_file(nybb_filename)
return df
@pytest.fixture()
def connection_postgis():
def check_available_postgis_drivers() -> list[str]:
"""Work out which of psycopg2 and psycopg are available.
This prevents tests from running if the relevant package isn't installed
(rather than being skipped, as skips are treated as failures during postgis CI).
"""
Initiates a connection to a postGIS database that must already exist.
See create_postgis for more information.
"""
psycopg2 = pytest.importorskip("psycopg2")
from psycopg2 import OperationalError
drivers = []
if find_spec("psycopg"):
drivers.append("psycopg")
if find_spec("psycopg2"):
drivers.append("psycopg2")
return drivers
POSTGIS_DRIVERS = check_available_postgis_drivers()
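A simplified sketch (not part of the diff; names are illustrative) of the indirect-parametrization pattern used throughout this file: pytest hands each entry of the parametrize list to the fixture through request.param before the fixture body runs:

import pytest

@pytest.fixture
def connection(request):
    # request.param is one entry of the parametrize list below
    return f"connected-with-{request.param}"

@pytest.mark.parametrize("connection", ["psycopg", "psycopg2"], indirect=True)
def test_connection(connection):
    assert connection.startswith("connected-with-")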
def prepare_database_credentials() -> dict:
"""Gather postgres connection credentials from environment variables."""
return {
"dbname": "test_geopandas",
"user": os.environ.get("PGUSER"),
"password": os.environ.get("PGPASSWORD"),
"host": os.environ.get("PGHOST"),
"port": os.environ.get("PGPORT"),
}
@pytest.fixture()
def connection_postgis(request):
"""Create a postgres connection using either psycopg2 or psycopg.
Use this as an indirect fixture, where the request parameter is POSTGIS_DRIVERS."""
psycopg = pytest.importorskip(request.param)
dbname = "test_geopandas"
user = os.environ.get("PGUSER")
password = os.environ.get("PGPASSWORD")
host = os.environ.get("PGHOST")
port = os.environ.get("PGPORT")
try:
con = psycopg2.connect(
dbname=dbname, user=user, password=password, host=host, port=port
)
except OperationalError:
con = psycopg.connect(**prepare_database_credentials())
except psycopg.OperationalError:
pytest.skip("Cannot connect with postgresql database")
with warnings.catch_warnings():
warnings.filterwarnings(
@@ -61,28 +87,25 @@ def connection_postgis():
@pytest.fixture()
def engine_postgis():
def engine_postgis(request):
"""
Initiates a connection engine to a postGIS database that must already exist.
Initiate a sqlalchemy connection engine using either psycopg2 or psycopg.
Use this as an indirect fixture, where the request parameter is POSTGIS_DRIVERS.
"""
sqlalchemy = pytest.importorskip("sqlalchemy")
from sqlalchemy.engine.url import URL
user = os.environ.get("PGUSER")
password = os.environ.get("PGPASSWORD")
host = os.environ.get("PGHOST")
port = os.environ.get("PGPORT")
dbname = "test_geopandas"
credentials = prepare_database_credentials()
try:
con = sqlalchemy.create_engine(
URL.create(
drivername="postgresql+psycopg2",
username=user,
database=dbname,
password=password,
host=host,
port=port,
drivername=f"postgresql+{request.param}",
username=credentials["user"],
database=credentials["dbname"],
password=credentials["password"],
host=credentials["host"],
port=credentials["port"],
)
)
con.connect()
@@ -140,7 +163,7 @@ def drop_table_if_exists(conn_or_engine, table):
@pytest.fixture
def df_mixed_single_and_multi():
from shapely.geometry import Point, LineString, MultiLineString
from shapely.geometry import LineString, MultiLineString, Point
df = geopandas.GeoDataFrame(
{
@@ -157,7 +180,7 @@ def df_mixed_single_and_multi():
@pytest.fixture
def df_geom_collection():
from shapely.geometry import Point, LineString, Polygon, GeometryCollection
from shapely.geometry import GeometryCollection, LineString, Point, Polygon
df = geopandas.GeoDataFrame(
{
@@ -188,7 +211,7 @@ def df_linear_ring():
@pytest.fixture
def df_3D_geoms():
from shapely.geometry import Point, LineString, Polygon
from shapely.geometry import LineString, Point, Polygon
df = geopandas.GeoDataFrame(
{
@@ -204,6 +227,7 @@ def df_3D_geoms():
class TestIO:
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_get_conn(self, engine_postgis):
Connection = pytest.importorskip("sqlalchemy.engine.base").Connection
@@ -217,6 +241,7 @@ class TestIO:
with get_conn(object()):
pass
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_default(self, connection_postgis, df_nybb):
con = connection_postgis
create_postgis(con, df_nybb)
@@ -229,6 +254,7 @@ class TestIO:
# by user; should not be set to 0, as from get_srid failure
assert df.crs is None
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_custom_geom_col(self, connection_postgis, df_nybb):
con = connection_postgis
geom_col = "the_geom"
@@ -239,6 +265,7 @@ class TestIO:
validate_boro_df(df)
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_select_geom_as(self, connection_postgis, df_nybb):
"""Tests that a SELECT {geom} AS {some_other_geom} works."""
con = connection_postgis
@@ -254,6 +281,7 @@ class TestIO:
validate_boro_df(df)
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_get_srid(self, connection_postgis, df_nybb):
"""Tests that an SRID can be read from a geodatabase (GH #451)."""
con = connection_postgis
@@ -267,6 +295,7 @@ class TestIO:
validate_boro_df(df)
assert df.crs == crs
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_override_srid(self, connection_postgis, df_nybb):
"""Tests that a user specified CRS overrides the geodatabase SRID."""
con = connection_postgis
@@ -279,6 +308,7 @@ class TestIO:
validate_boro_df(df)
assert df.crs == orig_crs
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_from_postgis_default(self, connection_postgis, df_nybb):
con = connection_postgis
create_postgis(con, df_nybb)
@@ -288,6 +318,7 @@ class TestIO:
validate_boro_df(df, case_sensitive=False)
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_from_postgis_custom_geom_col(self, connection_postgis, df_nybb):
con = connection_postgis
geom_col = "the_geom"
@@ -323,6 +354,7 @@ class TestIO:
df = read_postgis(sql, con, geom_col=geom_col)
validate_boro_df(df)
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_chunksize(self, connection_postgis, df_nybb):
"""Test chunksize argument"""
chunksize = 2
@@ -337,14 +369,7 @@ class TestIO:
# by user; should not be set to 0, as from get_srid failure
assert df.crs is None
def test_read_postgis_privacy(self, connection_postgis, df_nybb):
con = connection_postgis
create_postgis(con, df_nybb)
sql = "SELECT * FROM nybb;"
with pytest.warns(FutureWarning):
geopandas.io.sql.read_postgis(sql, con)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_default(self, engine_postgis, df_nybb):
"""Tests that GeoDataFrame can be written to PostGIS with defaults."""
engine = engine_postgis
@@ -360,6 +385,7 @@ class TestIO:
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_uppercase_tablename(self, engine_postgis, df_nybb):
"""Tests writing GeoDataFrame to PostGIS with uppercase tablename."""
engine = engine_postgis
@@ -375,6 +401,7 @@ class TestIO:
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_sqlalchemy_connection(self, engine_postgis, df_nybb):
"""Tests that GeoDataFrame can be written to PostGIS with defaults."""
with engine_postgis.begin() as con:
@@ -390,6 +417,7 @@ class TestIO:
df = read_postgis(sql, con, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_fail_when_table_exists(self, engine_postgis, df_nybb):
"""
Tests that uploading the same table raises error when: if_replace='fail'.
@@ -409,6 +437,7 @@ class TestIO:
else:
raise e
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_replace_when_table_exists(self, engine_postgis, df_nybb):
"""
Tests that replacing a table is possible when: if_replace='replace'.
@@ -426,6 +455,7 @@ class TestIO:
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_append_when_table_exists(self, engine_postgis, df_nybb):
"""
Tests that appending to existing table produces correct results when:
@@ -445,15 +475,18 @@ class TestIO:
# There should be twice as many rows in the new table
assert new_rows == orig_rows * 2, (
"There should be {target} rows,"
"found: {current}".format(target=orig_rows * 2, current=new_rows),
"There should be {target} rows,found: {current}".format(
target=orig_rows * 2, current=new_rows
),
)
# Number of columns should stay the same
assert new_cols == orig_cols, (
"There should be {target} columns,"
"found: {current}".format(target=orig_cols, current=new_cols),
"There should be {target} columns,found: {current}".format(
target=orig_cols, current=new_cols
),
)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_without_crs(self, engine_postgis, df_nybb):
"""
Tests that GeoDataFrame can be written to PostGIS without CRS information.
@@ -463,8 +496,7 @@ class TestIO:
table = "nybb"
# Write to db
df_nybb = df_nybb
df_nybb.crs = None
df_nybb.geometry.array.crs = None
with pytest.warns(UserWarning, match="Could not parse CRS from the GeoDataF"):
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Validate that srid is -1
@@ -477,6 +509,7 @@ class TestIO:
target_srid = conn.execute(sql).fetchone()[0]
assert target_srid == 0, "SRID should be 0, found %s" % target_srid
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_with_esri_authority(self, engine_postgis, df_nybb):
"""
Tests that GeoDataFrame can be written to PostGIS with ESRI Authority
@@ -499,6 +532,7 @@ class TestIO:
target_srid = conn.execute(sql).fetchone()[0]
assert target_srid == 102003, "SRID should be 102003, found %s" % target_srid
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_geometry_collection(
self, engine_postgis, df_geom_collection
):
@@ -525,6 +559,7 @@ class TestIO:
assert geom_type.upper() == "GEOMETRYCOLLECTION"
assert df.geom_type.unique()[0] == "GeometryCollection"
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_mixed_geometry_types(
self, engine_postgis, df_mixed_single_and_multi
):
@@ -551,6 +586,7 @@ class TestIO:
assert res[1][0].upper() == "MULTILINESTRING"
assert res[2][0].upper() == "POINT"
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_linear_ring(self, engine_postgis, df_linear_ring):
"""
Tests that writing a LinearRing works.
@@ -572,6 +608,7 @@ class TestIO:
assert geom_type.upper() == "LINESTRING"
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_in_chunks(self, engine_postgis, df_mixed_single_and_multi):
"""
Tests that writing in chunks works.
@@ -605,6 +642,7 @@ class TestIO:
assert res[1][0].upper() == "MULTILINESTRING"
assert res[2][0].upper() == "POINT"
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_to_different_schema(self, engine_postgis, df_nybb):
"""
Tests writing data to alternative schema.
@@ -628,6 +666,7 @@ class TestIO:
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_to_different_schema_when_table_exists(
self, engine_postgis, df_nybb
):
@@ -672,6 +711,7 @@ class TestIO:
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_3D_geometries(self, engine_postgis, df_3D_geoms):
"""
Tests writing a geometries with 3 dimensions works.
@@ -687,6 +727,7 @@ class TestIO:
df = read_postgis(sql, engine, geom_col="geometry")
assert list(df.geometry.has_z) == [True, True, True]
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_row_order(self, engine_postgis, df_nybb):
"""
Tests that the row order in db table follows the order of the original frame.
@@ -703,6 +744,7 @@ class TestIO:
df = read_postgis(sql, engine, geom_col="geometry")
assert df["BoroCode"].tolist() == correct_order
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_append_before_table_exists(self, engine_postgis, df_nybb):
"""
Tests that insert works with if_exists='append' when table does not exist yet.
@@ -720,6 +762,7 @@ class TestIO:
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_append_with_different_crs(self, engine_postgis, df_nybb):
"""
Tests that the warning is raised if table CRS differs from frame.
@@ -736,9 +779,26 @@ class TestIO:
with pytest.raises(ValueError, match="CRS of the target table"):
write_postgis(df_nybb2, con=engine, name=table, if_exists="append")
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_append_without_crs(self, engine_postgis, df_nybb):
# This test was included in #3328 when the default value for no
# CRS was changed from an SRID of -1 to 0. This resolves issues
# appending dataframes without a CRS to postgis, as postgis uses
# 0 as its no-CRS value.
engine = engine_postgis
df_nybb = df_nybb.set_crs(None, allow_override=True)
table = "nybb"
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# append another dataframe with no crs
df_nybb2 = df_nybb
write_postgis(df_nybb2, con=engine, name=table, if_exists="append")
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
@pytest.mark.xfail(
compat.PANDAS_GE_20 and not compat.PANDAS_GE_21,
reason="Duplicate columns are dropped in read_sql with pandas 2.0.x",
compat.PANDAS_GE_20 and not compat.PANDAS_GE_202,
reason="Duplicate columns are dropped in read_sql with pandas 2.0.0 and 2.0.1",
)
def test_duplicate_geometry_column_fails(self, engine_postgis):
"""
@@ -750,3 +810,69 @@ class TestIO:
with pytest.raises(ValueError):
read_postgis(sql, engine, geom_col="geom")
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_non_epsg_crs(self, connection_postgis, df_nybb):
con = connection_postgis
df_nybb = df_nybb.to_crs(crs="esri:54052")
create_postgis(con, df_nybb, srid=54052)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con)
validate_boro_df(df)
assert df.crs == "ESRI:54052"
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
@mock.patch("shapely.get_srid")
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_srid_not_in_table(self, mock_get_srid, connection_postgis, df_nybb):
# mock a non-existent srid for edge case if shapely has an srid
# not present in postgis table.
pyproj = pytest.importorskip("pyproj")
mock_get_srid.return_value = 99999
con = connection_postgis
df_nybb = df_nybb.to_crs(crs="epsg:4326")
create_postgis(con, df_nybb)
sql = "SELECT * FROM nybb;"
with pytest.raises(pyproj.exceptions.CRSError, match="crs not found"):
with pytest.warns(UserWarning, match="Could not find srid 99999"):
read_postgis(sql, con)
@mock.patch("geopandas.io.sql._get_spatial_ref_sys_df")
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_no_spatial_ref_sys_table_in_postgis(
self, mock_get_spatial_ref_sys_df, connection_postgis, df_nybb
):
# mock for a non-existent spatial_ref_sys database
mock_get_spatial_ref_sys_df.side_effect = pd.errors.DatabaseError
con = connection_postgis
df_nybb = df_nybb.to_crs(crs="epsg:4326")
create_postgis(con, df_nybb, srid=4326)
sql = "SELECT * FROM nybb;"
with pytest.warns(
UserWarning, match="Could not find the spatial reference system table"
):
df = read_postgis(sql, con)
assert df.crs == "EPSG:4326"
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_non_epsg_crs_chunksize(self, connection_postgis, df_nybb):
"""Test chunksize argument with non epsg crs"""
chunksize = 2
con = connection_postgis
df_nybb = df_nybb.to_crs(crs="esri:54052")
create_postgis(con, df_nybb, srid=54052)
sql = "SELECT * FROM nybb;"
df = pd.concat(read_postgis(sql, con, chunksize=chunksize))
validate_boro_df(df)
assert df.crs == "ESRI:54052"