that's too much!

This commit is contained in:
2024-12-19 20:22:56 -08:00
parent 0020a609dd
commit 32cd60e92b
8443 changed files with 1446950 additions and 42 deletions

View File

@@ -0,0 +1,73 @@
from packaging.version import Version
import pyarrow
_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
pickle disassembly:\n{pickle_disassembly}
Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
If you trust this file, you can enable reading the extension type by one of:
- upgrading to pyarrow >= 14.0.1 and calling `pa.PyExtensionType.set_auto_load(True)`
- installing pyarrow-hotfix (`pip install pyarrow-hotfix`) and disabling it by running
  `import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
We strongly recommend updating your Parquet/Feather files to use extension types
derived from `pyarrow.ExtensionType` instead, and registering this type explicitly.
See https://arrow.apache.org/docs/dev/python/extending_types.html#defining-extension-types-user-defined-types
for more details.
"""
def patch_pyarrow():
# starting from pyarrow 14.0.1, it has its own mechanism
if Version(pyarrow.__version__) >= Version("14.0.1"):
return
# if the user has pyarrow_hotfix (https://github.com/pitrou/pyarrow-hotfix)
# installed, use this instead (which also ensures it works if they had
# called `pyarrow_hotfix.uninstall()`)
try:
import pyarrow_hotfix # noqa: F401
except ImportError:
pass
else:
return
# if the hotfix is already installed and enabled
if getattr(pyarrow, "_hotfix_installed", False):
return
class ForbiddenExtensionType(pyarrow.ExtensionType):
def __arrow_ext_serialize__(self):
return b""
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
import io
import pickletools
out = io.StringIO()
pickletools.dis(serialized, out)
raise RuntimeError(
_ERROR_MSG.format(
storage_type=storage_type,
serialized=serialized,
pickle_disassembly=out.getvalue(),
)
)
pyarrow.unregister_extension_type("arrow.py_extension_type")
pyarrow.register_extension_type(
ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
)
pyarrow._hotfix_installed = True
patch_pyarrow()
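A minimal usage sketch of the patch above, assuming this file is importable as geopandas.io._pyarrow_hotfix (the import used by the reader functions further down); the file name is illustrative. On pyarrow < 14.0.1, importing the module applies the patch, so deserializing a pickled 'arrow.py_extension_type' column raises instead of executing the pickle:
import pyarrow.parquet as pq
import geopandas.io._pyarrow_hotfix  # noqa: F401  # import side effect: patch_pyarrow()

try:
    pq.read_table("untrusted.parquet")  # hypothetical file with a PyExtensionType column
except RuntimeError as exc:
    print(exc)  # the _ERROR_MSG text, including the pickle disassembly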

View File

@@ -0,0 +1,687 @@
from packaging.version import Version
import json
import warnings
import numpy as np
from pandas import DataFrame, Series
import geopandas._compat as compat
from geopandas._compat import import_optional_dependency
from geopandas.array import from_wkb
from geopandas import GeoDataFrame
import geopandas
from .file import _expand_user
METADATA_VERSION = "1.0.0"
SUPPORTED_VERSIONS = ["0.1.0", "0.4.0", "1.0.0-beta.1", "1.0.0"]
# reference: https://github.com/opengeospatial/geoparquet
# Metadata structure:
# {
# "geo": {
# "columns": {
# "<name>": {
# "encoding": "WKB"
# "geometry_types": <list of str: REQUIRED>
# "crs": "<PROJJSON or None: OPTIONAL>",
# "orientation": "<'counterclockwise' or None: OPTIONAL>"
# "edges": "planar"
# "bbox": <list of [xmin, ymin, xmax, ymax]: OPTIONAL>
# "epoch": <float: OPTIONAL>
# }
# },
# "primary_column": "<str: REQUIRED>",
# "version": "<METADATA_VERSION>",
#
# # Additional GeoPandas specific metadata (not in metadata spec)
# "creator": {
# "library": "geopandas",
# "version": "<geopandas.__version__>"
# }
# }
# }
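# A hypothetical, filled-in instance of the structure above (all values are
# illustrative only, not taken from a real file):
# {
#     "geo": {
#         "primary_column": "geometry",
#         "columns": {
#             "geometry": {
#                 "encoding": "WKB",
#                 "geometry_types": ["Polygon", "MultiPolygon"],
#                 "crs": {<PROJJSON dict for the column's CRS>},
#                 "bbox": [-10.0, -5.0, 10.0, 5.0]
#             }
#         },
#         "version": "1.0.0",
#         "creator": {"library": "geopandas", "version": "<geopandas.__version__>"}
#     }
# }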
def _is_fsspec_url(url):
return (
isinstance(url, str)
and "://" in url
and not url.startswith(("http://", "https://"))
)
def _remove_id_from_member_of_ensembles(json_dict):
"""
Older PROJ versions will not recognize IDs of datum ensemble members that
were added in more recent PROJ database versions.
Cf https://github.com/opengeospatial/geoparquet/discussions/110
and https://github.com/OSGeo/PROJ/pull/3221
Mimicking the patch to GDAL from https://github.com/OSGeo/gdal/pull/5872
"""
for key, value in json_dict.items():
if isinstance(value, dict):
_remove_id_from_member_of_ensembles(value)
elif key == "members" and isinstance(value, list):
for member in value:
member.pop("id", None)
def _create_metadata(df, schema_version=None):
"""Create and encode geo metadata dict.
Parameters
----------
df : GeoDataFrame
schema_version : {'0.1.0', '0.4.0', '1.0.0-beta.1', '1.0.0', None}
GeoParquet specification version; if not provided will default to
latest supported version.
Returns
-------
dict
"""
schema_version = schema_version or METADATA_VERSION
if schema_version not in SUPPORTED_VERSIONS:
raise ValueError(
f"schema_version must be one of: {', '.join(SUPPORTED_VERSIONS)}"
)
# Construct metadata for each geometry
column_metadata = {}
for col in df.columns[df.dtypes == "geometry"]:
series = df[col]
geometry_types = sorted(Series(series.geom_type.unique()).dropna())
if schema_version[0] == "0":
geometry_types_name = "geometry_type"
if len(geometry_types) == 1:
geometry_types = geometry_types[0]
else:
geometry_types_name = "geometry_types"
crs = None
if series.crs:
if schema_version == "0.1.0":
crs = series.crs.to_wkt()
else: # version >= 0.4.0
crs = series.crs.to_json_dict()
_remove_id_from_member_of_ensembles(crs)
column_metadata[col] = {
"encoding": "WKB",
"crs": crs,
geometry_types_name: geometry_types,
}
bbox = series.total_bounds.tolist()
if np.isfinite(bbox).all():
# don't add bbox with NaNs for empty / all-NA geometry column
column_metadata[col]["bbox"] = bbox
return {
"primary_column": df._geometry_column_name,
"columns": column_metadata,
"version": schema_version or METADATA_VERSION,
"creator": {"library": "geopandas", "version": geopandas.__version__},
}
def _encode_metadata(metadata):
"""Encode metadata dict to UTF-8 JSON string
Parameters
----------
metadata : dict
Returns
-------
UTF-8 encoded JSON string
"""
return json.dumps(metadata).encode("utf-8")
def _decode_metadata(metadata_str):
"""Decode a UTF-8 encoded JSON string to dict
Parameters
----------
metadata_str : string (UTF-8 encoded)
Returns
-------
dict
"""
if metadata_str is None:
return None
return json.loads(metadata_str.decode("utf-8"))
def _validate_dataframe(df):
"""Validate that the GeoDataFrame conforms to requirements for writing
to Parquet format.
Raises `ValueError` if the GeoDataFrame is not valid.
copied from `pandas.io.parquet`
Parameters
----------
df : GeoDataFrame
"""
if not isinstance(df, DataFrame):
raise ValueError("Writing to Parquet/Feather only supports IO with DataFrames")
# must have value column names (strings only)
if df.columns.inferred_type not in {"string", "unicode", "empty"}:
raise ValueError("Writing to Parquet/Feather requires string column names")
# index level names must be strings
valid_names = all(
isinstance(name, str) for name in df.index.names if name is not None
)
if not valid_names:
raise ValueError("Index level names must be strings")
def _validate_metadata(metadata):
"""Validate geo metadata.
Must not be empty, and must contain the structure specified above.
Raises ValueError if metadata is not valid.
Parameters
----------
metadata : dict
"""
if not metadata:
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
# version was schema_version in 0.1.0
version = metadata.get("version", metadata.get("schema_version"))
if not version:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key: "
"'version'"
)
required_keys = ("primary_column", "columns")
for key in required_keys:
if metadata.get(key, None) is None:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key: "
"'{key}'".format(key=key)
)
if not isinstance(metadata["columns"], dict):
raise ValueError("'columns' in 'geo' metadata must be a dict")
# Validate that geometry columns have required metadata and values
# leaving out "geometry_type" for compatibility with 0.1
required_col_keys = ("encoding",)
for col, column_metadata in metadata["columns"].items():
for key in required_col_keys:
if key not in column_metadata:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key "
"'{key}' for column '{col}'".format(key=key, col=col)
)
if column_metadata["encoding"] != "WKB":
raise ValueError("Only WKB geometry encoding is supported")
if column_metadata.get("edges", "planar") == "spherical":
warnings.warn(
f"The geo metadata indicate that column '{col}' has spherical edges, "
"but because GeoPandas currently does not support spherical "
"geometry, it ignores this metadata and will interpret the edges of "
"the geometries as planar.",
UserWarning,
stacklevel=4,
)
def _geopandas_to_arrow(df, index=None, schema_version=None):
"""
Helper function with main, shared logic for to_parquet/to_feather.
"""
from pyarrow import Table
_validate_dataframe(df)
# create geo metadata before altering incoming data frame
geo_metadata = _create_metadata(df, schema_version=schema_version)
kwargs = {}
if compat.USE_SHAPELY_20:
kwargs = {"flavor": "iso"}
else:
for col in df.columns[df.dtypes == "geometry"]:
series = df[col]
if series.has_z.any():
warnings.warn(
"The GeoDataFrame contains 3D geometries, and when using "
"shapely < 2.0, such geometries will be written not exactly "
"following to the GeoParquet spec (not using ISO WKB). For "
"most use cases this should not be a problem (GeoPandas can "
"read such files fine).",
stacklevel=2,
)
break
df = df.to_wkb(**kwargs)
table = Table.from_pandas(df, preserve_index=index)
# Store geopandas specific file-level metadata
# This must be done AFTER creating the table or it is not persisted
metadata = table.schema.metadata
metadata.update({b"geo": _encode_metadata(geo_metadata)})
return table.replace_schema_metadata(metadata)
def _to_parquet(
df, path, index=None, compression="snappy", schema_version=None, **kwargs
):
"""
Write a GeoDataFrame to the Parquet format.
Any geometry columns present are serialized to WKB format in the file.
Requires 'pyarrow'.
This is tracking version 1.0.0 of the GeoParquet specification at:
https://github.com/opengeospatial/geoparquet. Writing older versions is
supported using the `schema_version` keyword.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
index : bool, default None
If ``True``, always include the dataframe's index(es) as columns
in the file output.
If ``False``, the index(es) will not be written to the file.
If ``None``, the index(es) will be included as columns in the file
output except `RangeIndex` which is stored as metadata only.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
schema_version : {'0.1.0', '0.4.0', '1.0.0', None}
GeoParquet specification version; if not provided will default to
latest supported version.
**kwargs
Additional keyword arguments passed to pyarrow.parquet.write_table().
"""
parquet = import_optional_dependency(
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
)
if kwargs and "version" in kwargs and kwargs["version"] is not None:
if schema_version is None and kwargs["version"] in SUPPORTED_VERSIONS:
warnings.warn(
"the `version` parameter has been replaced with `schema_version`. "
"`version` will instead be passed directly to the underlying "
"parquet writer unless `version` is 0.1.0 or 0.4.0.",
FutureWarning,
stacklevel=2,
)
schema_version = kwargs.pop("version")
path = _expand_user(path)
table = _geopandas_to_arrow(df, index=index, schema_version=schema_version)
parquet.write_table(table, path, compression=compression, **kwargs)
def _to_feather(df, path, index=None, compression=None, schema_version=None, **kwargs):
"""
Write a GeoDataFrame to the Feather format.
Any geometry columns present are serialized to WKB format in the file.
Requires 'pyarrow' >= 0.17.
This is tracking version 1.0.0 of the GeoParquet specification for
the metadata at: https://github.com/opengeospatial/geoparquet. Writing
older versions is supported using the `schema_version` keyword.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
index : bool, default None
If ``True``, always include the dataframe's index(es) as columns
in the file output.
If ``False``, the index(es) will not be written to the file.
If ``None``, the index(es) will be included as columns in the file
output except `RangeIndex` which is stored as metadata only.
compression : {'zstd', 'lz4', 'uncompressed'}, optional
Name of the compression to use. Use ``"uncompressed"`` for no
compression. By default uses LZ4 if available, otherwise uncompressed.
schema_version : {'0.1.0', '0.4.0', '1.0.0', None}
GeoParquet specification version for the metadata; if not provided
will default to latest supported version.
kwargs
Additional keyword arguments passed to pyarrow.feather.write_feather().
"""
feather = import_optional_dependency(
"pyarrow.feather", extra="pyarrow is required for Feather support."
)
# TODO move this into `import_optional_dependency`
import pyarrow
if Version(pyarrow.__version__) < Version("0.17.0"):
raise ImportError("pyarrow >= 0.17 required for Feather support")
if kwargs and "version" in kwargs and kwargs["version"] is not None:
if schema_version is None and kwargs["version"] in SUPPORTED_VERSIONS:
warnings.warn(
"the `version` parameter has been replaced with `schema_version`. "
"`version` will instead be passed directly to the underlying "
"feather writer unless `version` is 0.1.0 or 0.4.0.",
FutureWarning,
stacklevel=2,
)
schema_version = kwargs.pop("version")
path = _expand_user(path)
table = _geopandas_to_arrow(df, index=index, schema_version=schema_version)
feather.write_feather(table, path, compression=compression, **kwargs)
def _arrow_to_geopandas(table, metadata=None):
"""
Helper function with main, shared logic for read_parquet/read_feather.
"""
df = table.to_pandas()
metadata = metadata or table.schema.metadata
if metadata is None or b"geo" not in metadata:
raise ValueError(
"""Missing geo metadata in Parquet/Feather file.
Use pandas.read_parquet/read_feather() instead."""
)
try:
metadata = _decode_metadata(metadata.get(b"geo", b""))
except (TypeError, json.decoder.JSONDecodeError):
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
_validate_metadata(metadata)
# Find all geometry columns that were read from the file. May
# be a subset if 'columns' parameter is used.
geometry_columns = df.columns.intersection(metadata["columns"])
if not len(geometry_columns):
raise ValueError(
"""No geometry columns are included in the columns read from
the Parquet/Feather file. To read this file without geometry columns,
use pandas.read_parquet/read_feather() instead."""
)
geometry = metadata["primary_column"]
# Missing geometry likely indicates a subset of columns was read;
# promote the first available geometry to the primary geometry.
if len(geometry_columns) and geometry not in geometry_columns:
geometry = geometry_columns[0]
# if there are multiple non-primary geometry columns, raise a warning
if len(geometry_columns) > 1:
warnings.warn(
"Multiple non-primary geometry columns read from Parquet/Feather "
"file. The first column read was promoted to the primary geometry.",
stacklevel=3,
)
# Convert the WKB columns that are present back to geometry.
for col in geometry_columns:
col_metadata = metadata["columns"][col]
if "crs" in col_metadata:
crs = col_metadata["crs"]
if isinstance(crs, dict):
_remove_id_from_member_of_ensembles(crs)
else:
# per the GeoParquet spec, missing CRS is to be interpreted as
# OGC:CRS84
crs = "OGC:CRS84"
df[col] = from_wkb(df[col].values, crs=crs)
return GeoDataFrame(df, geometry=geometry)
def _get_filesystem_path(path, filesystem=None, storage_options=None):
"""
Get the filesystem and path for a given filesystem and path.
If the filesystem is not None then it's just returned as is.
"""
import pyarrow
if (
isinstance(path, str)
and storage_options is None
and filesystem is None
and Version(pyarrow.__version__) >= Version("5.0.0")
):
# Use the native pyarrow filesystem if possible.
try:
from pyarrow.fs import FileSystem
filesystem, path = FileSystem.from_uri(path)
except Exception:
# fallback to use get_handle / fsspec for filesystems
# that pyarrow doesn't support
pass
if _is_fsspec_url(path) and filesystem is None:
fsspec = import_optional_dependency(
"fsspec", extra="fsspec is requred for 'storage_options'."
)
filesystem, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
if filesystem is None and storage_options:
raise ValueError(
"Cannot provide 'storage_options' with non-fsspec path '{}'".format(path)
)
return filesystem, path
def _ensure_arrow_fs(filesystem):
"""
Simplified version of pyarrow.fs._ensure_filesystem. This is only needed
below because `pyarrow.parquet.read_metadata` does not yet accept a
filesystem keyword (https://issues.apache.org/jira/browse/ARROW-16719)
"""
from pyarrow import fs
if isinstance(filesystem, fs.FileSystem):
return filesystem
# handle fsspec-compatible filesystems
try:
import fsspec
except ImportError:
pass
else:
if isinstance(filesystem, fsspec.AbstractFileSystem):
return fs.PyFileSystem(fs.FSSpecHandler(filesystem))
return filesystem
def _read_parquet(path, columns=None, storage_options=None, **kwargs):
"""
Load a Parquet object from the file path, returning a GeoDataFrame.
You can read a subset of columns in the file using the ``columns`` parameter.
However, the structure of the returned GeoDataFrame will depend on which
columns you read:
* if no geometry columns are read, this will raise a ``ValueError`` - you
should use the pandas `read_parquet` method instead.
* if the primary geometry column saved to this file is not included in
columns, the first available geometry column will be set as the geometry
column of the returned GeoDataFrame.
Supports versions 0.1.0, 0.4.0 and 1.0.0 of the GeoParquet
specification at: https://github.com/opengeospatial/geoparquet
If 'crs' key is not present in the GeoParquet metadata associated with the
Parquet object, it will default to "OGC:CRS84" according to the specification.
Requires 'pyarrow'.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
columns : list-like of strings, default=None
If not None, only these columns will be read from the file. If
the primary geometry column is not included, the first secondary
geometry read from the file will be set as the geometry column
of the returned GeoDataFrame. If no geometry columns are present,
a ``ValueError`` will be raised.
storage_options : dict, optional
Extra options that make sense for a particular storage connection, e.g. host,
port, username, password, etc. For HTTP(S) URLs the key-value pairs are
forwarded to urllib as header options. For other URLs (e.g. starting with
"s3://", and "gcs://") the key-value pairs are forwarded to fsspec. Please
see fsspec and urllib for more details.
When no storage options are provided and a filesystem is implemented by
both ``pyarrow.fs`` and ``fsspec`` (e.g. "s3://") then the ``pyarrow.fs``
filesystem is preferred. Provide the instantiated fsspec filesystem using
the ``filesystem`` keyword if you wish to use its implementation.
**kwargs
Any additional kwargs passed to pyarrow.parquet.read_table().
Returns
-------
GeoDataFrame
Examples
--------
>>> df = geopandas.read_parquet("data.parquet") # doctest: +SKIP
Specifying columns to read:
>>> df = geopandas.read_parquet(
... "data.parquet",
... columns=["geometry", "pop_est"]
... ) # doctest: +SKIP
"""
parquet = import_optional_dependency(
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
)
import geopandas.io._pyarrow_hotfix # noqa: F401
# TODO(https://github.com/pandas-dev/pandas/pull/41194): see if pandas
# adds filesystem as a keyword and match that.
filesystem = kwargs.pop("filesystem", None)
filesystem, path = _get_filesystem_path(
path, filesystem=filesystem, storage_options=storage_options
)
path = _expand_user(path)
kwargs["use_pandas_metadata"] = True
table = parquet.read_table(path, columns=columns, filesystem=filesystem, **kwargs)
# read metadata separately to get the raw Parquet FileMetaData metadata
# (pyarrow doesn't properly expose those in schema.metadata for files
# created by GDAL - https://issues.apache.org/jira/browse/ARROW-16688)
metadata = None
if table.schema.metadata is None or b"geo" not in table.schema.metadata:
try:
# read_metadata does not accept a filesystem keyword, so need to
# handle this manually (https://issues.apache.org/jira/browse/ARROW-16719)
if filesystem is not None:
pa_filesystem = _ensure_arrow_fs(filesystem)
with pa_filesystem.open_input_file(path) as source:
metadata = parquet.read_metadata(source).metadata
else:
metadata = parquet.read_metadata(path).metadata
except Exception:
pass
return _arrow_to_geopandas(table, metadata)
def _read_feather(path, columns=None, **kwargs):
"""
Load a Feather object from the file path, returning a GeoDataFrame.
You can read a subset of columns in the file using the ``columns`` parameter.
However, the structure of the returned GeoDataFrame will depend on which
columns you read:
* if no geometry columns are read, this will raise a ``ValueError`` - you
should use the pandas `read_feather` method instead.
* if the primary geometry column saved to this file is not included in
columns, the first available geometry column will be set as the geometry
column of the returned GeoDataFrame.
Supports versions 0.1.0, 0.4.0 and 1.0.0 of the GeoParquet
specification at: https://github.com/opengeospatial/geoparquet
If 'crs' key is not present in the Feather metadata associated with the
Feather object, it will default to "OGC:CRS84" according to the specification.
Requires 'pyarrow' >= 0.17.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
columns : list-like of strings, default=None
If not None, only these columns will be read from the file. If
the primary geometry column is not included, the first secondary
geometry read from the file will be set as the geometry column
of the returned GeoDataFrame. If no geometry columns are present,
a ``ValueError`` will be raised.
**kwargs
Any additional kwargs passed to pyarrow.feather.read_table().
Returns
-------
GeoDataFrame
Examples
--------
>>> df = geopandas.read_feather("data.feather") # doctest: +SKIP
Specifying columns to read:
>>> df = geopandas.read_feather(
... "data.feather",
... columns=["geometry", "pop_est"]
... ) # doctest: +SKIP
"""
feather = import_optional_dependency(
"pyarrow.feather", extra="pyarrow is required for Feather support."
)
# TODO move this into `import_optional_dependency`
import pyarrow
import geopandas.io._pyarrow_hotfix # noqa: F401
if Version(pyarrow.__version__) < Version("0.17.0"):
raise ImportError("pyarrow >= 0.17 required for Feather support")
path = _expand_user(path)
table = feather.read_table(path, columns=columns, **kwargs)
return _arrow_to_geopandas(table)
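For context, a minimal round trip through the public wrappers of the functions above (the file name is illustrative):
import geopandas
from shapely.geometry import Point

gdf = geopandas.GeoDataFrame(
    {"name": ["a", "b"], "geometry": [Point(0, 0), Point(1, 1)]}, crs="EPSG:4326"
)
gdf.to_parquet("example.parquet")                   # dispatches to _to_parquet
result = geopandas.read_parquet("example.parquet")  # dispatches to _read_parquet
assert result.crs == gdf.crs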

View File

@@ -0,0 +1,734 @@
import os
from packaging.version import Version
from pathlib import Path
import warnings
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype
import pyproj
from shapely.geometry import mapping
from shapely.geometry.base import BaseGeometry
from geopandas import GeoDataFrame, GeoSeries
# Adapted from pandas.io.common
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_netloc, uses_params, uses_relative
import urllib.request
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("")
# file:// URIs are supported by fiona/pyogrio -> don't already open + read the file here
_VALID_URLS.discard("file")
fiona = None
fiona_env = None
fiona_import_error = None
FIONA_GE_19 = False
def _import_fiona():
global fiona
global fiona_env
global fiona_import_error
global FIONA_GE_19
if fiona is None:
try:
import fiona
# only try to import fiona.Env if the main fiona import succeeded
# (otherwise you can get confusing "AttributeError: module 'fiona'
# has no attribute '_loading'" / partially initialized module errors)
try:
from fiona import Env as fiona_env
except ImportError:
try:
from fiona import drivers as fiona_env
except ImportError:
fiona_env = None
FIONA_GE_19 = Version(Version(fiona.__version__).base_version) >= Version(
"1.9.0"
)
except ImportError as err:
fiona = False
fiona_import_error = str(err)
pyogrio = None
pyogrio_import_error = None
def _import_pyogrio():
global pyogrio
global pyogrio_import_error
if pyogrio is None:
try:
import pyogrio
except ImportError as err:
pyogrio = False
pyogrio_import_error = str(err)
def _check_fiona(func):
if fiona is None:
raise ImportError(
f"the {func} requires the 'fiona' package, but it is not installed or does "
f"not import correctly.\nImporting fiona resulted in: {fiona_import_error}"
)
def _check_pyogrio(func):
if pyogrio is None:
raise ImportError(
f"the {func} requires the 'pyogrio' package, but it is not installed "
"or does not import correctly."
"\nImporting pyogrio resulted in: {pyogrio_import_error}"
)
def _check_engine(engine, func):
# if not specified through keyword or option, then default to "fiona" if
# installed, otherwise try pyogrio
if engine is None:
import geopandas
engine = geopandas.options.io_engine
if engine is None:
_import_fiona()
if fiona:
engine = "fiona"
else:
_import_pyogrio()
if pyogrio:
engine = "pyogrio"
if engine == "fiona":
_import_fiona()
_check_fiona(func)
elif engine == "pyogrio":
_import_pyogrio()
_check_pyogrio(func)
elif engine is None:
raise ImportError(
f"The {func} requires the 'pyogrio' or 'fiona' package, "
"but neither is installed or imports correctly."
f"\nImporting fiona resulted in: {fiona_import_error}"
f"\nImporting pyogrio resulted in: {pyogrio_import_error}"
)
return engine
_EXTENSION_TO_DRIVER = {
".bna": "BNA",
".dxf": "DXF",
".csv": "CSV",
".shp": "ESRI Shapefile",
".dbf": "ESRI Shapefile",
".json": "GeoJSON",
".geojson": "GeoJSON",
".geojsonl": "GeoJSONSeq",
".geojsons": "GeoJSONSeq",
".gpkg": "GPKG",
".gml": "GML",
".xml": "GML",
".gpx": "GPX",
".gtm": "GPSTrackMaker",
".gtz": "GPSTrackMaker",
".tab": "MapInfo File",
".mif": "MapInfo File",
".mid": "MapInfo File",
".dgn": "DGN",
".fgb": "FlatGeobuf",
}
def _expand_user(path):
"""Expand paths that use ~."""
if isinstance(path, str):
path = os.path.expanduser(path)
elif isinstance(path, Path):
path = path.expanduser()
return path
def _is_url(url):
"""Check to see if *url* has a valid protocol."""
try:
return parse_url(url).scheme in _VALID_URLS
except Exception:
return False
def _is_zip(path):
"""Check if a given path is a zipfile"""
parsed = fiona.path.ParsedPath.from_uri(path)
return (
parsed.archive.endswith(".zip")
if parsed.archive
else parsed.path.endswith(".zip")
)
def _read_file(filename, bbox=None, mask=None, rows=None, engine=None, **kwargs):
"""
Returns a GeoDataFrame from a file or URL.
.. note::
GeoPandas currently defaults to use Fiona as the engine in ``read_file``.
However, GeoPandas 1.0 will switch to use pyogrio as the default engine, since
pyogrio can provide a significant speedup compared to Fiona. We recommend
installing pyogrio and specifying the engine by using the ``engine`` keyword
(``geopandas.read_file(..., engine="pyogrio")``), or by setting the default for
the ``engine`` keyword globally with::
geopandas.options.io_engine = "pyogrio"
Parameters
----------
filename : str, path object or file-like object
Either the absolute or relative path to the file or URL to
be opened, or any object with a read() method (such as an open file
or StringIO)
bbox : tuple | GeoDataFrame or GeoSeries | shapely Geometry, default None
Filter features by given bounding box, GeoSeries, GeoDataFrame or a shapely
geometry. With engine="fiona", CRS mis-matches are resolved if given a GeoSeries
or GeoDataFrame. With engine="pyogrio", bbox must be in the same CRS as the
dataset. Tuple is (minx, miny, maxx, maxy) to match the bounds property of
shapely geometry objects. Cannot be used with mask.
mask : dict | GeoDataFrame or GeoSeries | shapely Geometry, default None
Filter for features that intersect with the given dict-like geojson
geometry, GeoSeries, GeoDataFrame or shapely geometry.
CRS mis-matches are resolved if given a GeoSeries or GeoDataFrame.
Cannot be used with bbox.
rows : int or slice, default None
Load in specific rows by passing an integer (first `n` rows) or a
slice() object.
engine : str, "fiona" or "pyogrio"
The underlying library that is used to read the file. Currently, the
supported options are "fiona" and "pyogrio". Defaults to "fiona" if
installed, otherwise tries "pyogrio".
**kwargs :
Keyword args to be passed to the engine. In case of the "fiona" engine,
the keyword arguments are passed to :func:`fiona.open` or
:class:`fiona.collection.BytesCollection` when opening the file.
For more information on possible keywords, type:
``import fiona; help(fiona.open)``. In case of the "pyogrio" engine,
the keyword arguments are passed to :func:`pyogrio.read_dataframe`.
Examples
--------
>>> df = geopandas.read_file("nybb.shp") # doctest: +SKIP
Specifying layer of GPKG:
>>> df = geopandas.read_file("file.gpkg", layer='cities') # doctest: +SKIP
Reading only first 10 rows:
>>> df = geopandas.read_file("nybb.shp", rows=10) # doctest: +SKIP
Reading only geometries intersecting ``mask``:
>>> df = geopandas.read_file("nybb.shp", mask=polygon) # doctest: +SKIP
Reading only geometries intersecting ``bbox``:
>>> df = geopandas.read_file("nybb.shp", bbox=(0, 0, 10, 20)) # doctest: +SKIP
Returns
-------
:obj:`geopandas.GeoDataFrame` or :obj:`pandas.DataFrame` :
If `ignore_geometry=True` a :obj:`pandas.DataFrame` will be returned.
Notes
-----
The format drivers will attempt to detect the encoding of your data, but
may fail. In this case, the proper encoding can be specified explicitly
by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.
When specifying a URL, geopandas will check if the server supports reading
partial data and in that case pass the URL as is to the underlying engine,
which will then use the network file system handler of GDAL to read from
the URL. Otherwise geopandas will download the data from the URL and pass
all data in-memory to the underlying engine.
If you need more control over how the URL is read, you can specify the
GDAL virtual filesystem manually (e.g. ``/vsicurl/https://...``). See the
GDAL documentation on filesystems for more details
(https://gdal.org/user/virtual_file_systems.html#vsicurl-http-https-ftp-files-random-access).
"""
engine = _check_engine(engine, "'read_file' function")
filename = _expand_user(filename)
from_bytes = False
if _is_url(filename):
# if it is a url that supports random access -> pass through to
# pyogrio/fiona as is (to support downloading only part of the file)
# otherwise still download manually because pyogrio/fiona don't support
# all types of urls (https://github.com/geopandas/geopandas/issues/2908)
with urllib.request.urlopen(filename) as response:
if not response.headers.get("Accept-Ranges") == "bytes":
filename = response.read()
from_bytes = True
if engine == "pyogrio":
return _read_file_pyogrio(filename, bbox=bbox, mask=mask, rows=rows, **kwargs)
elif engine == "fiona":
if pd.api.types.is_file_like(filename):
data = filename.read()
path_or_bytes = data.encode("utf-8") if isinstance(data, str) else data
from_bytes = True
else:
path_or_bytes = filename
return _read_file_fiona(
path_or_bytes, from_bytes, bbox=bbox, mask=mask, rows=rows, **kwargs
)
else:
raise ValueError(f"unknown engine '{engine}'")
def _read_file_fiona(
path_or_bytes, from_bytes, bbox=None, mask=None, rows=None, where=None, **kwargs
):
if where is not None and not FIONA_GE_19:
raise NotImplementedError("where requires fiona 1.9+")
if not from_bytes:
# Opening a file via URL or file-like-object above automatically detects a
# zipped file. In order to match that behavior, attempt to add a zip scheme
# if missing.
if _is_zip(str(path_or_bytes)):
parsed = fiona.parse_path(str(path_or_bytes))
if isinstance(parsed, fiona.path.ParsedPath):
# If fiona is able to parse the path, we can safely look at the scheme
# and update it to have a zip scheme if necessary.
schemes = (parsed.scheme or "").split("+")
if "zip" not in schemes:
parsed.scheme = "+".join(["zip"] + schemes)
path_or_bytes = parsed.name
elif isinstance(parsed, fiona.path.UnparsedPath) and not str(
path_or_bytes
).startswith("/vsi"):
# If fiona is unable to parse the path, it might have a Windows drive
# scheme. Try adding zip:// to the front. If the path starts with "/vsi"
# it is a legacy GDAL path type, so let it pass unmodified.
path_or_bytes = "zip://" + parsed.name
if from_bytes:
reader = fiona.BytesCollection
else:
reader = fiona.open
with fiona_env():
with reader(path_or_bytes, **kwargs) as features:
crs = features.crs_wkt
# attempt to get EPSG code
try:
# fiona 1.9+
epsg = features.crs.to_epsg(confidence_threshold=100)
if epsg is not None:
crs = epsg
except AttributeError:
# fiona <= 1.8
try:
crs = features.crs["init"]
except (TypeError, KeyError):
pass
# handle loading the bounding box
if bbox is not None:
if isinstance(bbox, (GeoDataFrame, GeoSeries)):
bbox = tuple(bbox.to_crs(crs).total_bounds)
elif isinstance(bbox, BaseGeometry):
bbox = bbox.bounds
assert len(bbox) == 4
# handle loading the mask
elif isinstance(mask, (GeoDataFrame, GeoSeries)):
mask = mapping(mask.to_crs(crs).unary_union)
elif isinstance(mask, BaseGeometry):
mask = mapping(mask)
filters = {}
if bbox is not None:
filters["bbox"] = bbox
if mask is not None:
filters["mask"] = mask
if where is not None:
filters["where"] = where
# setup the data loading filter
if rows is not None:
if isinstance(rows, int):
rows = slice(rows)
elif not isinstance(rows, slice):
raise TypeError("'rows' must be an integer or a slice.")
f_filt = features.filter(rows.start, rows.stop, rows.step, **filters)
elif filters:
f_filt = features.filter(**filters)
else:
f_filt = features
# get list of columns
columns = list(features.schema["properties"])
datetime_fields = [
k for (k, v) in features.schema["properties"].items() if v == "datetime"
]
if kwargs.get("ignore_geometry", False):
df = pd.DataFrame(
[record["properties"] for record in f_filt], columns=columns
)
else:
df = GeoDataFrame.from_features(
f_filt, crs=crs, columns=columns + ["geometry"]
)
for k in datetime_fields:
as_dt = pd.to_datetime(df[k], errors="ignore")
# if to_datetime failed, try again for mixed timezone offsets
if as_dt.dtype == "object":
# This can still fail if there are invalid datetimes
as_dt = pd.to_datetime(df[k], errors="ignore", utc=True)
# if to_datetime succeeded, round datetimes as
# fiona only supports up to ms precision (any microseconds are
# floating point rounding error)
if not (as_dt.dtype == "object"):
df[k] = as_dt.dt.round(freq="ms")
return df
def _read_file_pyogrio(path_or_bytes, bbox=None, mask=None, rows=None, **kwargs):
import pyogrio
if rows is not None:
if isinstance(rows, int):
kwargs["max_features"] = rows
elif isinstance(rows, slice):
if rows.start is not None:
if rows.start < 0:
raise ValueError(
"Negative slice start not supported with the 'pyogrio' engine."
)
kwargs["skip_features"] = rows.start
if rows.stop is not None:
kwargs["max_features"] = rows.stop - (rows.start or 0)
if rows.step is not None:
raise ValueError("slice with step is not supported")
else:
raise TypeError("'rows' must be an integer or a slice.")
if bbox is not None:
if isinstance(bbox, (GeoDataFrame, GeoSeries)):
bbox = tuple(bbox.total_bounds)
elif isinstance(bbox, BaseGeometry):
bbox = bbox.bounds
if len(bbox) != 4:
raise ValueError("'bbox' should be a length-4 tuple.")
if mask is not None:
raise ValueError(
"The 'mask' keyword is not supported with the 'pyogrio' engine. "
"You can use 'bbox' instead."
)
if kwargs.pop("ignore_geometry", False):
kwargs["read_geometry"] = False
# TODO: if bbox is not None, check its CRS vs the CRS of the file
return pyogrio.read_dataframe(path_or_bytes, bbox=bbox, **kwargs)
def read_file(*args, **kwargs):
warnings.warn(
"geopandas.io.file.read_file() is intended for internal "
"use only, and will be deprecated. Use geopandas.read_file() instead.",
FutureWarning,
stacklevel=2,
)
return _read_file(*args, **kwargs)
def to_file(*args, **kwargs):
warnings.warn(
"geopandas.io.file.to_file() is intended for internal "
"use only, and will be deprecated. Use GeoDataFrame.to_file() "
"or GeoSeries.to_file() instead.",
FutureWarning,
stacklevel=2,
)
return _to_file(*args, **kwargs)
def _detect_driver(path):
"""
Attempt to auto-detect driver based on the extension
"""
try:
# in case the path is a file handle
path = path.name
except AttributeError:
pass
try:
return _EXTENSION_TO_DRIVER[Path(path).suffix.lower()]
except KeyError:
# Assume it is a shapefile folder for now. In the future,
# will likely raise an exception when the expected
# folder writing behavior is more clearly defined.
return "ESRI Shapefile"
def _to_file(
df,
filename,
driver=None,
schema=None,
index=None,
mode="w",
crs=None,
engine=None,
**kwargs,
):
"""
Write this GeoDataFrame to an OGR data source
A dictionary of supported OGR providers is available via:
>>> import fiona
>>> fiona.supported_drivers # doctest: +SKIP
.. note::
GeoPandas currently defaults to use Fiona as the engine in ``to_file``.
However, GeoPandas 1.0 will switch to use pyogrio as the default engine, since
pyogrio can provide a significant speedup compared to Fiona. We recommend
installing pyogrio and specifying the engine by using the ``engine`` keyword
(``df.to_file(..., engine="pyogrio")``), or by setting the default for
the ``engine`` keyword globally with::
geopandas.options.io_engine = "pyogrio"
Parameters
----------
df : GeoDataFrame to be written
filename : string
File path or file handle to write to. The path may specify a
GDAL VSI scheme.
driver : string, default None
The OGR format driver used to write the vector file.
If not specified, it attempts to infer it from the file extension.
If no extension is specified, it saves ESRI Shapefile to a folder.
schema : dict, default None
If specified, the schema dictionary is passed to Fiona to
better control how the file is written. If None, GeoPandas
will determine the schema based on each column's dtype.
Not supported for the "pyogrio" engine.
index : bool, default None
If True, write index into one or more columns (for MultiIndex).
Default None writes the index into one or more columns only if
the index is named, is a MultiIndex, or has a non-integer data
type. If False, no index is written.
.. versionadded:: 0.7
Previously the index was not written.
mode : string, default 'w'
The write mode, 'w' to overwrite the existing file and 'a' to append;
when using the pyogrio engine, you can also pass ``append=True``.
Not all drivers support appending. For the fiona engine, the drivers
that support appending are listed in fiona.supported_drivers or
https://github.com/Toblerity/Fiona/blob/master/fiona/drvsupport.py.
For the pyogrio engine, you should be able to use any driver that
is available in your installation of GDAL that supports append
capability; see the specific driver entry at
https://gdal.org/drivers/vector/index.html for more information.
crs : pyproj.CRS, default None
If specified, the CRS is passed to Fiona to
better control how the file is written. If None, GeoPandas
will determine the crs based on crs df attribute.
The value can be anything accepted
by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
such as an authority string (eg "EPSG:4326") or a WKT string.
engine : str, "fiona" or "pyogrio"
The underlying library that is used to write the file. Currently, the
supported options are "fiona" and "pyogrio". Defaults to "fiona" if
installed, otherwise tries "pyogrio".
**kwargs :
Keyword args to be passed to the engine, and can be used to write
to multi-layer data, store data within archives (zip files), etc.
In case of the "fiona" engine, the keyword arguments are passed to
:func:`fiona.open`. For more information on possible keywords, type:
``import fiona; help(fiona.open)``. In case of the "pyogrio" engine,
the keyword arguments are passed to `pyogrio.write_dataframe`.
Notes
-----
The format drivers will attempt to detect the encoding of your data, but
may fail. In this case, the proper encoding can be specified explicitly
by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.
"""
engine = _check_engine(engine, "'to_file' method")
filename = _expand_user(filename)
if index is None:
# Determine if index attribute(s) should be saved to file
# (only if they are named or are non-integer)
index = list(df.index.names) != [None] or not is_integer_dtype(df.index.dtype)
if index:
df = df.reset_index(drop=False)
if driver is None:
driver = _detect_driver(filename)
if driver == "ESRI Shapefile" and any(len(c) > 10 for c in df.columns.tolist()):
warnings.warn(
"Column names longer than 10 characters will be truncated when saved to "
"ESRI Shapefile.",
stacklevel=3,
)
if (df.dtypes == "geometry").sum() > 1:
raise ValueError(
"GeoDataFrame contains multiple geometry columns but GeoDataFrame.to_file "
"supports only a single geometry column. Use a GeoDataFrame.to_parquet or "
"GeoDataFrame.to_feather, drop additional geometry columns or convert them "
"to a supported format like a well-known text (WKT) using "
"`GeoSeries.to_wkt()`.",
)
if mode not in ("w", "a"):
raise ValueError(f"'mode' should be one of 'w' or 'a', got '{mode}' instead")
if engine == "fiona":
_to_file_fiona(df, filename, driver, schema, crs, mode, **kwargs)
elif engine == "pyogrio":
_to_file_pyogrio(df, filename, driver, schema, crs, mode, **kwargs)
else:
raise ValueError(f"unknown engine '{engine}'")
def _to_file_fiona(df, filename, driver, schema, crs, mode, **kwargs):
if schema is None:
schema = infer_schema(df)
if crs:
crs = pyproj.CRS.from_user_input(crs)
else:
crs = df.crs
with fiona_env():
crs_wkt = None
try:
gdal_version = fiona.env.get_gdal_release_name()
except AttributeError:
gdal_version = "2.0.0" # just assume it is not the latest
if Version(gdal_version) >= Version("3.0.0") and crs:
crs_wkt = crs.to_wkt()
elif crs:
crs_wkt = crs.to_wkt("WKT1_GDAL")
with fiona.open(
filename, mode=mode, driver=driver, crs_wkt=crs_wkt, schema=schema, **kwargs
) as colxn:
colxn.writerecords(df.iterfeatures())
def _to_file_pyogrio(df, filename, driver, schema, crs, mode, **kwargs):
import pyogrio
if schema is not None:
raise ValueError(
"The 'schema' argument is not supported with the 'pyogrio' engine."
)
if mode == "a":
kwargs["append"] = True
if crs is not None:
raise ValueError("Passing 'crs' it not supported with the 'pyogrio' engine.")
# for the fiona engine, this check is done in gdf.iterfeatures()
if not df.columns.is_unique:
raise ValueError("GeoDataFrame cannot contain duplicated column names.")
pyogrio.write_dataframe(df, filename, driver=driver, **kwargs)
def infer_schema(df):
from collections import OrderedDict
# TODO: test pandas string type and boolean type once released
types = {
"Int32": "int32",
"int32": "int32",
"Int64": "int",
"string": "str",
"boolean": "bool",
}
def convert_type(column, in_type):
if in_type == object:
return "str"
if in_type.name.startswith("datetime64"):
# numpy datetime type regardless of frequency
return "datetime"
if str(in_type) in types:
out_type = types[str(in_type)]
else:
out_type = type(np.zeros(1, in_type).item()).__name__
if out_type == "long":
out_type = "int"
return out_type
properties = OrderedDict(
[
(col, convert_type(col, _type))
for col, _type in zip(df.columns, df.dtypes)
if col != df._geometry_column_name
]
)
if df.empty:
warnings.warn(
"You are attempting to write an empty DataFrame to file. "
"For some drivers, this operation may fail.",
UserWarning,
stacklevel=3,
)
# Since https://github.com/Toblerity/Fiona/issues/446 resolution,
# Fiona allows a list of geometry types
geom_types = _geometry_types(df)
schema = {"geometry": geom_types, "properties": properties}
return schema
def _geometry_types(df):
"""
Determine the geometry types in the GeoDataFrame for the schema.
"""
geom_types_2D = df[~df.geometry.has_z].geometry.geom_type.unique()
geom_types_2D = [gtype for gtype in geom_types_2D if gtype is not None]
geom_types_3D = df[df.geometry.has_z].geometry.geom_type.unique()
geom_types_3D = ["3D " + gtype for gtype in geom_types_3D if gtype is not None]
geom_types = geom_types_3D + geom_types_2D
if len(geom_types) == 0:
# Default geometry type supported by Fiona
# (Since https://github.com/Toblerity/Fiona/issues/446 resolution)
return "Unknown"
if len(geom_types) == 1:
geom_types = geom_types[0]
return geom_types
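A short usage sketch of the reading and writing paths above; the file paths, layer name, and bbox are illustrative:
import geopandas

gdf = geopandas.read_file("nybb.shp", engine="pyogrio", bbox=(0, 0, 10, 20))
gdf.to_file("nybb.gpkg", layer="boroughs", driver="GPKG", engine="pyogrio")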

View File

@@ -0,0 +1,471 @@
import warnings
from contextlib import contextmanager
import pandas as pd
import shapely
import shapely.wkb
from geopandas import GeoDataFrame
from geopandas import _compat as compat
@contextmanager
def _get_conn(conn_or_engine):
"""
Yield a connection within a transaction context.
Engine.begin() returns a Connection with an implicit Transaction while
Connection.begin() returns the Transaction. This helper will always return a
Connection with an implicit (possibly nested) Transaction.
Parameters
----------
conn_or_engine : Connection or Engine
A sqlalchemy Connection or Engine instance
Returns
-------
Connection
"""
from sqlalchemy.engine.base import Engine, Connection
if isinstance(conn_or_engine, Connection):
if not conn_or_engine.in_transaction():
with conn_or_engine.begin():
yield conn_or_engine
else:
yield conn_or_engine
elif isinstance(conn_or_engine, Engine):
with conn_or_engine.begin() as conn:
yield conn
else:
raise ValueError(f"Unknown Connectable: {conn_or_engine}")
def _df_to_geodf(df, geom_col="geom", crs=None):
"""
Transforms a pandas DataFrame into a GeoDataFrame.
The column 'geom_col' must be a geometry column in WKB representation.
To be used to convert df based on pd.read_sql to gdf.
Parameters
----------
df : DataFrame
pandas DataFrame with geometry column in WKB representation.
geom_col : string, default 'geom'
column name to convert to shapely geometries
crs : pyproj.CRS, optional
CRS to use for the returned GeoDataFrame. The value can be anything accepted
by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
such as an authority string (eg "EPSG:4326") or a WKT string.
If not set, tries to determine CRS from the SRID associated with the
first geometry in the database, and assigns that to all geometries.
Returns
-------
GeoDataFrame
"""
if geom_col not in df:
raise ValueError("Query missing geometry column '{}'".format(geom_col))
if df.columns.to_list().count(geom_col) > 1:
raise ValueError(
f"Duplicate geometry column '{geom_col}' detected in SQL query output. Only"
"one geometry column is allowed."
)
geoms = df[geom_col].dropna()
if not geoms.empty:
load_geom_bytes = shapely.wkb.loads
"""Load from Python 3 binary."""
def load_geom_buffer(x):
"""Load from Python 2 binary."""
return shapely.wkb.loads(str(x))
def load_geom_text(x):
"""Load from binary encoded as text."""
return shapely.wkb.loads(str(x), hex=True)
if isinstance(geoms.iat[0], bytes):
load_geom = load_geom_bytes
else:
load_geom = load_geom_text
df[geom_col] = geoms = geoms.apply(load_geom)
if crs is None:
if compat.SHAPELY_GE_20:
srid = shapely.get_srid(geoms.iat[0])
else:
srid = shapely.geos.lgeos.GEOSGetSRID(geoms.iat[0]._geom)
# if no defined SRID in geodatabase, returns SRID of 0
if srid != 0:
crs = "epsg:{}".format(srid)
return GeoDataFrame(df, crs=crs, geometry=geom_col)
def _read_postgis(
sql,
con,
geom_col="geom",
crs=None,
index_col=None,
coerce_float=True,
parse_dates=None,
params=None,
chunksize=None,
):
"""
Returns a GeoDataFrame corresponding to the result of the query
string, which must contain a geometry column in WKB representation.
It is also possible to use :func:`geopandas.read_file` to read from a database.
Especially for file geodatabases like GeoPackage or SpatiaLite this can be easier.
Parameters
----------
sql : string
SQL query to execute in selecting entries from database, or name
of the table to read from the database.
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
Active connection to the database to query.
geom_col : string, default 'geom'
column name to convert to shapely geometries
crs : dict or str, optional
CRS to use for the returned GeoDataFrame; if not set, tries to
determine CRS from the SRID associated with the first geometry in
the database, and assigns that to all geometries.
chunksize : int, default None
If specified, return an iterator where chunksize is the number of rows to
include in each chunk.
See the documentation for pandas.read_sql for further explanation
of the following parameters:
index_col, coerce_float, parse_dates, params, chunksize
Returns
-------
GeoDataFrame
Examples
--------
PostGIS
>>> from sqlalchemy import create_engine # doctest: +SKIP
>>> db_connection_url = "postgresql://myusername:mypassword@myhost:5432/mydatabase"
>>> con = create_engine(db_connection_url) # doctest: +SKIP
>>> sql = "SELECT geom, highway FROM roads"
>>> df = geopandas.read_postgis(sql, con) # doctest: +SKIP
SpatiaLite
>>> sql = "SELECT ST_AsBinary(geom) AS geom, highway FROM roads"
>>> df = geopandas.read_postgis(sql, con) # doctest: +SKIP
"""
if chunksize is None:
# read all in one chunk and return a single GeoDataFrame
df = pd.read_sql(
sql,
con,
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
params=params,
chunksize=chunksize,
)
return _df_to_geodf(df, geom_col=geom_col, crs=crs)
else:
# read data in chunks and return a generator
df_generator = pd.read_sql(
sql,
con,
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
params=params,
chunksize=chunksize,
)
return (_df_to_geodf(df, geom_col=geom_col, crs=crs) for df in df_generator)
def read_postgis(*args, **kwargs):
import warnings
warnings.warn(
"geopandas.io.sql.read_postgis() is intended for internal "
"use only, and will be deprecated. Use geopandas.read_postgis() instead.",
FutureWarning,
stacklevel=2,
)
return _read_postgis(*args, **kwargs)
def _get_geometry_type(gdf):
"""
Get basic geometry type of a GeoDataFrame. See more info from:
https://geoalchemy-2.readthedocs.io/en/latest/types.html#geoalchemy2.types._GISType
Following rules apply:
- if geometries all share the same geometry-type,
geometries are inserted with the given GeometryType with following types:
- Point, LineString, Polygon, MultiPoint, MultiLineString, MultiPolygon,
GeometryCollection.
- LinearRing geometries will be converted into LineString objects.
- in all other cases, geometries will be inserted with type GEOMETRY:
- a mix of Polygons and MultiPolygons in GeoSeries
- a mix of Points and LineStrings in GeoSeries
- geometry is of type GeometryCollection,
such as GeometryCollection([Point, LineStrings])
- if any of the geometries has Z-coordinate, all records will
be written with 3D.
"""
geom_types = list(gdf.geometry.geom_type.unique())
has_curve = False
for gt in geom_types:
if gt is None:
continue
elif "LinearRing" in gt:
has_curve = True
if len(geom_types) == 1:
if has_curve:
target_geom_type = "LINESTRING"
else:
if geom_types[0] is None:
raise ValueError("No valid geometries in the data.")
else:
target_geom_type = geom_types[0].upper()
else:
target_geom_type = "GEOMETRY"
# Check for 3D-coordinates
if any(gdf.geometry.has_z):
target_geom_type += "Z"
return target_geom_type, has_curve
def _get_srid_from_crs(gdf):
"""
Get EPSG code from CRS if available. If not, return -1.
"""
# Use geoalchemy2 default for srid
# Note: undefined srid in PostGIS is 0
srid = None
warning_msg = (
"Could not parse CRS from the GeoDataFrame. "
"Inserting data without defined CRS."
)
if gdf.crs is not None:
try:
for confidence in (100, 70, 25):
srid = gdf.crs.to_epsg(min_confidence=confidence)
if srid is not None:
break
auth_srid = gdf.crs.to_authority(
auth_name="ESRI", min_confidence=confidence
)
if auth_srid is not None:
srid = int(auth_srid[1])
break
except Exception:
warnings.warn(warning_msg, UserWarning, stacklevel=2)
if srid is None:
srid = -1
warnings.warn(warning_msg, UserWarning, stacklevel=2)
return srid
def _convert_linearring_to_linestring(gdf, geom_name):
from shapely.geometry import LineString
# Todo: Use Pygeos function once it's implemented:
# https://github.com/pygeos/pygeos/issues/76
mask = gdf.geom_type == "LinearRing"
gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(
lambda geom: LineString(geom)
)
return gdf
def _convert_to_ewkb(gdf, geom_name, srid):
"""Convert geometries to ewkb."""
if compat.USE_SHAPELY_20:
geoms = shapely.to_wkb(
shapely.set_srid(gdf[geom_name].values._data, srid=srid),
hex=True,
include_srid=True,
)
elif compat.USE_PYGEOS:
from pygeos import set_srid, to_wkb
geoms = to_wkb(
set_srid(gdf[geom_name].values._data, srid=srid),
hex=True,
include_srid=True,
)
else:
from shapely.wkb import dumps
geoms = [dumps(geom, srid=srid, hex=True) for geom in gdf[geom_name]]
# The gdf will warn that the geometry column doesn't hold in-memory geometries
# now that they are EWKB, so convert back to a regular dataframe to avoid warning
# the user that the dtypes are unexpected.
df = pd.DataFrame(gdf, copy=False)
df[geom_name] = geoms
return df
def _psql_insert_copy(tbl, conn, keys, data_iter):
import io
import csv
s_buf = io.StringIO()
writer = csv.writer(s_buf)
writer.writerows(data_iter)
s_buf.seek(0)
columns = ", ".join('"{}"'.format(k) for k in keys)
dbapi_conn = conn.connection
with dbapi_conn.cursor() as cur:
sql = 'COPY "{}"."{}" ({}) FROM STDIN WITH CSV'.format(
tbl.table.schema, tbl.table.name, columns
)
cur.copy_expert(sql=sql, file=s_buf)
def _write_postgis(
gdf,
name,
con,
schema=None,
if_exists="fail",
index=False,
index_label=None,
chunksize=None,
dtype=None,
):
"""
Upload GeoDataFrame into PostGIS database.
This method requires SQLAlchemy and GeoAlchemy2, and a PostgreSQL
Python driver (e.g. psycopg2) to be installed.
Parameters
----------
name : str
Name of the target table.
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
Active connection to the PostGIS database.
if_exists : {'fail', 'replace', 'append'}, default 'fail'
How to behave if the table already exists:
- fail: Raise a ValueError.
- replace: Drop the table before inserting new values.
- append: Insert new values to the existing table.
schema : string, optional
Specify the schema. If None, use default schema: 'public'.
index : bool, default False
Write DataFrame index as a column.
Uses *index_label* as the column name in the table.
index_label : string or sequence, default None
Column label for index column(s).
If None is given (default) and index is True,
then the index names are used.
chunksize : int, optional
Rows will be written in batches of this size at a time.
By default, all rows will be written at once.
dtype : dict of column name to SQL type, default None
Specifying the datatype for columns.
The keys should be the column names and the values
should be the SQLAlchemy types.
Examples
--------
>>> from sqlalchemy import create_engine # doctest: +SKIP
>>> engine = create_engine("postgresql://myusername:mypassword@myhost:5432\
/mydatabase";) # doctest: +SKIP
>>> gdf.to_postgis("my_table", engine) # doctest: +SKIP
"""
try:
from geoalchemy2 import Geometry
from sqlalchemy import text
except ImportError:
raise ImportError("'to_postgis()' requires geoalchemy2 package.")
gdf = gdf.copy()
geom_name = gdf.geometry.name
# Get srid
srid = _get_srid_from_crs(gdf)
# Get geometry type and info whether data contains LinearRing.
geometry_type, has_curve = _get_geometry_type(gdf)
# Build dtype with Geometry
if dtype is not None:
dtype[geom_name] = Geometry(geometry_type=geometry_type, srid=srid)
else:
dtype = {geom_name: Geometry(geometry_type=geometry_type, srid=srid)}
# Convert LinearRing geometries to LineString
if has_curve:
gdf = _convert_linearring_to_linestring(gdf, geom_name)
# Convert geometries to EWKB
gdf = _convert_to_ewkb(gdf, geom_name, srid)
if schema is not None:
schema_name = schema
else:
schema_name = "public"
if if_exists == "append":
# Check that the geometry srid matches with the current GeoDataFrame
with _get_conn(con) as connection:
# Only check SRID if table exists
if connection.dialect.has_table(connection, name, schema):
target_srid = connection.execute(
text(
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
schema=schema_name, table=name, geom_col=geom_name
)
)
).fetchone()[0]
if target_srid != srid:
msg = (
"The CRS of the target table (EPSG:{epsg_t}) differs from the "
"CRS of current GeoDataFrame (EPSG:{epsg_src}).".format(
epsg_t=target_srid, epsg_src=srid
)
)
raise ValueError(msg)
with _get_conn(con) as connection:
gdf.to_sql(
name,
connection,
schema=schema_name,
if_exists=if_exists,
index=index,
index_label=index_label,
chunksize=chunksize,
dtype=dtype,
method=_psql_insert_copy,
)
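A minimal sketch of the corresponding public API (connection URL and table name are illustrative); to_postgis dispatches to _write_postgis and read_postgis to _read_postgis:
from shapely.geometry import Point
from sqlalchemy import create_engine
import geopandas

engine = create_engine("postgresql://user:password@localhost:5432/mydb")
gdf = geopandas.GeoDataFrame(
    {"name": ["a"], "geometry": [Point(1, 2)]}, crs="EPSG:4326"
)
gdf.to_postgis("points", engine, if_exists="replace")
back = geopandas.read_postgis("SELECT * FROM points", engine, geom_col="geometry")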

View File

@@ -0,0 +1,98 @@
"""
Script to create the data and write legacy storage (pickle) files.
Based on pandas' generate_legacy_storage_files.py script.
To use this script, create an environment for which you want to
generate pickles, activate the environment, and run this script as:
$ python geopandas/geopandas/io/tests/generate_legacy_storage_files.py \
geopandas/geopandas/io/tests/data/pickle/ pickle
This script generates a storage file for the current arch, system, and python version.
The idea here is you are using the *current* version of the
generate_legacy_storage_files with an *older* version of geopandas to
generate a pickle file. We will then check this file into a current
branch, and test using test_pickle.py. This will load the *older*
pickles and test versus the current data that is generated
(with master). These are then compared.
"""
import os
import pickle
import platform
import sys
import pandas as pd
import geopandas
from shapely.geometry import Point
def create_pickle_data():
"""create the pickle data"""
# custom geometry column name
gdf_the_geom = geopandas.GeoDataFrame(
{"a": [1, 2, 3], "the_geom": [Point(1, 1), Point(2, 2), Point(3, 3)]},
geometry="the_geom",
)
# with crs
gdf_crs = geopandas.GeoDataFrame(
{"a": [0.1, 0.2, 0.3], "geometry": [Point(1, 1), Point(2, 2), Point(3, 3)]},
crs="EPSG:4326",
)
return {"gdf_the_geom": gdf_the_geom, "gdf_crs": gdf_crs}
def platform_name():
return "_".join(
[
str(geopandas.__version__),
"pd-" + str(pd.__version__),
"py-" + str(platform.python_version()),
str(platform.machine()),
str(platform.system().lower()),
]
)
def write_legacy_pickles(output_dir):
print(
"This script generates a storage file for the current arch, system, "
"and python version"
)
print("geopandas version: {}").format(geopandas.__version__)
print(" output dir : {}".format(output_dir))
print(" storage format: pickle")
pth = "{}.pickle".format(platform_name())
fh = open(os.path.join(output_dir, pth), "wb")
pickle.dump(create_pickle_data(), fh, pickle.DEFAULT_PROTOCOL)
fh.close()
print("created pickle file: {}".format(pth))
def main():
if len(sys.argv) != 3:
sys.exit(
"Specify output directory and storage type: generate_legacy_"
"storage_files.py <output_dir> <storage_type> "
)
output_dir = str(sys.argv[1])
storage_type = str(sys.argv[2])
if storage_type == "pickle":
write_legacy_pickles(output_dir=output_dir)
else:
sys.exit("storage_type must be one of {'pickle'}")
if __name__ == "__main__":
main()
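A rough sketch of how the generated file is consumed later (the filename below is illustrative; test_pickle.py globs the data/pickle directory and compares each stored frame against freshly generated data):

import pandas as pd
from geopandas.testing import assert_geodataframe_equal

legacy = pd.read_pickle("data/pickle/0.10.2_pd-1.3.5_py-3.9.7_x86_64_linux.pickle")
for name, value in legacy.items():
    assert_geodataframe_equal(value, create_pickle_data()[name])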

View File

@@ -0,0 +1,914 @@
from __future__ import absolute_import
from itertools import product
import json
from packaging.version import Version
import os
import pathlib
import pytest
from pandas import DataFrame, read_parquet as pd_read_parquet
from pandas.testing import assert_frame_equal
import numpy as np
import pyproj
from shapely.geometry import box, Point, MultiPolygon
import geopandas
import geopandas._compat as compat
from geopandas import GeoDataFrame, read_file, read_parquet, read_feather
from geopandas.array import to_wkb
from geopandas.datasets import get_path
from geopandas.io.arrow import (
SUPPORTED_VERSIONS,
_create_metadata,
_decode_metadata,
_encode_metadata,
_geopandas_to_arrow,
_get_filesystem_path,
_remove_id_from_member_of_ensembles,
_validate_dataframe,
_validate_metadata,
METADATA_VERSION,
)
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
from geopandas.tests.util import mock
DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
# Skip all tests in this module if pyarrow is not available
pyarrow = pytest.importorskip("pyarrow")
@pytest.fixture(
params=[
"parquet",
pytest.param(
"feather",
marks=pytest.mark.skipif(
Version(pyarrow.__version__) < Version("0.17.0"),
reason="needs pyarrow >= 0.17",
),
),
]
)
def file_format(request):
if request.param == "parquet":
return read_parquet, GeoDataFrame.to_parquet
elif request.param == "feather":
return read_feather, GeoDataFrame.to_feather
def test_create_metadata():
test_dataset = "naturalearth_lowres"
df = read_file(get_path(test_dataset))
metadata = _create_metadata(df)
assert isinstance(metadata, dict)
assert metadata["version"] == METADATA_VERSION
assert metadata["primary_column"] == "geometry"
assert "geometry" in metadata["columns"]
crs_expected = df.crs.to_json_dict()
_remove_id_from_member_of_ensembles(crs_expected)
assert metadata["columns"]["geometry"]["crs"] == crs_expected
assert metadata["columns"]["geometry"]["encoding"] == "WKB"
assert metadata["columns"]["geometry"]["geometry_types"] == [
"MultiPolygon",
"Polygon",
]
assert np.array_equal(
metadata["columns"]["geometry"]["bbox"], df.geometry.total_bounds
)
assert metadata["creator"]["library"] == "geopandas"
assert metadata["creator"]["version"] == geopandas.__version__
def test_crs_metadata_datum_ensemble():
# compatibility for older PROJ versions using PROJJSON with datum ensembles
# https://github.com/geopandas/geopandas/pull/2453
crs = pyproj.CRS("EPSG:4326")
crs_json = crs.to_json_dict()
check_ensemble = False
if "datum_ensemble" in crs_json:
# older version of PROJ don't yet have datum ensembles
check_ensemble = True
assert "id" in crs_json["datum_ensemble"]["members"][0]
_remove_id_from_member_of_ensembles(crs_json)
if check_ensemble:
assert "id" not in crs_json["datum_ensemble"]["members"][0]
# ensure roundtrip still results in an equivalent CRS
assert pyproj.CRS(crs_json) == crs
def test_write_metadata_invalid_spec_version():
gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="EPSG:4326")
with pytest.raises(ValueError, match="schema_version must be one of"):
_create_metadata(gdf, schema_version="invalid")
def test_encode_metadata():
metadata = {"a": "b"}
expected = b'{"a": "b"}'
assert _encode_metadata(metadata) == expected
def test_decode_metadata():
metadata_str = b'{"a": "b"}'
expected = {"a": "b"}
assert _decode_metadata(metadata_str) == expected
assert _decode_metadata(None) is None
def test_validate_dataframe():
test_dataset = "naturalearth_lowres"
df = read_file(get_path(test_dataset))
# valid: should not raise ValueError
_validate_dataframe(df)
_validate_dataframe(df.set_index("iso_a3"))
# add column with non-string type
df[0] = 1
# invalid: should raise ValueError
with pytest.raises(ValueError):
_validate_dataframe(df)
with pytest.raises(ValueError):
_validate_dataframe(df.set_index(0))
# not a DataFrame: should raise ValueError
with pytest.raises(ValueError):
_validate_dataframe("not a dataframe")
def test_validate_metadata_valid():
_validate_metadata(
{
"primary_column": "geometry",
"columns": {"geometry": {"crs": None, "encoding": "WKB"}},
"schema_version": "0.1.0",
}
)
_validate_metadata(
{
"primary_column": "geometry",
"columns": {"geometry": {"crs": None, "encoding": "WKB"}},
"version": "<version>",
}
)
_validate_metadata(
{
"primary_column": "geometry",
"columns": {
"geometry": {
"crs": {
# truncated PROJJSON for testing, as PROJJSON contents
# not validated here
"id": {"authority": "EPSG", "code": 4326},
},
"encoding": "WKB",
}
},
"version": "0.4.0",
}
)
@pytest.mark.parametrize(
"metadata,error",
[
(None, "Missing or malformed geo metadata in Parquet/Feather file"),
({}, "Missing or malformed geo metadata in Parquet/Feather file"),
# missing "version" key:
(
{"primary_column": "foo", "columns": None},
"'geo' metadata in Parquet/Feather file is missing required key",
),
# missing "columns" key:
(
{"primary_column": "foo", "version": "<version>"},
"'geo' metadata in Parquet/Feather file is missing required key:",
),
# missing "primary_column"
(
{"columns": [], "version": "<version>"},
"'geo' metadata in Parquet/Feather file is missing required key:",
),
(
{"primary_column": "foo", "columns": [], "version": "<version>"},
"'columns' in 'geo' metadata must be a dict",
),
# missing "encoding" for column
(
{"primary_column": "foo", "columns": {"foo": {}}, "version": "<version>"},
(
"'geo' metadata in Parquet/Feather file is missing required key "
"'encoding' for column 'foo'"
),
),
# invalid column encoding
(
{
"primary_column": "foo",
"columns": {"foo": {"crs": None, "encoding": None}},
"version": "<version>",
},
"Only WKB geometry encoding is supported",
),
(
{
"primary_column": "foo",
"columns": {"foo": {"crs": None, "encoding": "BKW"}},
"version": "<version>",
},
"Only WKB geometry encoding is supported",
),
],
)
def test_validate_metadata_invalid(metadata, error):
with pytest.raises(ValueError, match=error):
_validate_metadata(metadata)
def test_validate_metadata_edges():
metadata = {
"primary_column": "geometry",
"columns": {"geometry": {"crs": None, "encoding": "WKB", "edges": "spherical"}},
"version": "1.0.0-beta.1",
}
with pytest.warns(
UserWarning,
match="The geo metadata indicate that column 'geometry' has spherical edges",
):
_validate_metadata(metadata)
def test_to_parquet_fails_on_invalid_engine(tmpdir):
df = GeoDataFrame(data=[[1, 2, 3]], columns=["a", "b", "a"], geometry=[Point(1, 1)])
with pytest.raises(
ValueError,
match=(
"GeoPandas only supports using pyarrow as the engine for "
"to_parquet: 'fastparquet' passed instead."
),
):
df.to_parquet(tmpdir / "test.parquet", engine="fastparquet")
@mock.patch("geopandas.io.arrow._to_parquet")
def test_to_parquet_does_not_pass_engine_along(mock_to_parquet):
df = GeoDataFrame(data=[[1, 2, 3]], columns=["a", "b", "a"], geometry=[Point(1, 1)])
df.to_parquet("", engine="pyarrow")
# assert that engine keyword is not passed through to _to_parquet (and thus
# parquet.write_table)
mock_to_parquet.assert_called_with(
df, "", compression="snappy", index=None, schema_version=None
)
# TEMPORARY: used to determine if pyarrow fails for roundtripping pandas data
# without geometries
def test_pandas_parquet_roundtrip1(tmpdir):
df = DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
filename = os.path.join(str(tmpdir), "test.pq")
df.to_parquet(filename)
pq_df = pd_read_parquet(filename)
assert_frame_equal(df, pq_df)
@pytest.mark.parametrize(
"test_dataset", ["naturalearth_lowres", "naturalearth_cities", "nybb"]
)
def test_pandas_parquet_roundtrip2(test_dataset, tmpdir):
test_dataset = "naturalearth_lowres"
df = DataFrame(read_file(get_path(test_dataset)).drop(columns=["geometry"]))
filename = os.path.join(str(tmpdir), "test.pq")
df.to_parquet(filename)
pq_df = pd_read_parquet(filename)
assert_frame_equal(df, pq_df)
@pytest.mark.parametrize(
"test_dataset", ["naturalearth_lowres", "naturalearth_cities", "nybb"]
)
def test_roundtrip(tmpdir, file_format, test_dataset):
"""Writing to parquet should not raise errors, and should not alter original
GeoDataFrame
"""
reader, writer = file_format
df = read_file(get_path(test_dataset))
orig = df.copy()
filename = os.path.join(str(tmpdir), "test.pq")
writer(df, filename)
assert os.path.exists(filename)
# make sure that the original data frame is unaltered
assert_geodataframe_equal(df, orig)
# make sure that we can roundtrip the data frame
pq_df = reader(filename)
assert isinstance(pq_df, GeoDataFrame)
assert_geodataframe_equal(df, pq_df)
def test_index(tmpdir, file_format):
"""Setting index=`True` should preserve index in output, and
setting index=`False` should drop index from output.
"""
reader, writer = file_format
test_dataset = "naturalearth_lowres"
df = read_file(get_path(test_dataset)).set_index("iso_a3")
filename = os.path.join(str(tmpdir), "test_with_index.pq")
writer(df, filename, index=True)
pq_df = reader(filename)
assert_geodataframe_equal(df, pq_df)
filename = os.path.join(str(tmpdir), "drop_index.pq")
writer(df, filename, index=False)
pq_df = reader(filename)
assert_geodataframe_equal(df.reset_index(drop=True), pq_df)
@pytest.mark.parametrize("compression", ["snappy", "gzip", "brotli", None])
def test_parquet_compression(compression, tmpdir):
"""Using compression options should not raise errors, and should
return identical GeoDataFrame.
"""
test_dataset = "naturalearth_lowres"
df = read_file(get_path(test_dataset))
filename = os.path.join(str(tmpdir), "test.pq")
df.to_parquet(filename, compression=compression)
pq_df = read_parquet(filename)
assert isinstance(pq_df, GeoDataFrame)
assert_geodataframe_equal(df, pq_df)
@pytest.mark.skipif(
Version(pyarrow.__version__) < Version("0.17.0"),
reason="Feather only supported for pyarrow >= 0.17",
)
@pytest.mark.parametrize("compression", ["uncompressed", "lz4", "zstd"])
def test_feather_compression(compression, tmpdir):
"""Using compression options should not raise errors, and should
return identical GeoDataFrame.
"""
test_dataset = "naturalearth_lowres"
df = read_file(get_path(test_dataset))
filename = os.path.join(str(tmpdir), "test.feather")
df.to_feather(filename, compression=compression)
pq_df = read_feather(filename)
assert isinstance(pq_df, GeoDataFrame)
assert_geodataframe_equal(df, pq_df)
def test_parquet_multiple_geom_cols(tmpdir, file_format):
"""If multiple geometry columns are present when written to parquet,
they should all be returned as such when read from parquet.
"""
reader, writer = file_format
test_dataset = "naturalearth_lowres"
df = read_file(get_path(test_dataset))
df["geom2"] = df.geometry.copy()
filename = os.path.join(str(tmpdir), "test.pq")
writer(df, filename)
assert os.path.exists(filename)
pq_df = reader(filename)
assert isinstance(pq_df, GeoDataFrame)
assert_geodataframe_equal(df, pq_df)
assert_geoseries_equal(df.geom2, pq_df.geom2, check_geom_type=True)
def test_parquet_missing_metadata(tmpdir):
"""Missing geo metadata, such as from a parquet file created
from a pandas DataFrame, will raise a ValueError.
"""
test_dataset = "naturalearth_lowres"
df = read_file(get_path(test_dataset))
# convert to DataFrame
df = DataFrame(df)
# convert the geometry column so we can extract later
df["geometry"] = to_wkb(df["geometry"].values)
filename = os.path.join(str(tmpdir), "test.pq")
# use pandas to_parquet (no geo metadata)
df.to_parquet(filename)
# missing metadata will raise ValueError
with pytest.raises(
ValueError, match="Missing geo metadata in Parquet/Feather file."
):
read_parquet(filename)
def test_parquet_missing_metadata2(tmpdir):
"""Missing geo metadata, such as from a parquet file created
from a pyarrow Table (which will also not contain pandas metadata),
will raise a ValueError.
"""
import pyarrow.parquet as pq
table = pyarrow.table({"a": [1, 2, 3]})
filename = os.path.join(str(tmpdir), "test.pq")
# use pyarrow.parquet write_table (no geo metadata, but also no pandas metadata)
pq.write_table(table, filename)
# missing metadata will raise ValueError
with pytest.raises(
ValueError, match="Missing geo metadata in Parquet/Feather file."
):
read_parquet(filename)
@pytest.mark.parametrize(
"geo_meta,error",
[
({"geo": b""}, "Missing or malformed geo metadata in Parquet/Feather file"),
(
{"geo": _encode_metadata({})},
"Missing or malformed geo metadata in Parquet/Feather file",
),
(
{"geo": _encode_metadata({"foo": "bar"})},
"'geo' metadata in Parquet/Feather file is missing required key",
),
],
)
def test_parquet_invalid_metadata(tmpdir, geo_meta, error):
"""Has geo metadata with missing required fields will raise a ValueError.
This requires writing the parquet file directly below, so that we can
control the metadata that is written for this test.
"""
from pyarrow import parquet, Table
test_dataset = "naturalearth_lowres"
df = read_file(get_path(test_dataset))
# convert to DataFrame and encode geometry to WKB
df = DataFrame(df)
df["geometry"] = to_wkb(df["geometry"].values)
table = Table.from_pandas(df)
metadata = table.schema.metadata
metadata.update(geo_meta)
table = table.replace_schema_metadata(metadata)
filename = os.path.join(str(tmpdir), "test.pq")
parquet.write_table(table, filename)
with pytest.raises(ValueError, match=error):
read_parquet(filename)
def test_subset_columns(tmpdir, file_format):
"""Reading a subset of columns should correctly decode selected geometry
columns.
"""
reader, writer = file_format
test_dataset = "naturalearth_lowres"
df = read_file(get_path(test_dataset))
filename = os.path.join(str(tmpdir), "test.pq")
writer(df, filename)
pq_df = reader(filename, columns=["name", "geometry"])
assert_geodataframe_equal(df[["name", "geometry"]], pq_df)
with pytest.raises(
ValueError, match="No geometry columns are included in the columns read"
):
reader(filename, columns=["name"])
def test_promote_secondary_geometry(tmpdir, file_format):
"""Reading a subset of columns that does not include the primary geometry
column should promote the first geometry column present.
"""
reader, writer = file_format
test_dataset = "naturalearth_lowres"
df = read_file(get_path(test_dataset))
df["geom2"] = df.geometry.copy()
filename = os.path.join(str(tmpdir), "test.pq")
writer(df, filename)
pq_df = reader(filename, columns=["name", "geom2"])
assert_geodataframe_equal(df.set_geometry("geom2")[["name", "geom2"]], pq_df)
df["geom3"] = df.geometry.copy()
writer(df, filename)
with pytest.warns(
UserWarning,
match="Multiple non-primary geometry columns read from Parquet/Feather file.",
):
pq_df = reader(filename, columns=["name", "geom2", "geom3"])
assert_geodataframe_equal(
df.set_geometry("geom2")[["name", "geom2", "geom3"]], pq_df
)
def test_columns_no_geometry(tmpdir, file_format):
"""Reading a parquet file that is missing all of the geometry columns
should raise a ValueError"""
reader, writer = file_format
test_dataset = "naturalearth_lowres"
df = read_file(get_path(test_dataset))
filename = os.path.join(str(tmpdir), "test.pq")
writer(df, filename)
with pytest.raises(ValueError):
reader(filename, columns=["name"])
def test_missing_crs(tmpdir, file_format):
"""If CRS is `None`, it should be properly handled
and remain `None` when read from parquet.
"""
reader, writer = file_format
test_dataset = "naturalearth_lowres"
df = read_file(get_path(test_dataset))
df.crs = None
filename = os.path.join(str(tmpdir), "test.pq")
writer(df, filename)
pq_df = reader(filename)
assert pq_df.crs is None
assert_geodataframe_equal(df, pq_df, check_crs=True)
def test_default_geo_col_writes(tmp_path):
# edge case geo col name None writes successfully
df = GeoDataFrame({"a": [1, 2]})
df.to_parquet(tmp_path / "test.pq")
# cannot be round tripped as gdf due to invalid geom col
pq_df = pd_read_parquet(tmp_path / "test.pq")
assert_frame_equal(df, pq_df)
@pytest.mark.skipif(
Version(pyarrow.__version__) >= Version("0.17.0"),
reason="Feather only supported for pyarrow >= 0.17",
)
def test_feather_arrow_version(tmpdir):
df = read_file(get_path("naturalearth_lowres"))
filename = os.path.join(str(tmpdir), "test.feather")
with pytest.raises(
ImportError, match="pyarrow >= 0.17 required for Feather support"
):
df.to_feather(filename)
def test_fsspec_url():
fsspec = pytest.importorskip("fsspec")
import fsspec.implementations.memory
class MyMemoryFileSystem(fsspec.implementations.memory.MemoryFileSystem):
# Simple fsspec filesystem that adds a required keyword.
# Attempting to use this filesystem without the keyword will raise an exception.
def __init__(self, is_set, *args, **kwargs):
self.is_set = is_set
super().__init__(*args, **kwargs)
fsspec.register_implementation("memory", MyMemoryFileSystem, clobber=True)
memfs = MyMemoryFileSystem(is_set=True)
test_dataset = "naturalearth_lowres"
df = read_file(get_path(test_dataset))
with memfs.open("data.parquet", "wb") as f:
df.to_parquet(f)
result = read_parquet("memory://data.parquet", storage_options={"is_set": True})
assert_geodataframe_equal(result, df)
result = read_parquet("memory://data.parquet", filesystem=memfs)
assert_geodataframe_equal(result, df)
# reset fsspec registry
fsspec.register_implementation(
"memory", fsspec.implementations.memory.MemoryFileSystem, clobber=True
)
def test_non_fsspec_url_with_storage_options_raises():
with pytest.raises(ValueError, match="storage_options"):
test_dataset = "naturalearth_lowres"
read_parquet(get_path(test_dataset), storage_options={"foo": "bar"})
@pytest.mark.skipif(
Version(pyarrow.__version__) < Version("5.0.0"),
reason="pyarrow.fs requires pyarrow>=5.0.0",
)
def test_prefers_pyarrow_fs():
filesystem, _ = _get_filesystem_path("file:///data.parquet")
assert isinstance(filesystem, pyarrow.fs.LocalFileSystem)
def test_write_read_parquet_expand_user():
gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
test_file = "~/test_file.parquet"
gdf.to_parquet(test_file)
pq_df = geopandas.read_parquet(test_file)
assert_geodataframe_equal(gdf, pq_df, check_crs=True)
os.remove(os.path.expanduser(test_file))
def test_write_read_feather_expand_user():
gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
test_file = "~/test_file.feather"
gdf.to_feather(test_file)
f_df = geopandas.read_feather(test_file)
assert_geodataframe_equal(gdf, f_df, check_crs=True)
os.remove(os.path.expanduser(test_file))
@pytest.mark.parametrize("geometry", [[], [None]])
def test_write_empty_bbox(tmpdir, geometry):
# empty dataframe or all missing geometries -> avoid bbox with NaNs
gdf = geopandas.GeoDataFrame({"col": [1] * len(geometry)}, geometry=geometry)
gdf.to_parquet(tmpdir / "test.parquet")
from pyarrow.parquet import read_table
table = read_table(tmpdir / "test.parquet")
metadata = json.loads(table.schema.metadata[b"geo"])
assert "encoding" in metadata["columns"]["geometry"]
assert "bbox" not in metadata["columns"]["geometry"]
@pytest.mark.parametrize("format", ["feather", "parquet"])
def test_write_read_default_crs(tmpdir, format):
if format == "feather":
from pyarrow.feather import write_feather as write
else:
from pyarrow.parquet import write_table as write
filename = os.path.join(str(tmpdir), f"test.{format}")
gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)])
table = _geopandas_to_arrow(gdf)
# update the geo metadata to strip 'crs' entry
metadata = table.schema.metadata
geo_metadata = _decode_metadata(metadata[b"geo"])
del geo_metadata["columns"]["geometry"]["crs"]
metadata.update({b"geo": _encode_metadata(geo_metadata)})
table = table.replace_schema_metadata(metadata)
write(table, filename)
read = getattr(geopandas, f"read_{format}")
df = read(filename)
assert df.crs.equals(pyproj.CRS("OGC:CRS84"))
def test_write_iso_wkb(tmpdir):
gdf = geopandas.GeoDataFrame(
geometry=geopandas.GeoSeries.from_wkt(["POINT Z (1 2 3)"])
)
if compat.USE_SHAPELY_20:
gdf.to_parquet(tmpdir / "test.parquet")
else:
with pytest.warns(UserWarning, match="The GeoDataFrame contains 3D geometries"):
gdf.to_parquet(tmpdir / "test.parquet")
from pyarrow.parquet import read_table
table = read_table(tmpdir / "test.parquet")
wkb = table["geometry"][0].as_py().hex()
if compat.USE_SHAPELY_20:
# correct ISO flavor
assert wkb == "01e9030000000000000000f03f00000000000000400000000000000840"
else:
assert wkb == "0101000080000000000000f03f00000000000000400000000000000840"
@pytest.mark.parametrize(
"format,schema_version",
product(["feather", "parquet"], [None] + SUPPORTED_VERSIONS),
)
def test_write_spec_version(tmpdir, format, schema_version):
if format == "feather":
from pyarrow.feather import read_table
else:
from pyarrow.parquet import read_table
filename = os.path.join(str(tmpdir), f"test.{format}")
gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="EPSG:4326")
write = getattr(gdf, f"to_{format}")
write(filename, schema_version=schema_version)
# ensure that we can roundtrip data regardless of version
read = getattr(geopandas, f"read_{format}")
df = read(filename)
assert_geodataframe_equal(df, gdf)
# verify the correct version is written in the metadata
schema_version = schema_version or METADATA_VERSION
table = read_table(filename)
metadata = json.loads(table.schema.metadata[b"geo"])
assert metadata["version"] == schema_version
# verify that CRS is correctly handled between versions
if schema_version == "0.1.0":
assert metadata["columns"]["geometry"]["crs"] == gdf.crs.to_wkt()
else:
crs_expected = gdf.crs.to_json_dict()
_remove_id_from_member_of_ensembles(crs_expected)
assert metadata["columns"]["geometry"]["crs"] == crs_expected
# verify that geometry_type(s) is correctly handled between versions
if Version(schema_version) <= Version("0.4.0"):
assert "geometry_type" in metadata["columns"]["geometry"]
assert metadata["columns"]["geometry"]["geometry_type"] == "Polygon"
else:
assert "geometry_types" in metadata["columns"]["geometry"]
assert metadata["columns"]["geometry"]["geometry_types"] == ["Polygon"]
@pytest.mark.parametrize(
"format,version", product(["feather", "parquet"], [None] + SUPPORTED_VERSIONS)
)
def test_write_deprecated_version_parameter(tmpdir, format, version):
if format == "feather":
from pyarrow.feather import read_table
version = version or 2
else:
from pyarrow.parquet import read_table
version = version or "2.6"
filename = os.path.join(str(tmpdir), f"test.{format}")
gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="EPSG:4326")
write = getattr(gdf, f"to_{format}")
if version in SUPPORTED_VERSIONS:
with pytest.warns(
FutureWarning,
match="the `version` parameter has been replaced with `schema_version`",
):
write(filename, version=version)
else:
# no warning raised if not one of the captured versions
write(filename, version=version)
table = read_table(filename)
metadata = json.loads(table.schema.metadata[b"geo"])
if version in SUPPORTED_VERSIONS:
# version is captured as a parameter
assert metadata["version"] == version
else:
# version is passed to underlying writer
assert metadata["version"] == METADATA_VERSION
@pytest.mark.parametrize("version", ["0.1.0", "0.4.0", "1.0.0-beta.1"])
def test_read_versioned_file(version):
"""
Verify that files written with different metadata spec versions can be read.
The test files were created for each supported version with:
# small dummy test dataset (not naturalearth_lowres, as this can change over time)
from shapely.geometry import box, MultiPolygon
df = geopandas.GeoDataFrame(
{"col_str": ["a", "b"], "col_int": [1, 2], "col_float": [0.1, 0.2]},
geometry=[MultiPolygon([box(0, 0, 1, 1), box(2, 2, 3, 3)]), box(4, 4, 5,5)],
crs="EPSG:4326",
)
df.to_feather(DATA_PATH / 'arrow' / f'test_data_v{METADATA_VERSION}.feather')
df.to_parquet(DATA_PATH / 'arrow' / f'test_data_v{METADATA_VERSION}.parquet')
"""
expected = geopandas.GeoDataFrame(
{"col_str": ["a", "b"], "col_int": [1, 2], "col_float": [0.1, 0.2]},
geometry=[MultiPolygon([box(0, 0, 1, 1), box(2, 2, 3, 3)]), box(4, 4, 5, 5)],
crs="EPSG:4326",
)
df = geopandas.read_feather(DATA_PATH / "arrow" / f"test_data_v{version}.feather")
assert_geodataframe_equal(df, expected, check_crs=True)
df = geopandas.read_parquet(DATA_PATH / "arrow" / f"test_data_v{version}.parquet")
assert_geodataframe_equal(df, expected, check_crs=True)
def test_read_gdal_files():
"""
Verify that files written by GDAL can be read by geopandas.
Since it is currently not yet straightforward to install GDAL with
Parquet/Arrow enabled in our conda setup, we are testing with some
generated files included in the repo (using GDAL 3.5.0):
# small dummy test dataset (not naturalearth_lowres, as this can change over time)
from shapely.geometry import box, MultiPolygon
df = geopandas.GeoDataFrame(
{"col_str": ["a", "b"], "col_int": [1, 2], "col_float": [0.1, 0.2]},
geometry=[MultiPolygon([box(0, 0, 1, 1), box(2, 2, 3, 3)]), box(4, 4, 5,5)],
crs="EPSG:4326",
)
df.to_file("test_data.gpkg", GEOMETRY_NAME="geometry")
and then the gpkg file is converted to Parquet/Arrow with:
$ ogr2ogr -f Parquet -lco FID= test_data_gdal350.parquet test_data.gpkg
$ ogr2ogr -f Arrow -lco FID= -lco GEOMETRY_ENCODING=WKB test_data_gdal350.arrow test_data.gpkg
""" # noqa: E501
expected = geopandas.GeoDataFrame(
{"col_str": ["a", "b"], "col_int": [1, 2], "col_float": [0.1, 0.2]},
geometry=[MultiPolygon([box(0, 0, 1, 1), box(2, 2, 3, 3)]), box(4, 4, 5, 5)],
crs="EPSG:4326",
)
df = geopandas.read_parquet(DATA_PATH / "arrow" / "test_data_gdal350.parquet")
assert_geodataframe_equal(df, expected, check_crs=True)
df = geopandas.read_feather(DATA_PATH / "arrow" / "test_data_gdal350.arrow")
assert_geodataframe_equal(df, expected, check_crs=True)
def test_parquet_read_partitioned_dataset(tmpdir):
# we don't yet explicitly support this (in writing), but for Parquet it
# works for reading (by relying on pyarrow.read_table)
df = read_file(get_path("naturalearth_lowres"))
# manually create partitioned dataset
basedir = tmpdir / "partitioned_dataset"
basedir.mkdir()
df[:100].to_parquet(basedir / "data1.parquet")
df[100:].to_parquet(basedir / "data2.parquet")
result = read_parquet(basedir)
assert_geodataframe_equal(result, df)
def test_parquet_read_partitioned_dataset_fsspec(tmpdir):
fsspec = pytest.importorskip("fsspec")
df = read_file(get_path("naturalearth_lowres"))
# manually create partitioned dataset
memfs = fsspec.filesystem("memory")
memfs.mkdir("partitioned_dataset")
with memfs.open("partitioned_dataset/data1.parquet", "wb") as f:
df[:100].to_parquet(f)
with memfs.open("partitioned_dataset/data2.parquet", "wb") as f:
df[100:].to_parquet(f)
result = read_parquet("memory://partitioned_dataset")
assert_geodataframe_equal(result, df)
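For orientation, the user-facing roundtrip these tests exercise, as a minimal sketch (the output path is a placeholder):

import geopandas
from geopandas import read_parquet

gdf = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
gdf.to_parquet("countries.parquet")  # geometries stored as WKB plus "geo" schema metadata
subset = read_parquet("countries.parquet", columns=["name", "geometry"])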

File diff suppressed because it is too large

View File

@@ -0,0 +1,307 @@
import os
from shapely.geometry import (
LineString,
MultiLineString,
MultiPoint,
MultiPolygon,
Point,
Polygon,
)
import geopandas
from geopandas import GeoDataFrame
from geopandas.testing import assert_geodataframe_equal
import pytest
from .test_file import FIONA_MARK, PYOGRIO_MARK
# Credit: Polygons below come from Montreal city Open Data portal
# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
city_hall_boundaries = Polygon(
(
(-73.5541107525234, 45.5091983609661),
(-73.5546126200639, 45.5086813829106),
(-73.5540185061397, 45.5084409343852),
(-73.5539986525799, 45.5084323044531),
(-73.5535801792994, 45.5089539203786),
(-73.5541107525234, 45.5091983609661),
)
)
vauquelin_place = Polygon(
(
(-73.5542465586147, 45.5081555487952),
(-73.5540185061397, 45.5084409343852),
(-73.5546126200639, 45.5086813829106),
(-73.5548825850032, 45.5084033554357),
(-73.5542465586147, 45.5081555487952),
)
)
city_hall_walls = [
LineString(
(
(-73.5541107525234, 45.5091983609661),
(-73.5546126200639, 45.5086813829106),
(-73.5540185061397, 45.5084409343852),
)
),
LineString(
(
(-73.5539986525799, 45.5084323044531),
(-73.5535801792994, 45.5089539203786),
(-73.5541107525234, 45.5091983609661),
)
),
]
city_hall_entrance = Point(-73.553785, 45.508722)
city_hall_balcony = Point(-73.554138, 45.509080)
city_hall_council_chamber = Point(-73.554246, 45.508931)
point_3D = Point(-73.553785, 45.508722, 300)
# *****************************************
# TEST TOOLING
class _ExpectedError:
def __init__(self, error_type, error_message_match):
self.type = error_type
self.match = error_message_match
class _ExpectedErrorBuilder:
def __init__(self, composite_key):
self.composite_key = composite_key
def to_raise(self, error_type, error_match):
_expected_exceptions[self.composite_key] = _ExpectedError(
error_type, error_match
)
def _expect_writing(gdf, ogr_driver):
return _ExpectedErrorBuilder(_composite_key(gdf, ogr_driver))
def _composite_key(gdf, ogr_driver):
return frozenset([id(gdf), ogr_driver])
def _expected_error_on(gdf, ogr_driver):
composite_key = _composite_key(gdf, ogr_driver)
return _expected_exceptions.get(composite_key, None)
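# Illustrative (hypothetical) use of the helpers above: a test case registers the
# error it expects for a given OGR driver, and test_to_file_roundtrip later looks
# it up again through the same (id(gdf), driver) composite key, e.g.:
#
#   _expect_writing(some_gdf, "ESRI Shapefile").to_raise(
#       RuntimeError, "Failed to write record"
#   )
#   _expected_error_on(some_gdf, "ESRI Shapefile")  # -> _ExpectedError instance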
# *****************************************
# TEST CASES
_geodataframes_to_write = []
_expected_exceptions = {}
_CRS = "epsg:4326"
# ------------------
# gdf with Points
gdf = GeoDataFrame(
{"a": [1, 2]}, crs=_CRS, geometry=[city_hall_entrance, city_hall_balcony]
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with MultiPoints
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[
MultiPoint([city_hall_balcony, city_hall_council_chamber]),
MultiPoint([city_hall_entrance, city_hall_balcony, city_hall_council_chamber]),
],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with Points and MultiPoints
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[MultiPoint([city_hall_entrance, city_hall_balcony]), city_hall_balcony],
)
_geodataframes_to_write.append(gdf)
# 'ESRI Shapefile' driver supports writing LineString/MultiLinestring and
# Polygon/MultiPolygon but does not mention Point/MultiPoint
# see https://www.gdal.org/drv_shapefile.html
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
# ------------------
# gdf with LineStrings
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=city_hall_walls)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with MultiLineStrings
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[MultiLineString(city_hall_walls), MultiLineString(city_hall_walls)],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with LineStrings and MultiLineStrings
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[MultiLineString(city_hall_walls), city_hall_walls[0]],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with Polygons
gdf = GeoDataFrame(
{"a": [1, 2]}, crs=_CRS, geometry=[city_hall_boundaries, vauquelin_place]
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with MultiPolygon
gdf = GeoDataFrame(
{"a": [1]},
crs=_CRS,
geometry=[MultiPolygon((city_hall_boundaries, vauquelin_place))],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with Polygon and MultiPolygon
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_boundaries,
],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with null geometry and Point
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, city_hall_entrance])
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with null geometry and 3D Point
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, point_3D])
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with null geometries only
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, None])
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with all shape types mixed together
gdf = GeoDataFrame(
{"a": [1, 2, 3, 4, 5, 6]},
crs=_CRS,
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_entrance,
MultiLineString(city_hall_walls),
city_hall_walls[0],
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
],
)
_geodataframes_to_write.append(gdf)
# Not supported by 'ESRI Shapefile' driver
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
# ------------------
# gdf with all 2D shape types and 3D Point mixed together
gdf = GeoDataFrame(
{"a": [1, 2, 3, 4, 5, 6, 7]},
crs=_CRS,
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_entrance,
MultiLineString(city_hall_walls),
city_hall_walls[0],
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
point_3D,
],
)
_geodataframes_to_write.append(gdf)
# Not supported by 'ESRI Shapefile' driver
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
@pytest.fixture(params=_geodataframes_to_write)
def geodataframe(request):
return request.param
@pytest.fixture(params=["GeoJSON", "ESRI Shapefile", "GPKG", "SQLite"])
def ogr_driver(request):
return request.param
@pytest.fixture(
params=[
pytest.param("fiona", marks=FIONA_MARK),
pytest.param("pyogrio", marks=PYOGRIO_MARK),
]
)
def engine(request):
return request.param
def test_to_file_roundtrip(tmpdir, geodataframe, ogr_driver, engine):
output_file = os.path.join(str(tmpdir), "output_file")
write_kwargs = {}
if ogr_driver == "SQLite":
write_kwargs["spatialite"] = True
# This if statement can be removed once minimal fiona version >= 1.8.20
if engine == "fiona":
import fiona
from packaging.version import Version
if Version(fiona.__version__) < Version("1.8.20"):
pytest.skip("SQLite driver only available from version 1.8.20")
# If only 3D Points, geometry_type needs to be specified for spatialite at the
# moment. This if can be removed once the following PR is released:
# https://github.com/geopandas/pyogrio/pull/223
if (
engine == "pyogrio"
and len(geodataframe) == 2
and geodataframe.geometry[0] is None
and geodataframe.geometry[1] is not None
and geodataframe.geometry[1].has_z
):
write_kwargs["geometry_type"] = "Point Z"
expected_error = _expected_error_on(geodataframe, ogr_driver)
if expected_error:
with pytest.raises(
RuntimeError, match="Failed to write record|Could not add feature to layer"
):
geodataframe.to_file(
output_file, driver=ogr_driver, engine=engine, **write_kwargs
)
else:
geodataframe.to_file(
output_file, driver=ogr_driver, engine=engine, **write_kwargs
)
reloaded = geopandas.read_file(output_file, engine=engine)
if ogr_driver == "GeoJSON" and engine == "pyogrio":
# For GeoJSON files, the int64 column comes back as int32
reloaded["a"] = reloaded["a"].astype("int64")
assert_geodataframe_equal(geodataframe, reloaded, check_column_type="equiv")

View File

@@ -0,0 +1,304 @@
from collections import OrderedDict
from shapely.geometry import (
LineString,
MultiLineString,
MultiPoint,
MultiPolygon,
Point,
Polygon,
)
import pandas as pd
import pytest
import numpy as np
from geopandas import GeoDataFrame
from geopandas.io.file import infer_schema
# Credit: Polygons below come from Montreal city Open Data portal
# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
city_hall_boundaries = Polygon(
(
(-73.5541107525234, 45.5091983609661),
(-73.5546126200639, 45.5086813829106),
(-73.5540185061397, 45.5084409343852),
(-73.5539986525799, 45.5084323044531),
(-73.5535801792994, 45.5089539203786),
(-73.5541107525234, 45.5091983609661),
)
)
vauquelin_place = Polygon(
(
(-73.5542465586147, 45.5081555487952),
(-73.5540185061397, 45.5084409343852),
(-73.5546126200639, 45.5086813829106),
(-73.5548825850032, 45.5084033554357),
(-73.5542465586147, 45.5081555487952),
)
)
city_hall_walls = [
LineString(
(
(-73.5541107525234, 45.5091983609661),
(-73.5546126200639, 45.5086813829106),
(-73.5540185061397, 45.5084409343852),
)
),
LineString(
(
(-73.5539986525799, 45.5084323044531),
(-73.5535801792994, 45.5089539203786),
(-73.5541107525234, 45.5091983609661),
)
),
]
city_hall_entrance = Point(-73.553785, 45.508722)
city_hall_balcony = Point(-73.554138, 45.509080)
city_hall_council_chamber = Point(-73.554246, 45.508931)
point_3D = Point(-73.553785, 45.508722, 300)
linestring_3D = LineString(
(
(-73.5541107525234, 45.5091983609661, 300),
(-73.5546126200639, 45.5086813829106, 300),
(-73.5540185061397, 45.5084409343852, 300),
)
)
polygon_3D = Polygon(
(
(-73.5541107525234, 45.5091983609661, 300),
(-73.5535801792994, 45.5089539203786, 300),
(-73.5541107525234, 45.5091983609661, 300),
)
)
def test_infer_schema_only_points():
df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
assert infer_schema(df) == {"geometry": "Point", "properties": OrderedDict()}
def test_infer_schema_points_and_multipoints():
df = GeoDataFrame(
geometry=[
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
]
)
assert infer_schema(df) == {
"geometry": ["MultiPoint", "Point"],
"properties": OrderedDict(),
}
def test_infer_schema_only_multipoints():
df = GeoDataFrame(
geometry=[
MultiPoint(
[city_hall_entrance, city_hall_balcony, city_hall_council_chamber]
)
]
)
assert infer_schema(df) == {"geometry": "MultiPoint", "properties": OrderedDict()}
def test_infer_schema_only_linestrings():
df = GeoDataFrame(geometry=city_hall_walls)
assert infer_schema(df) == {"geometry": "LineString", "properties": OrderedDict()}
def test_infer_schema_linestrings_and_multilinestrings():
df = GeoDataFrame(geometry=[MultiLineString(city_hall_walls), city_hall_walls[0]])
assert infer_schema(df) == {
"geometry": ["MultiLineString", "LineString"],
"properties": OrderedDict(),
}
def test_infer_schema_only_multilinestrings():
df = GeoDataFrame(geometry=[MultiLineString(city_hall_walls)])
assert infer_schema(df) == {
"geometry": "MultiLineString",
"properties": OrderedDict(),
}
def test_infer_schema_only_polygons():
df = GeoDataFrame(geometry=[city_hall_boundaries, vauquelin_place])
assert infer_schema(df) == {"geometry": "Polygon", "properties": OrderedDict()}
def test_infer_schema_polygons_and_multipolygons():
df = GeoDataFrame(
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_boundaries,
]
)
assert infer_schema(df) == {
"geometry": ["MultiPolygon", "Polygon"],
"properties": OrderedDict(),
}
def test_infer_schema_only_multipolygons():
df = GeoDataFrame(geometry=[MultiPolygon((city_hall_boundaries, vauquelin_place))])
assert infer_schema(df) == {"geometry": "MultiPolygon", "properties": OrderedDict()}
def test_infer_schema_multiple_shape_types():
df = GeoDataFrame(
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_boundaries,
MultiLineString(city_hall_walls),
city_hall_walls[0],
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
]
)
assert infer_schema(df) == {
"geometry": [
"MultiPolygon",
"Polygon",
"MultiLineString",
"LineString",
"MultiPoint",
"Point",
],
"properties": OrderedDict(),
}
def test_infer_schema_mixed_3D_shape_type():
df = GeoDataFrame(
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_boundaries,
MultiLineString(city_hall_walls),
city_hall_walls[0],
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
point_3D,
]
)
assert infer_schema(df) == {
"geometry": [
"3D Point",
"MultiPolygon",
"Polygon",
"MultiLineString",
"LineString",
"MultiPoint",
"Point",
],
"properties": OrderedDict(),
}
def test_infer_schema_mixed_3D_Point():
df = GeoDataFrame(geometry=[city_hall_balcony, point_3D])
assert infer_schema(df) == {
"geometry": ["3D Point", "Point"],
"properties": OrderedDict(),
}
def test_infer_schema_only_3D_Points():
df = GeoDataFrame(geometry=[point_3D, point_3D])
assert infer_schema(df) == {"geometry": "3D Point", "properties": OrderedDict()}
def test_infer_schema_mixed_3D_linestring():
df = GeoDataFrame(geometry=[city_hall_walls[0], linestring_3D])
assert infer_schema(df) == {
"geometry": ["3D LineString", "LineString"],
"properties": OrderedDict(),
}
def test_infer_schema_only_3D_linestrings():
df = GeoDataFrame(geometry=[linestring_3D, linestring_3D])
assert infer_schema(df) == {
"geometry": "3D LineString",
"properties": OrderedDict(),
}
def test_infer_schema_mixed_3D_Polygon():
df = GeoDataFrame(geometry=[city_hall_boundaries, polygon_3D])
assert infer_schema(df) == {
"geometry": ["3D Polygon", "Polygon"],
"properties": OrderedDict(),
}
def test_infer_schema_only_3D_Polygons():
df = GeoDataFrame(geometry=[polygon_3D, polygon_3D])
assert infer_schema(df) == {"geometry": "3D Polygon", "properties": OrderedDict()}
def test_infer_schema_null_geometry_and_2D_point():
df = GeoDataFrame(geometry=[None, city_hall_entrance])
# None geometry type is then omitted
assert infer_schema(df) == {"geometry": "Point", "properties": OrderedDict()}
def test_infer_schema_null_geometry_and_3D_point():
df = GeoDataFrame(geometry=[None, point_3D])
# None geometry type is then omitted
assert infer_schema(df) == {"geometry": "3D Point", "properties": OrderedDict()}
def test_infer_schema_null_geometry_all():
df = GeoDataFrame(geometry=[None, None])
# None geometry type is then replaced by 'Unknown'
# (default geometry type supported by Fiona)
assert infer_schema(df) == {"geometry": "Unknown", "properties": OrderedDict()}
@pytest.mark.parametrize(
"array_data,dtype", [([1, 2**31 - 1], np.int32), ([1, np.nan], pd.Int32Dtype())]
)
def test_infer_schema_int32(array_data, dtype):
int32col = pd.array(data=array_data, dtype=dtype)
df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
df["int32_column"] = int32col
assert infer_schema(df) == {
"geometry": "Point",
"properties": OrderedDict([("int32_column", "int32")]),
}
def test_infer_schema_int64():
int64col = pd.array([1, np.nan], dtype=pd.Int64Dtype())
df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
df["int64_column"] = int64col
assert infer_schema(df) == {
"geometry": "Point",
"properties": OrderedDict([("int64_column", "int")]),
}

View File

@@ -0,0 +1,110 @@
"""
See generate_legacy_storage_files.py for the creation of the legacy files.
"""
from contextlib import contextmanager
import glob
import os
import pathlib
import pandas as pd
import pytest
from geopandas.testing import assert_geodataframe_equal
from geopandas import _compat as compat
import geopandas
from shapely.geometry import Point
DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
@pytest.fixture(scope="module")
def current_pickle_data():
# our current version pickle data
from .generate_legacy_storage_files import create_pickle_data
return create_pickle_data()
files = glob.glob(str(DATA_PATH / "pickle" / "*.pickle"))
@pytest.fixture(params=files, ids=[p.split("/")[-1] for p in files])
def legacy_pickle(request):
return request.param
@contextmanager
def with_use_pygeos(option):
orig = geopandas.options.use_pygeos
geopandas.options.use_pygeos = option
try:
yield
finally:
geopandas.options.use_pygeos = orig
@pytest.mark.skipif(
compat.USE_SHAPELY_20 or compat.USE_PYGEOS,
reason=(
"shapely 2.0/pygeos-based unpickling currently only works for "
"shapely-2.0/pygeos-written files"
),
)
def test_legacy_pickles(current_pickle_data, legacy_pickle):
result = pd.read_pickle(legacy_pickle)
for name, value in result.items():
expected = current_pickle_data[name]
assert_geodataframe_equal(value, expected)
def test_round_trip_current(tmpdir, current_pickle_data):
data = current_pickle_data
for name, value in data.items():
path = str(tmpdir / "{}.pickle".format(name))
value.to_pickle(path)
result = pd.read_pickle(path)
assert_geodataframe_equal(result, value)
assert isinstance(result.has_sindex, bool)
def _create_gdf():
return geopandas.GeoDataFrame(
{"a": [0.1, 0.2, 0.3], "geometry": [Point(1, 1), Point(2, 2), Point(3, 3)]},
crs="EPSG:4326",
)
@pytest.mark.skipif(not compat.HAS_PYGEOS, reason="requires pygeos to test #1745")
def test_pygeos_switch(tmpdir):
# writing and reading with pygeos disabled
with with_use_pygeos(False):
gdf = _create_gdf()
path = str(tmpdir / "gdf_crs1.pickle")
gdf.to_pickle(path)
result = pd.read_pickle(path)
assert_geodataframe_equal(result, gdf)
# writing without pygeos, reading with pygeos
with with_use_pygeos(False):
gdf = _create_gdf()
path = str(tmpdir / "gdf_crs1.pickle")
gdf.to_pickle(path)
with with_use_pygeos(True):
result = pd.read_pickle(path)
gdf = _create_gdf()
assert_geodataframe_equal(result, gdf)
# writing with pygeos, reading without pygeos
with with_use_pygeos(True):
gdf = _create_gdf()
path = str(tmpdir / "gdf_crs1.pickle")
gdf.to_pickle(path)
with with_use_pygeos(False):
result = pd.read_pickle(path)
gdf = _create_gdf()
assert_geodataframe_equal(result, gdf)

View File

@@ -0,0 +1,752 @@
"""
Tests here include reading/writing to different types of spatial databases.
The spatial database tests may not work without additional system
configuration. postGIS tests require a test database to have been setup;
see geopandas.tests.util for more information.
"""
import os
import warnings
import pandas as pd
import geopandas
from geopandas import GeoDataFrame, read_file, read_postgis
import geopandas._compat as compat
from geopandas.io.sql import _get_conn as get_conn, _write_postgis as write_postgis
from geopandas.tests.util import create_postgis, create_spatialite, validate_boro_df
import pytest
try:
from sqlalchemy import text
except ImportError:
# Avoid local imports for text in all sqlalchemy tests
# all tests using text use engine_postgis, which ensures sqlalchemy is available
text = str
@pytest.fixture
def df_nybb():
nybb_path = geopandas.datasets.get_path("nybb")
df = read_file(nybb_path)
return df
@pytest.fixture()
def connection_postgis():
"""
Initiates a connection to a postGIS database that must already exist.
See create_postgis for more information.
"""
psycopg2 = pytest.importorskip("psycopg2")
from psycopg2 import OperationalError
dbname = "test_geopandas"
user = os.environ.get("PGUSER")
password = os.environ.get("PGPASSWORD")
host = os.environ.get("PGHOST")
port = os.environ.get("PGPORT")
try:
con = psycopg2.connect(
dbname=dbname, user=user, password=password, host=host, port=port
)
except OperationalError:
pytest.skip("Cannot connect with postgresql database")
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", message="pandas only supports SQLAlchemy connectable.*"
)
yield con
con.close()
@pytest.fixture()
def engine_postgis():
"""
Initiates a connection engine to a postGIS database that must already exist.
"""
sqlalchemy = pytest.importorskip("sqlalchemy")
from sqlalchemy.engine.url import URL
user = os.environ.get("PGUSER")
password = os.environ.get("PGPASSWORD")
host = os.environ.get("PGHOST")
port = os.environ.get("PGPORT")
dbname = "test_geopandas"
try:
con = sqlalchemy.create_engine(
URL.create(
drivername="postgresql+psycopg2",
username=user,
database=dbname,
password=password,
host=host,
port=port,
)
)
con.connect()
except Exception:
pytest.skip("Cannot connect with postgresql database")
yield con
con.dispose()
@pytest.fixture()
def connection_spatialite():
"""
Return a memory-based SQLite3 connection with SpatiaLite enabled & initialized.
`The sqlite3 module must be built with loadable extension support
<https://docs.python.org/3/library/sqlite3.html#f1>`_ and
`SpatiaLite <https://www.gaia-gis.it/fossil/libspatialite/index>`_
must be available on the system as a SQLite module.
Packages available on Anaconda meet requirements.
Exceptions
----------
``AttributeError`` on missing support for loadable SQLite extensions
``sqlite3.OperationalError`` on missing SpatiaLite
"""
sqlite3 = pytest.importorskip("sqlite3")
try:
with sqlite3.connect(":memory:") as con:
con.enable_load_extension(True)
con.load_extension("mod_spatialite")
con.execute("SELECT InitSpatialMetaData(TRUE)")
except Exception:
con.close()
pytest.skip("Cannot setup spatialite database")
yield con
con.close()
def drop_table_if_exists(conn_or_engine, table):
sqlalchemy = pytest.importorskip("sqlalchemy")
if sqlalchemy.inspect(conn_or_engine).has_table(table):
metadata = sqlalchemy.MetaData()
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", message="Did not recognize type 'geometry' of column.*"
)
metadata.reflect(conn_or_engine)
table = metadata.tables.get(table)
if table is not None:
table.drop(conn_or_engine, checkfirst=True)
@pytest.fixture
def df_mixed_single_and_multi():
from shapely.geometry import Point, LineString, MultiLineString
df = geopandas.GeoDataFrame(
{
"geometry": [
LineString([(0, 0), (1, 1)]),
MultiLineString([[(0, 0), (1, 1)], [(2, 2), (3, 3)]]),
Point(0, 1),
]
},
crs="epsg:4326",
)
return df
@pytest.fixture
def df_geom_collection():
from shapely.geometry import Point, LineString, Polygon, GeometryCollection
df = geopandas.GeoDataFrame(
{
"geometry": [
GeometryCollection(
[
Polygon([(0, 0), (1, 1), (0, 1)]),
LineString([(0, 0), (1, 1)]),
Point(0, 0),
]
)
]
},
crs="epsg:4326",
)
return df
@pytest.fixture
def df_linear_ring():
from shapely.geometry import LinearRing
df = geopandas.GeoDataFrame(
{"geometry": [LinearRing(((0, 0), (0, 1), (1, 1), (1, 0)))]}, crs="epsg:4326"
)
return df
@pytest.fixture
def df_3D_geoms():
from shapely.geometry import Point, LineString, Polygon
df = geopandas.GeoDataFrame(
{
"geometry": [
LineString([(0, 0, 0), (1, 1, 1)]),
Polygon([(0, 0, 0), (1, 1, 1), (0, 1, 1)]),
Point(0, 1, 2),
]
},
crs="epsg:4326",
)
return df
class TestIO:
def test_get_conn(self, engine_postgis):
Connection = pytest.importorskip("sqlalchemy.engine.base").Connection
engine = engine_postgis
with get_conn(engine) as output:
assert isinstance(output, Connection)
with engine.connect() as conn:
with get_conn(conn) as output:
assert isinstance(output, Connection)
with pytest.raises(ValueError):
with get_conn(object()):
pass
def test_read_postgis_default(self, connection_postgis, df_nybb):
con = connection_postgis
create_postgis(con, df_nybb)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con)
validate_boro_df(df)
# no crs defined on the created geodatabase, and none specified
# by user; should not be set to 0, as from get_srid failure
assert df.crs is None
def test_read_postgis_custom_geom_col(self, connection_postgis, df_nybb):
con = connection_postgis
geom_col = "the_geom"
create_postgis(con, df_nybb, geom_col=geom_col)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con, geom_col=geom_col)
validate_boro_df(df)
def test_read_postgis_select_geom_as(self, connection_postgis, df_nybb):
"""Tests that a SELECT {geom} AS {some_other_geom} works."""
con = connection_postgis
orig_geom = "geom"
out_geom = "the_geom"
create_postgis(con, df_nybb, geom_col=orig_geom)
sql = """SELECT borocode, boroname, shape_leng, shape_area,
{} as {} FROM nybb;""".format(
orig_geom, out_geom
)
df = read_postgis(sql, con, geom_col=out_geom)
validate_boro_df(df)
def test_read_postgis_get_srid(self, connection_postgis, df_nybb):
"""Tests that an SRID can be read from a geodatabase (GH #451)."""
con = connection_postgis
crs = "epsg:4269"
df_reproj = df_nybb.to_crs(crs)
create_postgis(con, df_reproj, srid=4269)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con)
validate_boro_df(df)
assert df.crs == crs
def test_read_postgis_override_srid(self, connection_postgis, df_nybb):
"""Tests that a user specified CRS overrides the geodatabase SRID."""
con = connection_postgis
orig_crs = df_nybb.crs
create_postgis(con, df_nybb, srid=4269)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con, crs=orig_crs)
validate_boro_df(df)
assert df.crs == orig_crs
def test_from_postgis_default(self, connection_postgis, df_nybb):
con = connection_postgis
create_postgis(con, df_nybb)
sql = "SELECT * FROM nybb;"
df = GeoDataFrame.from_postgis(sql, con)
validate_boro_df(df, case_sensitive=False)
def test_from_postgis_custom_geom_col(self, connection_postgis, df_nybb):
con = connection_postgis
geom_col = "the_geom"
create_postgis(con, df_nybb, geom_col=geom_col)
sql = "SELECT * FROM nybb;"
df = GeoDataFrame.from_postgis(sql, con, geom_col=geom_col)
validate_boro_df(df, case_sensitive=False)
def test_read_postgis_null_geom(self, connection_spatialite, df_nybb):
"""Tests that geometry with NULL is accepted."""
con = connection_spatialite
geom_col = df_nybb.geometry.name
df_nybb.geometry.iat[0] = None
create_spatialite(con, df_nybb)
sql = (
"SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, "
'AsEWKB("{0}") AS "{0}" FROM nybb'.format(geom_col)
)
df = read_postgis(sql, con, geom_col=geom_col)
validate_boro_df(df)
def test_read_postgis_binary(self, connection_spatialite, df_nybb):
"""Tests that geometry read as binary is accepted."""
con = connection_spatialite
geom_col = df_nybb.geometry.name
create_spatialite(con, df_nybb)
sql = (
"SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, "
'ST_AsBinary("{0}") AS "{0}" FROM nybb'.format(geom_col)
)
df = read_postgis(sql, con, geom_col=geom_col)
validate_boro_df(df)
def test_read_postgis_chunksize(self, connection_postgis, df_nybb):
"""Test chunksize argument"""
chunksize = 2
con = connection_postgis
create_postgis(con, df_nybb)
sql = "SELECT * FROM nybb;"
df = pd.concat(read_postgis(sql, con, chunksize=chunksize))
validate_boro_df(df)
# no crs defined on the created geodatabase, and none specified
# by user; should not be set to 0, as from get_srid failure
assert df.crs is None
def test_read_postgis_privacy(self, connection_postgis, df_nybb):
con = connection_postgis
create_postgis(con, df_nybb)
sql = "SELECT * FROM nybb;"
with pytest.warns(FutureWarning):
geopandas.io.sql.read_postgis(sql, con)
def test_write_postgis_default(self, engine_postgis, df_nybb):
"""Tests that GeoDataFrame can be written to PostGIS with defaults."""
engine = engine_postgis
table = "nybb"
# If table exists, delete it before trying to write with defaults
drop_table_if_exists(engine, table)
# Write to db
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
# Validate
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
def test_write_postgis_uppercase_tablename(self, engine_postgis, df_nybb):
"""Tests writing GeoDataFrame to PostGIS with uppercase tablename."""
engine = engine_postgis
table = "aTestTable"
# If table exists, delete it before trying to write with defaults
drop_table_if_exists(engine, table)
# Write to db
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
# Validate
sql = text('SELECT * FROM "{table}";'.format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
def test_write_postgis_sqlalchemy_connection(self, engine_postgis, df_nybb):
"""Tests that GeoDataFrame can be written to PostGIS with defaults."""
with engine_postgis.begin() as con:
table = "nybb_con"
# If table exists, delete it before trying to write with defaults
drop_table_if_exists(con, table)
# Write to db
write_postgis(df_nybb, con=con, name=table, if_exists="fail")
# Validate
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, con, geom_col="geometry")
validate_boro_df(df)
def test_write_postgis_fail_when_table_exists(self, engine_postgis, df_nybb):
"""
Tests that uploading the same table raises an error when if_exists='fail'.
"""
engine = engine_postgis
table = "nybb"
# Ensure table exists
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# if_exists="fail" must raise because the table already exists
with pytest.raises(ValueError, match="already exists"):
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
def test_write_postgis_replace_when_table_exists(self, engine_postgis, df_nybb):
"""
        Tests that replacing an existing table works when if_exists='replace'.
"""
engine = engine_postgis
table = "nybb"
# Ensure table exists
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Overwrite
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Validate
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
def test_write_postgis_append_when_table_exists(self, engine_postgis, df_nybb):
"""
        Tests that appending to an existing table produces correct results when
        if_exists='append'.
"""
engine = engine_postgis
table = "nybb"
orig_rows, orig_cols = df_nybb.shape
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
write_postgis(df_nybb, con=engine, name=table, if_exists="append")
# Validate
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
new_rows, new_cols = df.shape
# There should be twice as many rows in the new table
        assert new_rows == orig_rows * 2, (
            "There should be {target} rows, "
            "found: {current}".format(target=orig_rows * 2, current=new_rows)
        )
        # Number of columns should stay the same
        assert new_cols == orig_cols, (
            "There should be {target} columns, "
            "found: {current}".format(target=orig_cols, current=new_cols)
        )
def test_write_postgis_without_crs(self, engine_postgis, df_nybb):
"""
Tests that GeoDataFrame can be written to PostGIS without CRS information.
"""
engine = engine_postgis
table = "nybb"
        # Write to db without CRS (copy first so the shared fixture is not mutated)
        df_nybb = df_nybb.copy()
        df_nybb.crs = None
with pytest.warns(UserWarning, match="Could not parse CRS from the GeoDataF"):
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
        # Validate that srid is 0
sql = text(
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
schema="public", table=table, geom_col="geometry"
)
)
with engine.connect() as conn:
target_srid = conn.execute(sql).fetchone()[0]
assert target_srid == 0, "SRID should be 0, found %s" % target_srid
def test_write_postgis_with_esri_authority(self, engine_postgis, df_nybb):
"""
Tests that GeoDataFrame can be written to PostGIS with ESRI Authority
CRS information (GH #2414).
"""
engine = engine_postgis
table = "nybb"
# Write to db
df_nybb_esri = df_nybb.to_crs("ESRI:102003")
write_postgis(df_nybb_esri, con=engine, name=table, if_exists="replace")
# Validate that srid is 102003
sql = text(
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
schema="public", table=table, geom_col="geometry"
)
)
with engine.connect() as conn:
target_srid = conn.execute(sql).fetchone()[0]
assert target_srid == 102003, "SRID should be 102003, found %s" % target_srid
def test_write_postgis_geometry_collection(
self, engine_postgis, df_geom_collection
):
"""
Tests that writing a mix of different geometry types is possible.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(df_geom_collection, con=engine, name=table, if_exists="replace")
# Validate geometry type
sql = text(
"SELECT DISTINCT(GeometryType(geometry)) FROM {table} ORDER BY 1;".format(
table=table
)
)
with engine.connect() as conn:
geom_type = conn.execute(sql).fetchone()[0]
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
assert geom_type.upper() == "GEOMETRYCOLLECTION"
assert df.geom_type.unique()[0] == "GeometryCollection"
def test_write_postgis_mixed_geometry_types(
self, engine_postgis, df_mixed_single_and_multi
):
"""
Tests that writing a mix of single and MultiGeometries is possible.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(
df_mixed_single_and_multi, con=engine, name=table, if_exists="replace"
)
# Validate geometry type
sql = text(
"SELECT DISTINCT GeometryType(geometry) FROM {table} ORDER BY 1;".format(
table=table
)
)
with engine.connect() as conn:
res = conn.execute(sql).fetchall()
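        # ORDER BY 1 sorts the distinct geometry types alphabetically, so the
        # expected order is LINESTRING, MULTILINESTRING, POINT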
assert res[0][0].upper() == "LINESTRING"
assert res[1][0].upper() == "MULTILINESTRING"
assert res[2][0].upper() == "POINT"
def test_write_postgis_linear_ring(self, engine_postgis, df_linear_ring):
"""
        Tests that writing a LinearRing works.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(df_linear_ring, con=engine, name=table, if_exists="replace")
# Validate geometry type
sql = text(
"SELECT DISTINCT(GeometryType(geometry)) FROM {table} ORDER BY 1;".format(
table=table
)
)
with engine.connect() as conn:
geom_type = conn.execute(sql).fetchone()[0]
assert geom_type.upper() == "LINESTRING"
def test_write_postgis_in_chunks(self, engine_postgis, df_mixed_single_and_multi):
"""
        Tests that writing data in chunks (chunksize=1) works.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(
df_mixed_single_and_multi,
con=engine,
name=table,
if_exists="replace",
chunksize=1,
)
# Validate row count
sql = text("SELECT COUNT(geometry) FROM {table};".format(table=table))
with engine.connect() as conn:
row_cnt = conn.execute(sql).fetchone()[0]
assert row_cnt == 3
# Validate geometry type
sql = text(
"SELECT DISTINCT GeometryType(geometry) FROM {table} ORDER BY 1;".format(
table=table
)
)
with engine.connect() as conn:
res = conn.execute(sql).fetchall()
assert res[0][0].upper() == "LINESTRING"
assert res[1][0].upper() == "MULTILINESTRING"
assert res[2][0].upper() == "POINT"
def test_write_postgis_to_different_schema(self, engine_postgis, df_nybb):
"""
Tests writing data to alternative schema.
"""
engine = engine_postgis
table = "nybb"
schema_to_use = "test"
sql = text("CREATE SCHEMA IF NOT EXISTS {schema};".format(schema=schema_to_use))
with engine.begin() as conn:
conn.execute(sql)
write_postgis(
df_nybb, con=engine, name=table, if_exists="replace", schema=schema_to_use
)
# Validate
sql = text(
"SELECT * FROM {schema}.{table};".format(schema=schema_to_use, table=table)
)
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
def test_write_postgis_to_different_schema_when_table_exists(
self, engine_postgis, df_nybb
):
"""
        Tests writing data to an alternative schema when the table already exists.
"""
engine = engine_postgis
table = "nybb"
schema_to_use = "test"
sql = text("CREATE SCHEMA IF NOT EXISTS {schema};".format(schema=schema_to_use))
with engine.begin() as conn:
conn.execute(sql)
try:
write_postgis(
df_nybb, con=engine, name=table, if_exists="fail", schema=schema_to_use
)
# Validate
sql = text(
"SELECT * FROM {schema}.{table};".format(
schema=schema_to_use, table=table
)
)
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
# Should raise a ValueError when table exists
except ValueError:
pass
# Try with replace flag on
write_postgis(
df_nybb, con=engine, name=table, if_exists="replace", schema=schema_to_use
)
# Validate
sql = text(
"SELECT * FROM {schema}.{table};".format(schema=schema_to_use, table=table)
)
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
def test_write_postgis_3D_geometries(self, engine_postgis, df_3D_geoms):
"""
        Tests that writing geometries with 3 dimensions works.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(df_3D_geoms, con=engine, name=table, if_exists="replace")
# Check that all geometries have 3 dimensions
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
assert list(df.geometry.has_z) == [True, True, True]
def test_row_order(self, engine_postgis, df_nybb):
"""
        Tests that the row order in the db table follows the order of the original frame.
"""
engine = engine_postgis
table = "row_order_test"
correct_order = df_nybb["BoroCode"].tolist()
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Check that the row order matches
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
assert df["BoroCode"].tolist() == correct_order
def test_append_before_table_exists(self, engine_postgis, df_nybb):
"""
        Tests that insert works with if_exists='append' when the table does not exist yet.
"""
engine = engine_postgis
table = "nybb"
# If table exists, delete it before trying to write with defaults
drop_table_if_exists(engine, table)
write_postgis(df_nybb, con=engine, name=table, if_exists="append")
# Check that the row order matches
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
def test_append_with_different_crs(self, engine_postgis, df_nybb):
"""
        Tests that an error is raised if the table CRS differs from the frame's CRS.
"""
engine = engine_postgis
table = "nybb"
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Reproject
df_nybb2 = df_nybb.to_crs(epsg=4326)
# Should raise error when appending
with pytest.raises(ValueError, match="CRS of the target table"):
write_postgis(df_nybb2, con=engine, name=table, if_exists="append")
@pytest.mark.xfail(
compat.PANDAS_GE_20 and not compat.PANDAS_GE_21,
reason="Duplicate columns are dropped in read_sql with pandas 2.0.x",
)
def test_duplicate_geometry_column_fails(self, engine_postgis):
"""
Tests that a ValueError is raised if an SQL query returns two geometry columns.
"""
engine = engine_postgis
sql = "select ST_MakePoint(0, 0) as geom, ST_MakePoint(0, 0) as geom;"
with pytest.raises(ValueError):
read_postgis(sql, engine, geom_col="geom")