that's too much!

This commit is contained in:
2024-12-19 20:22:56 -08:00
parent 0020a609dd
commit 32cd60e92b
8443 changed files with 1446950 additions and 42 deletions

View File

@@ -0,0 +1,135 @@
from pathlib import Path
from zipfile import ZipFile, ZIP_DEFLATED
import pytest
from pyogrio import (
__gdal_version_string__,
__version__,
list_drivers,
)
from pyogrio._compat import HAS_ARROW_API, HAS_GDAL_GEOS, HAS_SHAPELY
from pyogrio.raw import read, write
# directory containing the fixture datasets shipped with the test suite
_data_dir = Path(__file__).parent.resolve() / "fixtures"
# mapping of driver extension to driver name for well-supported drivers
DRIVERS = {
    ".fgb": "FlatGeobuf",
    ".geojson": "GeoJSON",
    ".geojsonl": "GeoJSONSeq",
    ".geojsons": "GeoJSONSeq",
    ".gpkg": "GPKG",
    ".shp": "ESRI Shapefile",
}
# mapping of driver name to extension
# NOTE: ".geojsonl" and ".geojsons" both map to GeoJSONSeq, so only the last
# one encountered survives this inversion
DRIVER_EXT = {driver: ext for ext, driver in DRIVERS.items()}
# extensions exercised by the parametrized fixtures (".geojsons" is excluded)
ALL_EXTS = [".fgb", ".geojson", ".geojsonl", ".gpkg", ".shp"]
def pytest_report_header(config):
    """Add pyogrio/GDAL versions and available drivers to the pytest header."""
    driver_parts = [
        f"{name}({capability})"
        for name, capability in sorted(list_drivers().items())
    ]
    header_lines = [
        f"pyogrio {__version__}",
        f"GDAL {__gdal_version_string__}",
        f"Supported drivers: {', '.join(driver_parts)}",
    ]
    return "\n".join(header_lines)
# marks to skip tests if optional dependencies are not present
requires_arrow_api = pytest.mark.skipif(
    not HAS_ARROW_API, reason="GDAL>=3.6 and pyarrow required"
)
requires_gdal_geos = pytest.mark.skipif(
    not HAS_GDAL_GEOS, reason="GDAL compiled with GEOS required"
)
requires_shapely = pytest.mark.skipif(not HAS_SHAPELY, reason="Shapely >= 2.0 required")
def prepare_testfile(testfile_path, dst_dir, ext):
    """Return a copy of *testfile_path* converted to format *ext* in *dst_dir*.

    Returns the original path unchanged when it already has the requested
    extension; otherwise reads the dataset and writes ``<stem><ext>`` into
    *dst_dir*, reusing an existing converted copy if present.
    """
    if ext == testfile_path.suffix:
        return testfile_path
    dst_path = dst_dir / f"{testfile_path.stem}{ext}"
    # converted copies are cached in dst_dir between calls
    if dst_path.exists():
        return dst_path
    meta, _, geometry, field_data = read(testfile_path)
    if ext == ".fgb":
        # For .fgb, spatial_index=False to avoid the rows being reordered
        meta["spatial_index"] = False
        # allow mixed Polygons/MultiPolygons type
        meta["geometry_type"] = "Unknown"
    elif ext == ".gpkg":
        # For .gpkg, spatial_index=False to avoid the rows being reordered
        meta["spatial_index"] = False
        meta["geometry_type"] = "MultiPolygon"
    write(dst_path, geometry, field_data, **meta)
    return dst_path
@pytest.fixture(scope="session")
def data_dir():
    """Path to the directory containing the test fixture datasets."""
    return _data_dir
@pytest.fixture(scope="function")
def naturalearth_lowres(tmp_path, request):
    """naturalearth_lowres dataset converted to the extension provided by
    indirect parametrization (defaults to the original .shp)."""
    ext = getattr(request, "param", ".shp")
    testfile_path = _data_dir / Path("naturalearth_lowres/naturalearth_lowres.shp")
    return prepare_testfile(testfile_path, tmp_path, ext)
@pytest.fixture(scope="function", params=ALL_EXTS)
def naturalearth_lowres_all_ext(tmp_path, naturalearth_lowres, request):
    """naturalearth_lowres dataset converted to each extension in ALL_EXTS."""
    return prepare_testfile(naturalearth_lowres, tmp_path, request.param)
@pytest.fixture(scope="function")
def naturalearth_lowres_vsi(tmp_path, naturalearth_lowres):
    """Wrap naturalearth_lowres as a zip file for vsi tests.

    Returns a tuple of (zip file path, /vsizip/ URI to the shapefile inside).
    """
    path = tmp_path / f"{naturalearth_lowres.name}.zip"
    with ZipFile(path, mode="w", compression=ZIP_DEFLATED, compresslevel=5) as out:
        # a shapefile consists of several sidecar files; archive all of them
        for ext in ["dbf", "prj", "shp", "shx"]:
            filename = f"{naturalearth_lowres.stem}.{ext}"
            out.write(naturalearth_lowres.parent / filename, filename)
    return path, f"/vsizip/{path}/{naturalearth_lowres.name}"
@pytest.fixture(scope="session")
def test_fgdb_vsi():
    """/vsizip/ URI to the zipped FileGDB test dataset."""
    return f"/vsizip/{_data_dir}/test_fgdb.gdb.zip"
@pytest.fixture(scope="session")
def test_gpkg_nulls():
    """Path to the GPKG fixture containing a row of null values."""
    return _data_dir / "test_gpkg_nulls.gpkg"
@pytest.fixture(scope="session")
def test_ogr_types_list():
    """Path to the GeoJSON fixture containing list-typed properties."""
    return _data_dir / "test_ogr_types_list.geojson"
@pytest.fixture(scope="session")
def test_datetime():
    """Path to the GeoJSON fixture containing naive datetime values."""
    return _data_dir / "test_datetime.geojson"
@pytest.fixture(scope="session")
def test_datetime_tz():
    """Path to the GeoJSON fixture containing timezone-aware datetimes."""
    return _data_dir / "test_datetime_tz.geojson"

View File

@@ -0,0 +1,89 @@
# Test datasets
## Natural Earth lowres
`naturalearth_lowres.shp` was copied from GeoPandas.
## FGDB test dataset
`test_fgdb.gdb.zip`
Downloaded from http://trac.osgeo.org/gdal/raw-attachment/wiki/FileGDB/test_fgdb.gdb.zip
### GPKG test dataset with null values
`test_gpkg_nulls.gpkg` was created using Fiona backend to GeoPandas:
```
from collections import OrderedDict
import fiona
import geopandas as gp
import numpy as np
from pyogrio import write_dataframe
filename = "test_gpkg_nulls.gpkg"
df = gp.GeoDataFrame(
{
"col_bool": np.array([True, False, True], dtype="bool"),
"col_int8": np.array([1, 2, 3], dtype="int8"),
"col_int16": np.array([1, 2, 3], dtype="int16"),
"col_int32": np.array([1, 2, 3], dtype="int32"),
"col_int64": np.array([1, 2, 3], dtype="int64"),
"col_uint8": np.array([1, 2, 3], dtype="uint8"),
"col_uint16": np.array([1, 2, 3], dtype="uint16"),
"col_uint32": np.array([1, 2, 3], dtype="uint32"),
"col_uint64": np.array([1, 2, 3], dtype="uint64"),
"col_float32": np.array([1.5, 2.5, 3.5], dtype="float32"),
"col_float64": np.array([1.5, 2.5, 3.5], dtype="float64"),
},
geometry=gp.points_from_xy([0, 1, 2], [0, 1, 2]),
crs="EPSG:4326",
)
write_dataframe(df, filename)
# construct row with null values
# Note: np.nan can only be used for float values
null_row = {
"type": "Feature",
"id": 4,
"properties": OrderedDict(
[
("col_bool", None),
("col_int8", None),
("col_int16", None),
("col_int32", None),
("col_int64", None),
("col_uint8", None),
("col_uint16", None),
("col_uint32", None),
("col_uint64", None),
("col_float32", np.nan),
("col_float64", np.nan),
]
),
"geometry": {"type": "Point", "coordinates": (4.0, 4.0)},
}
# append row with nulls to GPKG
with fiona.open(filename, "a") as c:
c.write(null_row)
```
NOTE: Reading boolean values into GeoPandas using Fiona backend treats those
values as `None` and column dtype as `object`; Pyogrio treats those values as
`np.nan` and column dtype as `float64`.
### GPKG test with MultiSurface
This was extracted from https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrography/NHDPlusHR/Beta/GDB/NHDPLUS_H_0308_HU4_GDB.zip
`NHDWaterbody` layer using ogr2ogr:
```bash
ogr2ogr test_mixed_surface.gpkg NHDPLUS_H_0308_HU4_GDB.gdb NHDWaterbody -where '"NHDPlusID" = 15000300070477' -select "NHDPlusID"
```
### OSM PBF test
This was downloaded from https://github.com/openstreetmap/OSM-binary/blob/master/resources/sample.pbf

View File

@@ -0,0 +1 @@
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]

View File

@@ -0,0 +1,7 @@
{
"type": "FeatureCollection",
"features": [
{ "type": "Feature", "properties": { "col": "2020-01-01T09:00:00.123" }, "geometry": { "type": "Point", "coordinates": [ 1.0, 1.0 ] } },
{ "type": "Feature", "properties": { "col": "2020-01-01T10:00:00" }, "geometry": { "type": "Point", "coordinates": [ 2.0, 2.0 ] } }
]
}

View File

@@ -0,0 +1,8 @@
{
"type": "FeatureCollection",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [
{ "type": "Feature", "properties": { "datetime_col": "2020-01-01T09:00:00.123-05:00" }, "geometry": { "type": "Point", "coordinates": [ 1.0, 1.0 ] } },
{ "type": "Feature", "properties": { "datetime_col": "2020-01-01T10:00:00-05:00" }, "geometry": { "type": "Point", "coordinates": [ 2.0, 2.0 ] } }
]
}

View File

@@ -0,0 +1,18 @@
{
"type": "FeatureCollection",
"features": [
{
"type": "Feature",
"geometry": {
"type": "Point",
"coordinates": [0, 0]
},
"properties": {
"top_level": "A",
"intermediate_level": {
"bottom_level": "B"
}
}
}
]
}

View File

@@ -0,0 +1,12 @@
{
"type": "FeatureCollection",
"name": "test",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [
{ "type": "Feature", "properties": { "int64": 1, "list_int64": [ 0, 1 ] }, "geometry": { "type": "Point", "coordinates": [ 0.0, 2.0 ] } },
{ "type": "Feature", "properties": { "int64": 2, "list_int64": [ 2, 3 ] }, "geometry": { "type": "Point", "coordinates": [ 1.0, 2.0 ] } },
{ "type": "Feature", "properties": { "int64": 3, "list_int64": [ 4, 5 ] }, "geometry": { "type": "Point", "coordinates": [ 2.0, 2.0 ] } },
{ "type": "Feature", "properties": { "int64": 4, "list_int64": [ 6, 7 ] }, "geometry": { "type": "Point", "coordinates": [ 3.0, 2.0 ] } },
{ "type": "Feature", "properties": { "int64": 5, "list_int64": [ 8, 9 ] }, "geometry": { "type": "Point", "coordinates": [ 4.0, 2.0 ] } }
]
}

View File

@@ -0,0 +1,207 @@
import contextlib
import math
import os
import pytest
from pyogrio import __gdal_version__, read_dataframe
from pyogrio.raw import open_arrow, read_arrow
from pyogrio.tests.conftest import requires_arrow_api
try:
import pandas as pd
from pandas.testing import assert_frame_equal, assert_index_equal
from geopandas.testing import assert_geodataframe_equal
import pyarrow
except ImportError:
pass
# skip all tests in this file if Arrow API or GeoPandas are unavailable
pytestmark = requires_arrow_api
pytest.importorskip("geopandas")
def test_read_arrow(naturalearth_lowres_all_ext):
    """Arrow-based reads should match the non-Arrow code path."""
    result = read_dataframe(naturalearth_lowres_all_ext, use_arrow=True)
    expected = read_dataframe(naturalearth_lowres_all_ext, use_arrow=False)
    # GeoJSON-based formats round-trip coordinates with lower precision
    is_geojson = naturalearth_lowres_all_ext.suffix.startswith(".geojson")
    assert_geodataframe_equal(result, expected, check_less_precise=is_geojson)
@pytest.mark.parametrize("skip_features, expected", [(10, 167), (200, 0)])
def test_read_arrow_skip_features(naturalearth_lowres, skip_features, expected):
table = read_arrow(naturalearth_lowres, skip_features=skip_features)[1]
assert len(table) == expected
def test_read_arrow_negative_skip_features(naturalearth_lowres):
with pytest.raises(ValueError, match="'skip_features' must be >= 0"):
read_arrow(naturalearth_lowres, skip_features=-1)
@pytest.mark.parametrize(
"max_features, expected", [(0, 0), (10, 10), (200, 177), (100000, 177)]
)
def test_read_arrow_max_features(naturalearth_lowres, max_features, expected):
table = read_arrow(naturalearth_lowres, max_features=max_features)[1]
assert len(table) == expected
def test_read_arrow_negative_max_features(naturalearth_lowres):
with pytest.raises(ValueError, match="'max_features' must be >= 0"):
read_arrow(naturalearth_lowres, max_features=-1)
@pytest.mark.parametrize(
"skip_features, max_features, expected",
[
(0, 0, 0),
(10, 0, 0),
(200, 0, 0),
(1, 200, 176),
(176, 10, 1),
(100, 100, 77),
(100, 100000, 77),
],
)
def test_read_arrow_skip_features_max_features(
naturalearth_lowres, skip_features, max_features, expected
):
table = read_arrow(
naturalearth_lowres, skip_features=skip_features, max_features=max_features
)[1]
assert len(table) == expected
def test_read_arrow_fid(naturalearth_lowres_all_ext):
kwargs = {"use_arrow": True, "where": "fid >= 2 AND fid <= 3"}
df = read_dataframe(naturalearth_lowres_all_ext, fid_as_index=False, **kwargs)
assert_index_equal(df.index, pd.RangeIndex(0, 2))
df = read_dataframe(naturalearth_lowres_all_ext, fid_as_index=True, **kwargs)
assert_index_equal(df.index, pd.Index([2, 3], name="fid"))
def test_read_arrow_columns(naturalearth_lowres):
result = read_dataframe(naturalearth_lowres, use_arrow=True, columns=["continent"])
assert result.columns.tolist() == ["continent", "geometry"]
def test_read_arrow_ignore_geometry(naturalearth_lowres):
result = read_dataframe(naturalearth_lowres, use_arrow=True, read_geometry=False)
assert type(result) is pd.DataFrame
expected = read_dataframe(naturalearth_lowres, use_arrow=True).drop(
columns=["geometry"]
)
assert_frame_equal(result, expected)
def test_read_arrow_nested_types(test_ogr_types_list):
# with arrow, list types are supported
result = read_dataframe(test_ogr_types_list, use_arrow=True)
assert "list_int64" in result.columns
assert result["list_int64"][0].tolist() == [0, 1]
def test_read_arrow_to_pandas_kwargs(test_fgdb_vsi):
# with arrow, list types are supported
arrow_to_pandas_kwargs = {"strings_to_categorical": True}
result = read_dataframe(
test_fgdb_vsi,
use_arrow=True,
arrow_to_pandas_kwargs=arrow_to_pandas_kwargs,
)
assert "SEGMENT_NAME" in result.columns
assert result["SEGMENT_NAME"].dtype.name == "category"
def test_read_arrow_raw(naturalearth_lowres):
meta, table = read_arrow(naturalearth_lowres)
assert isinstance(meta, dict)
assert isinstance(table, pyarrow.Table)
def test_open_arrow(naturalearth_lowres):
with open_arrow(naturalearth_lowres) as (meta, reader):
assert isinstance(meta, dict)
assert isinstance(reader, pyarrow.RecordBatchReader)
assert isinstance(reader.read_all(), pyarrow.Table)
def test_open_arrow_batch_size(naturalearth_lowres):
    """open_arrow should split the dataset into batches of batch_size rows."""
    _, full_table = read_arrow(naturalearth_lowres)
    # a batch size of half the rows should yield exactly two batches
    batch_size = math.ceil(len(full_table) / 2)
    with open_arrow(naturalearth_lowres, batch_size=batch_size) as (meta, reader):
        assert isinstance(meta, dict)
        assert isinstance(reader, pyarrow.RecordBatchReader)
        batches = [batch for batch in reader]
        assert len(batches) == 2, "Should be two batches given the batch_size parameter"
        assert len(batches[0]) == batch_size, "First table should match the batch size"
@pytest.mark.skipif(
__gdal_version__ >= (3, 8, 0),
reason="skip_features supported by Arrow stream API for GDAL>=3.8.0",
)
@pytest.mark.parametrize("skip_features", [10, 200])
def test_open_arrow_skip_features_unsupported(naturalearth_lowres, skip_features):
"""skip_features are not supported for the Arrow stream interface for
GDAL < 3.8.0"""
with pytest.raises(
ValueError,
match="specifying 'skip_features' is not supported for Arrow for GDAL<3.8.0",
):
with open_arrow(naturalearth_lowres, skip_features=skip_features) as (
meta,
reader,
):
pass
@pytest.mark.parametrize("max_features", [10, 200])
def test_open_arrow_max_features_unsupported(naturalearth_lowres, max_features):
"""max_features are not supported for the Arrow stream interface"""
with pytest.raises(
ValueError,
match="specifying 'max_features' is not supported for Arrow",
):
with open_arrow(naturalearth_lowres, max_features=max_features) as (
meta,
reader,
):
pass
@contextlib.contextmanager
def use_arrow_context():
    """Temporarily set ``PYOGRIO_USE_ARROW=1`` in the environment.

    The previous value (if any) is restored on exit — even when the body
    raises — so a failing test cannot leak the setting into later tests.
    """
    original = os.environ.get("PYOGRIO_USE_ARROW", None)
    os.environ["PYOGRIO_USE_ARROW"] = "1"
    try:
        yield
    finally:
        # compare against None (not truthiness) so a pre-existing empty
        # string is restored rather than deleted
        if original is None:
            os.environ.pop("PYOGRIO_USE_ARROW", None)
        else:
            os.environ["PYOGRIO_USE_ARROW"] = original
def test_enable_with_environment_variable(test_ogr_types_list):
    """PYOGRIO_USE_ARROW=1 should enable the Arrow read path by default."""
    # list types are only supported with arrow, so don't work by default and work
    # when arrow is enabled through env variable
    result = read_dataframe(test_ogr_types_list)
    assert "list_int64" not in result.columns
    with use_arrow_context():
        result = read_dataframe(test_ogr_types_list)
    assert "list_int64" in result.columns

View File

@@ -0,0 +1,496 @@
import numpy as np
from numpy import array_equal, allclose
import pytest
from pyogrio import (
__gdal_version__,
__gdal_geos_version__,
list_drivers,
list_layers,
read_bounds,
read_info,
set_gdal_config_options,
get_gdal_config_option,
get_gdal_data_path,
)
from pyogrio.core import detect_write_driver
from pyogrio.errors import DataSourceError, DataLayerError
from pyogrio.tests.conftest import HAS_SHAPELY, prepare_testfile
from pyogrio._env import GDALEnv
with GDALEnv():
# NOTE: this must be AFTER above imports, which init the GDAL and PROJ data
# search paths
from pyogrio._ogr import ogr_driver_supports_write, has_gdal_data, has_proj_data
try:
import shapely
except ImportError:
pass
def test_gdal_data():
# test will fail if GDAL data files cannot be found, indicating an
# installation error
assert has_gdal_data()
def test_proj_data():
# test will fail if PROJ data files cannot be found, indicating an
# installation error
assert has_proj_data()
def test_get_gdal_data_path():
# test will fail if the function returns None, which means that GDAL
# cannot find data files, indicating an installation error
assert isinstance(get_gdal_data_path(), str)
def test_gdal_geos_version():
assert __gdal_geos_version__ is None or isinstance(__gdal_geos_version__, tuple)
@pytest.mark.parametrize(
"path,expected",
[
("test.shp", "ESRI Shapefile"),
("test.shp.zip", "ESRI Shapefile"),
("test.geojson", "GeoJSON"),
("test.geojsonl", "GeoJSONSeq"),
("test.gpkg", "GPKG"),
pytest.param(
"test.gpkg.zip",
"GPKG",
marks=pytest.mark.skipif(
__gdal_version__ < (3, 7, 0),
reason="writing *.gpkg.zip requires GDAL >= 3.7.0",
),
),
# postgres can be detected by prefix instead of extension
pytest.param(
"PG:dbname=test",
"PostgreSQL",
marks=pytest.mark.skipif(
"PostgreSQL" not in list_drivers(),
reason="PostgreSQL path test requires PostgreSQL driver",
),
),
],
)
def test_detect_write_driver(path, expected):
assert detect_write_driver(path) == expected
@pytest.mark.parametrize(
"path",
[
"test.svg", # only supports read
"test.", # not a valid extension
"test", # no extension or prefix
"test.foo", # not a valid extension
"FOO:test", # not a valid prefix
],
)
def test_detect_write_driver_unsupported(path):
with pytest.raises(ValueError, match="Could not infer driver from path"):
detect_write_driver(path)
@pytest.mark.parametrize("path", ["test.xml", "test.txt"])
def test_detect_write_driver_multiple_unsupported(path):
with pytest.raises(ValueError, match="multiple drivers are available"):
detect_write_driver(path)
@pytest.mark.parametrize(
"driver,expected",
[
# drivers known to be well-supported by pyogrio
("ESRI Shapefile", True),
("GeoJSON", True),
("GeoJSONSeq", True),
("GPKG", True),
# drivers not supported for write by GDAL
("HTTP", False),
("OAPIF", False),
],
)
def test_ogr_driver_supports_write(driver, expected):
assert ogr_driver_supports_write(driver) == expected
def test_list_drivers():
all_drivers = list_drivers()
# verify that the core drivers are present
for name in ("ESRI Shapefile", "GeoJSON", "GeoJSONSeq", "GPKG", "OpenFileGDB"):
assert name in all_drivers
expected_capability = "rw"
if name == "OpenFileGDB" and __gdal_version__ < (3, 6, 0):
expected_capability = "r"
assert all_drivers[name] == expected_capability
drivers = list_drivers(read=True)
expected = {k: v for k, v in all_drivers.items() if v.startswith("r")}
assert len(drivers) == len(expected)
drivers = list_drivers(write=True)
expected = {k: v for k, v in all_drivers.items() if v.endswith("w")}
assert len(drivers) == len(expected)
drivers = list_drivers(read=True, write=True)
expected = {
k: v for k, v in all_drivers.items() if v.startswith("r") and v.endswith("w")
}
assert len(drivers) == len(expected)
def test_list_layers(naturalearth_lowres, naturalearth_lowres_vsi, test_fgdb_vsi):
assert array_equal(
list_layers(naturalearth_lowres), [["naturalearth_lowres", "Polygon"]]
)
assert array_equal(
list_layers(naturalearth_lowres_vsi[1]), [["naturalearth_lowres", "Polygon"]]
)
# Measured 3D is downgraded to plain 3D during read
# Make sure this warning is raised
with pytest.warns(
UserWarning, match=r"Measured \(M\) geometry types are not supported"
):
fgdb_layers = list_layers(test_fgdb_vsi)
# GDAL >= 3.4.0 includes 'another_relationship' layer
assert len(fgdb_layers) >= 7
# Make sure that nonspatial layer has None for geometry
assert array_equal(fgdb_layers[0], ["basetable_2", None])
# Confirm that measured 3D is downgraded to plain 3D during read
assert array_equal(fgdb_layers[3], ["test_lines", "MultiLineString Z"])
assert array_equal(fgdb_layers[6], ["test_areas", "MultiPolygon Z"])
def test_read_bounds(naturalearth_lowres):
fids, bounds = read_bounds(naturalearth_lowres)
assert fids.shape == (177,)
assert bounds.shape == (4, 177)
assert fids[0] == 0
# Fiji; wraps antimeridian
assert allclose(bounds[:, 0], [-180.0, -18.28799, 180.0, -16.02088])
def test_read_bounds_max_features(naturalearth_lowres):
bounds = read_bounds(naturalearth_lowres, max_features=2)[1]
assert bounds.shape == (4, 2)
def test_read_bounds_negative_max_features(naturalearth_lowres):
with pytest.raises(ValueError, match="'max_features' must be >= 0"):
read_bounds(naturalearth_lowres, max_features=-1)
def test_read_bounds_skip_features(naturalearth_lowres):
expected_bounds = read_bounds(naturalearth_lowres, max_features=11)[1][:, 10]
fids, bounds = read_bounds(naturalearth_lowres, skip_features=10)
assert bounds.shape == (4, 167)
assert allclose(bounds[:, 0], expected_bounds)
assert fids[0] == 10
def test_read_bounds_negative_skip_features(naturalearth_lowres):
with pytest.raises(ValueError, match="'skip_features' must be >= 0"):
read_bounds(naturalearth_lowres, skip_features=-1)
def test_read_bounds_where_invalid(naturalearth_lowres_all_ext):
with pytest.raises(ValueError, match="Invalid SQL"):
read_bounds(naturalearth_lowres_all_ext, where="invalid")
def test_read_bounds_where(naturalearth_lowres):
fids, bounds = read_bounds(naturalearth_lowres, where="iso_a3 = 'CAN'")
assert fids.shape == (1,)
assert bounds.shape == (4, 1)
assert fids[0] == 3
assert allclose(bounds[:, 0], [-140.99778, 41.675105, -52.648099, 83.23324])
@pytest.mark.parametrize("bbox", [(1,), (1, 2), (1, 2, 3)])
def test_read_bounds_bbox_invalid(naturalearth_lowres, bbox):
with pytest.raises(ValueError, match="Invalid bbox"):
read_bounds(naturalearth_lowres, bbox=bbox)
def test_read_bounds_bbox(naturalearth_lowres_all_ext):
# should return no features
fids, bounds = read_bounds(
naturalearth_lowres_all_ext, bbox=(0, 0, 0.00001, 0.00001)
)
assert fids.shape == (0,)
assert bounds.shape == (4, 0)
fids, bounds = read_bounds(naturalearth_lowres_all_ext, bbox=(-85, 8, -80, 10))
assert fids.shape == (2,)
if naturalearth_lowres_all_ext.suffix == ".gpkg":
# fid in gpkg is 1-based
assert array_equal(fids, [34, 35]) # PAN, CRI
else:
# fid in other formats is 0-based
assert array_equal(fids, [33, 34]) # PAN, CRI
assert bounds.shape == (4, 2)
assert allclose(
bounds.T,
[
[-82.96578305, 7.22054149, -77.24256649, 9.61161001],
[-85.94172543, 8.22502798, -82.54619626, 11.21711925],
],
)
@pytest.mark.skipif(
not HAS_SHAPELY, reason="Shapely is required for mask functionality"
)
@pytest.mark.parametrize(
"mask",
[
{"type": "Point", "coordinates": [0, 0]},
'{"type": "Point", "coordinates": [0, 0]}',
"invalid",
],
)
def test_read_bounds_mask_invalid(naturalearth_lowres, mask):
with pytest.raises(ValueError, match="'mask' parameter must be a Shapely geometry"):
read_bounds(naturalearth_lowres, mask=mask)
@pytest.mark.skipif(
not HAS_SHAPELY, reason="Shapely is required for mask functionality"
)
def test_read_bounds_bbox_mask_invalid(naturalearth_lowres):
with pytest.raises(ValueError, match="cannot set both 'bbox' and 'mask'"):
read_bounds(
naturalearth_lowres, bbox=(-85, 8, -80, 10), mask=shapely.Point(-105, 55)
)
@pytest.mark.skipif(
not HAS_SHAPELY, reason="Shapely is required for mask functionality"
)
@pytest.mark.parametrize(
"mask,expected",
[
("POINT (-105 55)", [3]),
("POLYGON ((-80 8, -80 10, -85 10, -85 8, -80 8))", [33, 34]),
(
"""POLYGON ((
6.101929 50.97085,
5.773002 50.906611,
5.593156 50.642649,
6.059271 50.686052,
6.374064 50.851481,
6.101929 50.97085
))""",
[121, 129, 130],
),
(
"""GEOMETRYCOLLECTION (
POINT (-7.7 53),
POLYGON ((-80 8, -80 10, -85 10, -85 8, -80 8))
)""",
[33, 34, 133],
),
],
)
def test_read_bounds_mask(naturalearth_lowres_all_ext, mask, expected):
mask = shapely.from_wkt(mask)
fids = read_bounds(naturalearth_lowres_all_ext, mask=mask)[0]
if naturalearth_lowres_all_ext.suffix == ".gpkg":
# fid in gpkg is 1-based
assert array_equal(fids, np.array(expected) + 1)
else:
# fid in other formats is 0-based
assert array_equal(fids, expected)
@pytest.mark.skipif(
__gdal_version__ < (3, 4, 0),
reason="Cannot determine if GEOS is present or absent for GDAL < 3.4",
)
def test_read_bounds_bbox_intersects_vs_envelope_overlaps(naturalearth_lowres_all_ext):
# If GEOS is present and used by GDAL, bbox filter will be based on intersection
# of bbox and actual geometries; if GEOS is absent or not used by GDAL, it
# will be based on overlap of bounding boxes instead
fids, _ = read_bounds(naturalearth_lowres_all_ext, bbox=(-140, 20, -100, 45))
if __gdal_geos_version__ is None:
# bboxes for CAN, RUS overlap but do not intersect geometries
assert fids.shape == (4,)
if naturalearth_lowres_all_ext.suffix == ".gpkg":
# fid in gpkg is 1-based
assert array_equal(fids, [4, 5, 19, 28]) # CAN, USA, RUS, MEX
else:
# fid in other formats is 0-based
assert array_equal(fids, [3, 4, 18, 27]) # CAN, USA, RUS, MEX
else:
assert fids.shape == (2,)
if naturalearth_lowres_all_ext.suffix == ".gpkg":
# fid in gpkg is 1-based
assert array_equal(fids, [5, 28]) # USA, MEX
else:
# fid in other formats is 0-based
assert array_equal(fids, [4, 27]) # USA, MEX
def test_read_info(naturalearth_lowres):
meta = read_info(naturalearth_lowres)
assert meta["crs"] == "EPSG:4326"
assert meta["geometry_type"] == "Polygon"
assert meta["encoding"] == "UTF-8"
assert meta["fields"].shape == (5,)
assert meta["dtypes"].tolist() == ["int64", "object", "object", "object", "float64"]
assert meta["features"] == 177
assert allclose(meta["total_bounds"], (-180, -90, 180, 83.64513))
assert meta["driver"] == "ESRI Shapefile"
assert meta["capabilities"]["random_read"] is True
assert meta["capabilities"]["fast_set_next_by_index"] is True
assert meta["capabilities"]["fast_spatial_filter"] is False
assert meta["capabilities"]["fast_feature_count"] is True
assert meta["capabilities"]["fast_total_bounds"] is True
@pytest.mark.parametrize(
"dataset_kwargs,fields",
[
({}, ["top_level", "intermediate_level"]),
(
{"FLATTEN_NESTED_ATTRIBUTES": "YES"},
[
"top_level",
"intermediate_level_bottom_level",
],
),
(
{"flatten_nested_attributes": "yes"},
[
"top_level",
"intermediate_level_bottom_level",
],
),
(
{"flatten_nested_attributes": True},
[
"top_level",
"intermediate_level_bottom_level",
],
),
],
)
def test_read_info_dataset_kwargs(data_dir, dataset_kwargs, fields):
meta = read_info(data_dir / "test_nested.geojson", **dataset_kwargs)
assert meta["fields"].tolist() == fields
def test_read_info_invalid_dataset_kwargs(naturalearth_lowres):
with pytest.warns(RuntimeWarning, match="does not support open option INVALID"):
read_info(naturalearth_lowres, INVALID="YES")
def test_read_info_force_feature_count_exception(data_dir):
with pytest.raises(DataLayerError, match="Could not iterate over features"):
read_info(data_dir / "sample.osm.pbf", layer="lines", force_feature_count=True)
@pytest.mark.parametrize(
"layer, force, expected",
[
("points", False, -1),
("points", True, 8),
("lines", False, -1),
("lines", True, 36),
],
)
def test_read_info_force_feature_count(data_dir, layer, force, expected):
# the sample OSM file has non-increasing node IDs which causes the default
# custom indexing to raise an exception iterating over features
meta = read_info(
data_dir / "sample.osm.pbf",
layer=layer,
force_feature_count=force,
USE_CUSTOM_INDEXING=False,
)
assert meta["features"] == expected
@pytest.mark.parametrize(
    "force_total_bounds, expected_total_bounds",
    [(True, (-180.0, -90.0, 180.0, 83.64513)), (False, None)],
)
def test_read_info_force_total_bounds(
    tmpdir, naturalearth_lowres, force_total_bounds, expected_total_bounds
):
    # GeoJSON files don't have a fast way to determine total_bounds
    geojson_path = prepare_testfile(naturalearth_lowres, dst_dir=tmpdir, ext=".geojson")
    info = read_info(geojson_path, force_total_bounds=force_total_bounds)
    if expected_total_bounds is not None:
        assert allclose(info["total_bounds"], expected_total_bounds)
    else:
        assert info["total_bounds"] is None
def test_read_info_without_geometry(test_fgdb_vsi):
assert read_info(test_fgdb_vsi)["total_bounds"] is None
@pytest.mark.parametrize(
"name,value,expected",
[
("CPL_DEBUG", "ON", True),
("CPL_DEBUG", True, True),
("CPL_DEBUG", "OFF", False),
("CPL_DEBUG", False, False),
],
)
def test_set_config_options(name, value, expected):
set_gdal_config_options({name: value})
actual = get_gdal_config_option(name)
assert actual == expected
def test_reset_config_options():
set_gdal_config_options({"foo": "bar"})
assert get_gdal_config_option("foo") == "bar"
set_gdal_config_options({"foo": None})
assert get_gdal_config_option("foo") is None
def test_error_handling(capfd):
# an operation that triggers a GDAL Failure
# -> error translated into Python exception + not printed to stderr
with pytest.raises(DataSourceError, match="No such file or directory"):
read_info("non-existent.shp")
assert capfd.readouterr().err == ""
def test_error_handling_warning(capfd, naturalearth_lowres):
# an operation that triggers a GDAL Warning
# -> translated into a Python warning + not printed to stderr
with pytest.warns(RuntimeWarning, match="does not support open option INVALID"):
read_info(naturalearth_lowres, INVALID="YES")
assert capfd.readouterr().err == ""

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,332 @@
import os
import contextlib
from zipfile import ZipFile, ZIP_DEFLATED
import pytest
import pyogrio
import pyogrio.raw
from pyogrio.util import vsi_path
try:
import geopandas # NOQA
has_geopandas = True
except ImportError:
has_geopandas = False
@contextlib.contextmanager
def change_cwd(path):
    """Temporarily change the working directory to *path*, restoring on exit."""
    previous_dir = os.getcwd()
    os.chdir(str(path))
    try:
        yield
    finally:
        # always return to where we started, even if the body raised
        os.chdir(previous_dir)
@pytest.mark.parametrize(
"path, expected",
[
# local file paths that should be passed through as is
("data.gpkg", "data.gpkg"),
("/home/user/data.gpkg", "/home/user/data.gpkg"),
(r"C:\User\Documents\data.gpkg", r"C:\User\Documents\data.gpkg"),
("file:///home/user/data.gpkg", "/home/user/data.gpkg"),
# cloud URIs
("https://testing/data.gpkg", "/vsicurl/https://testing/data.gpkg"),
("s3://testing/data.gpkg", "/vsis3/testing/data.gpkg"),
("gs://testing/data.gpkg", "/vsigs/testing/data.gpkg"),
("az://testing/data.gpkg", "/vsiaz/testing/data.gpkg"),
("adl://testing/data.gpkg", "/vsiadls/testing/data.gpkg"),
("adls://testing/data.gpkg", "/vsiadls/testing/data.gpkg"),
("hdfs://testing/data.gpkg", "/vsihdfs/testing/data.gpkg"),
("webhdfs://testing/data.gpkg", "/vsiwebhdfs/testing/data.gpkg"),
# archives
("zip://data.zip", "/vsizip/data.zip"),
("tar://data.tar", "/vsitar/data.tar"),
("gzip://data.gz", "/vsigzip/data.gz"),
("tar://./my.tar!my.geojson", "/vsitar/./my.tar/my.geojson"),
(
"zip://home/data/shapefile.zip!layer.shp",
"/vsizip/home/data/shapefile.zip/layer.shp",
),
# combined schemes
("zip+s3://testing/shapefile.zip", "/vsizip/vsis3/testing/shapefile.zip"),
(
"zip+https://s3.amazonaws.com/testing/shapefile.zip",
"/vsizip/vsicurl/https://s3.amazonaws.com/testing/shapefile.zip",
),
# auto-prefix zip files
("test.zip", "/vsizip/test.zip"),
("/a/b/test.zip", "/vsizip//a/b/test.zip"),
("a/b/test.zip", "/vsizip/a/b/test.zip"),
# archives using ! notation should be prefixed by vsizip
("test.zip!item.shp", "/vsizip/test.zip/item.shp"),
("test.zip!/a/b/item.shp", "/vsizip/test.zip/a/b/item.shp"),
("test.zip!a/b/item.shp", "/vsizip/test.zip/a/b/item.shp"),
("/vsizip/test.zip/a/b/item.shp", "/vsizip/test.zip/a/b/item.shp"),
("zip:///test.zip/a/b/item.shp", "/vsizip//test.zip/a/b/item.shp"),
# auto-prefix remote zip files
(
"https://s3.amazonaws.com/testing/test.zip",
"/vsizip/vsicurl/https://s3.amazonaws.com/testing/test.zip",
),
(
"https://s3.amazonaws.com/testing/test.zip!/a/b/item.shp",
"/vsizip/vsicurl/https://s3.amazonaws.com/testing/test.zip/a/b/item.shp",
),
("s3://testing/test.zip", "/vsizip/vsis3/testing/test.zip"),
(
"s3://testing/test.zip!a/b/item.shp",
"/vsizip/vsis3/testing/test.zip/a/b/item.shp",
),
],
)
def test_vsi_path(path, expected):
assert vsi_path(path) == expected
def test_vsi_path_unknown():
# unrecognized URI gets passed through as is
assert vsi_path("s4://test/data.geojson") == "s4://test/data.geojson"
def test_vsi_handling_read_functions(naturalearth_lowres_vsi):
# test that all different read entry points have the path handling
# (a zip:// path would otherwise fail)
path, _ = naturalearth_lowres_vsi
path = "zip://" + str(path)
result = pyogrio.raw.read(path)
assert len(result[2]) == 177
result = pyogrio.read_info(path)
assert result["features"] == 177
result = pyogrio.read_bounds(path)
assert len(result[0]) == 177
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_vsi_handling_read_dataframe(naturalearth_lowres_vsi):
path, _ = naturalearth_lowres_vsi
path = "zip://" + str(path)
result = pyogrio.read_dataframe(path)
assert len(result) == 177
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_path_absolute(data_dir):
    """Absolute paths are accepted both as pathlib.Path and as str."""
    shp = data_dir / "naturalearth_lowres/naturalearth_lowres.shp"
    for candidate in (shp, str(shp)):
        frame = pyogrio.read_dataframe(candidate)
        assert len(frame) == 177
def test_path_relative(data_dir):
    """Relative paths resolve against the current working directory."""
    rel = "naturalearth_lowres/naturalearth_lowres.shp"
    with change_cwd(data_dir):
        geometry = pyogrio.raw.read(rel)[2]
        assert len(geometry) == 177

        info = pyogrio.read_info(rel)
        assert info["features"] == 177

        bounds_result = pyogrio.read_bounds(rel)
        assert len(bounds_result[0]) == 177
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_path_relative_dataframe(data_dir):
    """read_dataframe also resolves relative paths against the cwd."""
    with change_cwd(data_dir):
        frame = pyogrio.read_dataframe(
            "naturalearth_lowres/naturalearth_lowres.shp"
        )
        assert len(frame) == 177
def test_uri_local_file(data_dir):
    """file:// URIs to local files work for all read entry points."""
    shp = data_dir / "naturalearth_lowres/naturalearth_lowres.shp"
    uri = f"file://{shp}"

    assert len(pyogrio.raw.read(uri)[2]) == 177
    assert pyogrio.read_info(uri)["features"] == 177
    assert len(pyogrio.read_bounds(uri)[0]) == 177
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_uri_local_file_dataframe(data_dir):
    """read_dataframe accepts file:// URIs to local files."""
    shp = data_dir / "naturalearth_lowres/naturalearth_lowres.shp"
    assert len(pyogrio.read_dataframe(f"file://{shp}")) == 177
def test_zip_path(naturalearth_lowres_vsi):
    """zip:// and /vsizip/ paths work for raw read, read_info and read_bounds."""

    def check_all_reads(candidate):
        # every read entry point should see all 177 features
        assert len(pyogrio.raw.read(candidate)[2]) == 177
        assert pyogrio.read_info(candidate)["features"] == 177
        assert len(pyogrio.read_bounds(candidate)[0]) == 177

    path, path_vsi = naturalearth_lowres_vsi

    # absolute zip path
    check_all_reads(f"zip://{path}")

    # absolute vsizip path
    check_all_reads(path_vsi)

    # relative zip path
    with change_cwd(path.parent):
        check_all_reads(f"zip://{path.name}")
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_zip_path_dataframe(naturalearth_lowres_vsi):
    """read_dataframe works with zip:// and /vsizip/ paths, absolute or relative."""
    path, path_vsi = naturalearth_lowres_vsi

    # absolute zip path
    assert len(pyogrio.read_dataframe(f"zip://{path}")) == 177

    # absolute vsizip path
    assert len(pyogrio.read_dataframe(path_vsi)) == 177

    # relative zip path
    with change_cwd(path.parent):
        assert len(pyogrio.read_dataframe(f"zip://{path.name}")) == 177
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_detect_zip_path(tmp_path, naturalearth_lowres):
    """Zip archives are auto-detected and members are addressable with "!".

    Builds a zip containing two shapefiles -- test1.shp at the archive root
    and test2.shp under the a/b/ subdirectory -- then verifies the supported
    ways of addressing archive members (bare path, "!" specifier, zip://
    scheme, and an explicit /vsizip/ path).
    """
    # create a zipfile with 2 shapefiles in a set of subdirectories
    df = pyogrio.read_dataframe(naturalearth_lowres, where="iso_a3 in ('CAN', 'PER')")
    pyogrio.write_dataframe(df.loc[df.iso_a3 == "CAN"], tmp_path / "test1.shp")
    pyogrio.write_dataframe(df.loc[df.iso_a3 == "PER"], tmp_path / "test2.shp")
    path = tmp_path / "test.zip"
    with ZipFile(path, mode="w", compression=ZIP_DEFLATED, compresslevel=5) as out:
        for ext in ["dbf", "prj", "shp", "shx"]:
            filename = f"test1.{ext}"
            out.write(tmp_path / filename, filename)
            filename = f"test2.{ext}"
            # BUG FIX: the archive name must include the member filename so
            # that the test2.* sidecar files land under a/b/ inside the zip
            # (the /a/b/test2.shp reads below depend on that layout)
            out.write(tmp_path / filename, f"/a/b/{filename}")
    # defaults to the first shapefile found, at lowest subdirectory
    df = pyogrio.read_dataframe(path)
    assert df.iso_a3[0] == "CAN"
    # selecting a shapefile from within the zip requires the "!" archive specifier
    df = pyogrio.read_dataframe(f"{path}!test1.shp")
    assert df.iso_a3[0] == "CAN"
    df = pyogrio.read_dataframe(f"{path}!/a/b/test2.shp")
    assert df.iso_a3[0] == "PER"
    # specifying zip:// scheme should also work
    df = pyogrio.read_dataframe(f"zip://{path}!/a/b/test2.shp")
    assert df.iso_a3[0] == "PER"
    # specifying /vsizip/ should also work but path must already be in GDAL ready
    # format without the "!" archive specifier
    df = pyogrio.read_dataframe(f"/vsizip/{path}/a/b/test2.shp")
    assert df.iso_a3[0] == "PER"
@pytest.mark.network
def test_url():
    """Plain https:// URLs are readable by all read entry points."""
    url = "https://raw.githubusercontent.com/geopandas/pyogrio/main/pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.shp"  # NOQA
    assert len(pyogrio.raw.read(url)[2]) == 177
    assert pyogrio.read_info(url)["features"] == 177
    assert len(pyogrio.read_bounds(url)[0]) == 177
@pytest.mark.network
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_url_dataframe():
    """read_dataframe reads directly from a remote https:// URL.

    CONSISTENCY FIX: this test fetches a remote file, so it carries the
    ``network`` mark like every other network-hitting test in this file
    (test_url, test_url_with_zip_dataframe, test_uri_s3_dataframe).
    """
    url = "https://raw.githubusercontent.com/geopandas/pyogrio/main/pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.shp"  # NOQA
    assert len(pyogrio.read_dataframe(url)) == 177
@pytest.mark.network
def test_url_with_zip():
    """zip+https:// URLs combine the zip and curl VSI handlers."""
    url = "zip+https://s3.amazonaws.com/fiona-testing/coutwildrnp.zip"

    geometry = pyogrio.raw.read(url)[2]
    assert len(geometry) == 67

    info = pyogrio.read_info(url)
    assert info["features"] == 67

    bounds_result = pyogrio.read_bounds(url)
    assert len(bounds_result[0]) == 67
@pytest.mark.network
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_url_with_zip_dataframe():
    """read_dataframe reads a remote zipped dataset via zip+https://."""
    frame = pyogrio.read_dataframe(
        "zip+https://s3.amazonaws.com/fiona-testing/coutwildrnp.zip"
    )
    assert len(frame) == 67
@pytest.fixture
def aws_env_setup(monkeypatch):
    # Allow anonymous access to public S3 buckets so the s3:// tests do not
    # require AWS credentials; monkeypatch restores the env var afterwards.
    monkeypatch.setenv("AWS_NO_SIGN_REQUEST", "YES")
@pytest.mark.network
def test_uri_s3(aws_env_setup):
    """zip+s3:// URIs are readable by all read entry points."""
    uri = "zip+s3://fiona-testing/coutwildrnp.zip"
    assert len(pyogrio.raw.read(uri)[2]) == 67
    assert pyogrio.read_info(uri)["features"] == 67
    assert len(pyogrio.read_bounds(uri)[0]) == 67
@pytest.mark.network
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_uri_s3_dataframe(aws_env_setup):
    """read_dataframe reads a zipped dataset directly from S3."""
    frame = pyogrio.read_dataframe("zip+s3://fiona-testing/coutwildrnp.zip")
    assert len(frame) == 67

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,86 @@
"""Run pytest tests manually on Windows due to import errors
"""
from pathlib import Path
import platform
from tempfile import TemporaryDirectory
# Resolve the fixtures directory relative to this script.
data_dir = Path(__file__).parent.resolve() / "fixtures"

if platform.system() == "Windows":
    naturalearth_lowres = data_dir / Path("naturalearth_lowres/naturalearth_lowres.shp")
    test_fgdb_vsi = f"/vsizip/{data_dir}/test_fgdb.gdb.zip"

    def _attempt(test_func, *args):
        # Run a single test function; print (rather than raise) any failure
        # so the remaining tests still execute.
        try:
            test_func(*args)
        except Exception as ex:
            print(ex)

    from pyogrio.tests.test_core import test_read_info

    _attempt(test_read_info, naturalearth_lowres)

    from pyogrio.tests.test_raw_io import (
        test_read,
        test_read_no_geometry,
        test_read_columns,
        test_read_skip_features,
        test_read_max_features,
        test_read_where,
        test_read_where_invalid,
        test_write,
        test_write_gpkg,
        test_write_geojson,
    )

    # read tests all take the shapefile path as their only argument
    for read_test in (
        test_read,
        test_read_no_geometry,
        test_read_columns,
        test_read_skip_features,
        test_read_max_features,
        test_read_where,
        test_read_where_invalid,
    ):
        _attempt(read_test, naturalearth_lowres)

    # write tests each get a fresh temporary output directory
    for write_test in (test_write, test_write_gpkg, test_write_geojson):
        with TemporaryDirectory() as tmpdir:
            _attempt(write_test, tmpdir, naturalearth_lowres)