that's too much!

2024-12-19 20:22:56 -08:00
parent 0020a609dd
commit 32cd60e92b
8443 changed files with 1446950 additions and 42 deletions
--- a/.venv/lib/python3.12/site-packages/geopandas/tools/sjoin.py
+++ b/.venv/lib/python3.12/site-packages/geopandas/tools/sjoin.py
@@ -0,0 +1,553 @@
+from typing import Optional
+import warnings
+
+import numpy as np
+import pandas as pd
+
+from geopandas import GeoDataFrame
+from geopandas import _compat as compat
+from geopandas.array import _check_crs, _crs_mismatch_warn
+
+
+def sjoin(
+    left_df,
+    right_df,
+    how="inner",
+    predicate="intersects",
+    lsuffix="left",
+    rsuffix="right",
+    **kwargs,
+):
+    """Spatial join of two GeoDataFrames.
+
+    See the User Guide page :doc:`../../user_guide/mergingdata` for details.
+
+
+    Parameters
+    ----------
+    left_df, right_df : GeoDataFrames
+    how : string, default 'inner'
+        The type of join:
+
+        * 'left': use keys from left_df; retain only left_df geometry column
+        * 'right': use keys from right_df; retain only right_df geometry column
+        * 'inner': use intersection of keys from both dfs; retain only
+          left_df geometry column
+    predicate : string, default 'intersects'
+        Binary predicate. Valid values are determined by the spatial index used.
+        You can check the valid values in left_df or right_df as
+        ``left_df.sindex.valid_query_predicates`` or
+        ``right_df.sindex.valid_query_predicates``
+        Replaces deprecated ``op`` parameter.
+    lsuffix : string, default 'left'
+        Suffix to apply to overlapping column names (left GeoDataFrame).
+    rsuffix : string, default 'right'
+        Suffix to apply to overlapping column names (right GeoDataFrame).
+    **kwargs
+        Only the deprecated ``op`` keyword is accepted. When given, it
+        overrides ``predicate`` and a warning is emitted.
+
+    Returns
+    -------
+    GeoDataFrame
+        The joined frame produced by the frame-level join.
+
+    Raises
+    ------
+    TypeError
+        If any keyword argument other than ``op`` is passed.
+    ValueError
+        If the inputs fail the basic validity checks (wrong frame type,
+        unknown ``how``, or reserved ``index_<suffix>`` column names).
+
+    Examples
+    --------
+    >>> import geodatasets
+    >>> chicago = geopandas.read_file(
+    ...     geodatasets.get_path("geoda.chicago_health")
+    ... )
+    >>> groceries = geopandas.read_file(
+    ...     geodatasets.get_path("geoda.groceries")
+    ... ).to_crs(chicago.crs)
+
+    >>> chicago.head()  # doctest: +SKIP
+        ComAreaID  ...                                           geometry
+    0         35  ...  POLYGON ((-87.60914 41.84469, -87.60915 41.844...
+    1         36  ...  POLYGON ((-87.59215 41.81693, -87.59231 41.816...
+    2         37  ...  POLYGON ((-87.62880 41.80189, -87.62879 41.801...
+    3         38  ...  POLYGON ((-87.60671 41.81681, -87.60670 41.816...
+    4         39  ...  POLYGON ((-87.59215 41.81693, -87.59215 41.816...
+    [5 rows x 87 columns]
+
+    >>> groceries.head()  # doctest: +SKIP
+        OBJECTID     Ycoord  ...  Category                         geometry
+    0        16  41.973266  ...       NaN  MULTIPOINT (-87.65661 41.97321)
+    1        18  41.696367  ...       NaN  MULTIPOINT (-87.68136 41.69713)
+    2        22  41.868634  ...       NaN  MULTIPOINT (-87.63918 41.86847)
+    3        23  41.877590  ...       new  MULTIPOINT (-87.65495 41.87783)
+    4        27  41.737696  ...       NaN  MULTIPOINT (-87.62715 41.73623)
+    [5 rows x 8 columns]
+
+    >>> groceries_w_communities = geopandas.sjoin(groceries, chicago)
+    >>> groceries_w_communities.head()  # doctest: +SKIP
+            OBJECTID     Ycoord     Xcoord  ... GonorrF GonorrM Tuberc
+    0          16  41.973266 -87.657073  ...   170.8   468.7   13.6
+    87        365  41.961707 -87.654058  ...   170.8   468.7   13.6
+    90        373  41.963131 -87.656352  ...   170.8   468.7   13.6
+    140       582  41.969131 -87.674882  ...   170.8   468.7   13.6
+    1          18  41.696367 -87.681315  ...   800.5   741.1    2.6
+    [5 rows x 95 columns]
+
+    See also
+    --------
+    overlay : overlay operation resulting in a new geometry
+    GeoDataFrame.sjoin : equivalent method
+
+    Notes
+    -----
+    Every operation in GeoPandas is planar, i.e. the potential third
+    dimension is not taken into account.
+    """
+    if "op" in kwargs:
+        # Backwards compatibility: `op` is the deprecated spelling of
+        # `predicate` and, when supplied, always wins.
+        op = kwargs.pop("op")
+        deprecation_message = (
+            "The `op` parameter is deprecated and will be removed"
+            " in a future release. Please use the `predicate` parameter"
+            " instead."
+        )
+        if predicate != "intersects" and op != predicate:
+            # Both were given and they disagree: warn about the override
+            # explicitly rather than only about the deprecation.
+            override_message = (
+                "A non-default value for `predicate` was passed"
+                f' (got `predicate="{predicate}"`'
+                f' in combination with `op="{op}"`).'
+                " The value of `predicate` will be overridden by the value of `op`,"
+                " which may result in unexpected behavior."
+                f"\n{deprecation_message}"
+            )
+            warnings.warn(override_message, UserWarning, stacklevel=4)
+        else:
+            warnings.warn(deprecation_message, FutureWarning, stacklevel=4)
+        predicate = op
+    if kwargs:
+        # Mimic the interpreter's error for an unknown keyword argument,
+        # reporting the first leftover name.
+        first = next(iter(kwargs))
+        raise TypeError(f"sjoin() got an unexpected keyword argument '{first}'")
+
+    _basic_checks(left_df, right_df, how, lsuffix, rsuffix)
+
+    indices = _geom_predicate_query(left_df, right_df, predicate)
+
+    joined = _frame_join(indices, left_df, right_df, how, lsuffix, rsuffix)
+
+    return joined
+
+
+def _basic_checks(left_df, right_df, how, lsuffix, rsuffix):
+    """Validate the inputs of a spatial join before any work is done.
+
+    Both frames must be GeoDataFrames and `how` must be one of the
+    supported join types. A CRS mismatch between the frames only triggers
+    a warning. Neither frame may already contain a column named
+    `'index_'` + `lsuffix` or `'index_'` + `rsuffix`.
+
+    Parameters
+    ----------
+    left_df : GeoDataFrame
+    right_df : GeoDataFrame
+    how : str, one of 'left', 'right', 'inner'
+        join type
+    lsuffix : str
+        left index suffix
+    rsuffix : str
+        right index suffix
+
+    Raises
+    ------
+    ValueError
+        If a frame is not a GeoDataFrame, if `how` is not an allowed
+        value, or if a reserved index column name is already in use.
+    """
+    for arg_name, frame in (("left_df", left_df), ("right_df", right_df)):
+        if not isinstance(frame, GeoDataFrame):
+            raise ValueError(
+                f"'{arg_name}' should be GeoDataFrame, got {type(frame)}"
+            )
+
+    allowed_hows = ["left", "right", "inner"]
+    if how not in allowed_hows:
+        raise ValueError(
+            f'`how` was "{how}" but is expected to be in {allowed_hows}'
+        )
+
+    if not _check_crs(left_df, right_df):
+        _crs_mismatch_warn(left_df, right_df, stacklevel=4)
+
+    index_left = f"index_{lsuffix}"
+    index_right = f"index_{rsuffix}"
+    reserved = [index_left, index_right]
+
+    # due to GH 352
+    if any(frame.columns.isin(reserved).any() for frame in (left_df, right_df)):
+        raise ValueError(
+            f"'{index_left}' and '{index_right}' cannot be names in the frames being"
+            " joined"
+        )
+
+
+def _geom_predicate_query(left_df, right_df, predicate):
+    """Run the spatial-index query and return the matching row positions.
+
+    Parameters
+    ----------
+    left_df : GeoDataFrame
+    right_df : GeoDataFrame
+    predicate : string
+        Binary predicate to query.
+
+    Returns
+    -------
+    DataFrame
+        DataFrame with matching indices in
+        columns named `_key_left` and `_key_right`.
+    """
+    # "within" is evaluated as the inverse of "contains" (a faster predicate,
+    # see discussion at https://github.com/geopandas/geopandas/pull/1421):
+    # the roles of the two frames are swapped for the query and the result
+    # columns are labelled the other way round to compensate.
+    swap_sides = predicate == "within"
+
+    with warnings.catch_warnings():
+        # We don't need to show our own warning here
+        # TODO remove this once the deprecation has been enforced
+        warnings.filterwarnings(
+            "ignore", "Generated spatial index is empty", FutureWarning
+        )
+
+        if swap_sides:
+            tree, probes = left_df.sindex, right_df.geometry
+            tree_predicate = "contains"
+        else:
+            # every other predicate is passed through unchanged: the right
+            # frame's index is queried with the left frame's geometries
+            tree, probes = right_df.sindex, left_df.geometry
+            tree_predicate = predicate
+
+    # The first array returned by the query refers to the probing geometries,
+    # the second to the indexed ones; name the columns after the frame each
+    # side actually came from.
+    if swap_sides:
+        probe_key, tree_key = "_key_right", "_key_left"
+    else:
+        probe_key, tree_key = "_key_left", "_key_right"
+
+    if tree:
+        probe_idx, tree_idx = tree.query(
+            probes, predicate=tree_predicate, sort=False
+        )
+        return pd.DataFrame({probe_key: probe_idx, tree_key: tree_idx})
+
+    # when sindex is empty / has no valid geometries
+    return pd.DataFrame(columns=[probe_key, tree_key], dtype=float)
+
+
+def _frame_join(join_df, left_df, right_df, how, lsuffix, rsuffix):
+    """Join the GeoDataFrames at the DataFrame level.
+
+    Parameters
+    ----------
+    join_df : DataFrame
+        Indices and join data returned by the geometric join.
+        Must have columns `_key_left` and `_key_right`
+        with integer indices representing the matches
+        from `left_df` and `right_df` respectively.
+        Additional columns may be included and will be copied to
+        the resultant GeoDataFrame.
+    left_df : GeoDataFrame
+    right_df : GeoDataFrame
+    how : string
+        The type of join to use on the DataFrame level. ``'inner'`` and
+        ``'left'`` keep the left geometry column and the left index; any
+        other value is handled as ``'right'`` and keeps the right geometry
+        column and the right index.
+    lsuffix : string
+        Suffix to apply to overlapping column names (left GeoDataFrame).
+    rsuffix : string
+        Suffix to apply to overlapping column names (right GeoDataFrame).
+
+    Returns
+    -------
+    GeoDataFrame
+        Joined GeoDataFrame.
+    """
+    # the spatial index only allows limited (numeric) index types, but an
+    # index in geopandas may be any arbitrary dtype. so reset both indices now
+    # and store references to the original indices, to be reaffixed later.
+    # GH 352
+    left_df, index_left, left_index_name = _reset_index_with_suffix(
+        left_df, lsuffix
+    )
+    right_df, index_right, right_index_name = _reset_index_with_suffix(
+        right_df, rsuffix
+    )
+    suffixes = ("_{}".format(lsuffix), "_{}".format(rsuffix))
+
+    # perform join on the dataframes
+    if how in ("inner", "left"):
+        # Both variants run the same pipeline and differ only in the merge
+        # type, which is passed straight through to pandas.
+        join_df = join_df.set_index("_key_left")
+        joined = (
+            left_df.merge(join_df, left_index=True, right_index=True, how=how)
+            .merge(
+                right_df.drop(right_df.geometry.name, axis=1),
+                how=how,
+                left_on="_key_right",
+                right_index=True,
+                suffixes=suffixes,
+            )
+            .set_index(index_left)
+            .drop(["_key_right"], axis=1)
+        )
+        index_label, index_name = index_left, left_index_name
+
+    else:  # how == 'right':
+        joined = (
+            left_df.drop(left_df.geometry.name, axis=1)
+            .merge(
+                join_df.merge(
+                    right_df, left_on="_key_right", right_index=True, how="right"
+                ),
+                left_index=True,
+                right_on="_key_left",
+                how="right",
+                suffixes=suffixes,
+            )
+            .set_index(index_right)
+            .drop(["_key_left", "_key_right"], axis=1)
+            .set_geometry(right_df.geometry.name)
+        )
+        index_label, index_name = index_right, right_index_name
+
+    # give the result index back the name(s) the source frame used
+    if isinstance(index_label, list):
+        joined.index.names = index_name
+    else:
+        joined.index.name = index_name
+
+    return joined
+
+
+def _reset_index_with_suffix(df, suffix):
+    """Copy `df` and move its index into ``index_<suffix>`` column(s).
+
+    Parameters
+    ----------
+    df : GeoDataFrame
+    suffix : string
+        Suffix used to build the temporary index column name(s).
+
+    Returns
+    -------
+    tuple
+        ``(frame, index_label, original_name)``: the deep-copied frame with
+        its index reset, the column label(s) now holding the old index
+        (a string, or a list of strings when renaming with a single string
+        raised ``TypeError``), and the original index name(s).
+    """
+    index_label = "index_{}".format(suffix)
+    df = df.copy(deep=True)
+    try:
+        original_name = df.index.name
+        df.index = df.index.rename(index_label)
+    except TypeError:
+        # Renaming with a single string was rejected; fall back to one
+        # positional label per entry in `index.names`.
+        index_label = [
+            "index_{}".format(suffix + str(pos))
+            for pos in range(len(df.index.names))
+        ]
+        original_name = df.index.names
+        df.index = df.index.rename(index_label)
+    return df.reset_index(), index_label, original_name
+
+
+def _nearest_query(
+    left_df: GeoDataFrame,
+    right_df: GeoDataFrame,
+    max_distance: float,
+    how: str,
+    return_distance: bool,
+    exclusive: bool,
+):
+    """Query the spatial index for nearest geometries and tabulate the matches.
+
+    Parameters
+    ----------
+    left_df : GeoDataFrame
+    right_df : GeoDataFrame
+    max_distance : float
+        Forwarded to the spatial index ``nearest`` call.
+    how : str
+        Join type. For ``'right'`` the left frame's index is queried with the
+        right frame's geometries; otherwise the right frame's index is queried
+        with the left frame's geometries.
+    return_distance : bool
+        Whether the ``nearest`` call should also return distances.
+    exclusive : bool
+        Forwarded to the spatial index ``nearest`` call.
+
+    Returns
+    -------
+    DataFrame
+        Columns `_key_left`, `_key_right` and `distances`. The `distances`
+        column holds ``None`` when `return_distance` is False. The frame is
+        empty when the queried spatial index is empty.
+
+    Raises
+    ------
+    NotImplementedError
+        If neither Shapely >= 2.0 nor PyGEOS >= 0.10.0 is in use.
+    """
+    backend_supported = compat.USE_SHAPELY_20 or (
+        compat.USE_PYGEOS and compat.PYGEOS_GE_010
+    )
+    if not backend_supported:
+        raise NotImplementedError(
+            "Currently, only PyGEOS >= 0.10.0 or Shapely >= 2.0 supports "
+            "`nearest_all`. " + compat.INSTALL_PYGEOS_ERROR
+        )
+
+    # use the opposite of the join direction for the index
+    tree_is_left = how == "right"
+    tree_df, query_df = (left_df, right_df) if tree_is_left else (right_df, left_df)
+    sindex = tree_df.sindex
+    query = query_df.geometry
+
+    if not sindex:
+        # when sindex is empty / has no valid geometries
+        return pd.DataFrame(
+            columns=["_key_left", "_key_right", "distances"], dtype=float
+        )
+
+    res = sindex.nearest(
+        query,
+        return_all=True,
+        max_distance=max_distance,
+        return_distance=return_distance,
+        exclusive=exclusive,
+    )
+    if return_distance:
+        (input_idx, tree_idx), distances = res
+    else:
+        input_idx, tree_idx = res
+        distances = None
+
+    if tree_is_left:
+        # The tree positions belong to the left frame here; reorder every
+        # array by left position with a stable sort.
+        order = np.argsort(tree_idx, kind="stable")
+        l_idx, r_idx = tree_idx[order], input_idx[order]
+        if distances is not None:
+            distances = distances[order]
+    else:
+        l_idx, r_idx = input_idx, tree_idx
+
+    return pd.DataFrame(
+        {"_key_left": l_idx, "_key_right": r_idx, "distances": distances}
+    )
+
+
+def sjoin_nearest(
+    left_df: GeoDataFrame,
+    right_df: GeoDataFrame,
+    how: str = "inner",
+    max_distance: Optional[float] = None,
+    lsuffix: str = "left",
+    rsuffix: str = "right",
+    distance_col: Optional[str] = None,
+    exclusive: bool = False,
+) -> GeoDataFrame:
+    """Spatial join of two GeoDataFrames based on the distance between their geometries.
+
+    Results will include multiple output records for a single input record
+    where there are multiple equidistant nearest or intersected neighbors.
+
+    Distance is calculated in CRS units and can be returned using the
+    `distance_col` parameter.
+
+    See the User Guide page
+    https://geopandas.readthedocs.io/en/latest/docs/user_guide/mergingdata.html
+    for more details.
+
+
+    Parameters
+    ----------
+    left_df, right_df : GeoDataFrames
+    how : string, default 'inner'
+        The type of join:
+
+        * 'left': use keys from left_df; retain only left_df geometry column
+        * 'right': use keys from right_df; retain only right_df geometry column
+        * 'inner': use intersection of keys from both dfs; retain only
+          left_df geometry column
+    max_distance : float, default None
+        Maximum distance within which to query for nearest geometry.
+        Must be greater than 0.
+        The max_distance used to search for nearest items in the tree may have a
+        significant impact on performance by reducing the number of input
+        geometries that are evaluated for nearest items in the tree.
+    lsuffix : string, default 'left'
+        Suffix to apply to overlapping column names (left GeoDataFrame).
+    rsuffix : string, default 'right'
+        Suffix to apply to overlapping column names (right GeoDataFrame).
+    distance_col : string, default None
+        If set, save the distances computed between matching geometries under a
+        column of this name in the joined GeoDataFrame. The column is placed
+        last in the result.
+    exclusive : bool, default False
+        If True, the nearest geometries that are equal to the input geometry
+        will not be returned.
+        Requires Shapely >= 2.0.
+
+    Returns
+    -------
+    GeoDataFrame
+        Joined GeoDataFrame.
+
+    Examples
+    --------
+    >>> import geodatasets
+    >>> groceries = geopandas.read_file(
+    ...     geodatasets.get_path("geoda.groceries")
+    ... )
+    >>> chicago = geopandas.read_file(
+    ...     geodatasets.get_path("geoda.chicago_health")
+    ... ).to_crs(groceries.crs)
+
+    >>> chicago.head()  # doctest: +SKIP
+        ComAreaID  ...                                           geometry
+    0         35  ...  POLYGON ((-87.60914 41.84469, -87.60915 41.844...
+    1         36  ...  POLYGON ((-87.59215 41.81693, -87.59231 41.816...
+    2         37  ...  POLYGON ((-87.62880 41.80189, -87.62879 41.801...
+    3         38  ...  POLYGON ((-87.60671 41.81681, -87.60670 41.816...
+    4         39  ...  POLYGON ((-87.59215 41.81693, -87.59215 41.816...
+    [5 rows x 87 columns]
+
+    >>> groceries.head()  # doctest: +SKIP
+        OBJECTID     Ycoord  ...  Category                         geometry
+    0        16  41.973266  ...       NaN  MULTIPOINT (-87.65661 41.97321)
+    1        18  41.696367  ...       NaN  MULTIPOINT (-87.68136 41.69713)
+    2        22  41.868634  ...       NaN  MULTIPOINT (-87.63918 41.86847)
+    3        23  41.877590  ...       new  MULTIPOINT (-87.65495 41.87783)
+    4        27  41.737696  ...       NaN  MULTIPOINT (-87.62715 41.73623)
+    [5 rows x 8 columns]
+
+    >>> groceries_w_communities = geopandas.sjoin_nearest(groceries, chicago)
+    >>> groceries_w_communities[["Chain", "community", "geometry"]].head(2)
+                    Chain community                              geometry
+    0   VIET HOA PLAZA    UPTOWN  MULTIPOINT (1168268.672 1933554.350)
+    87      JEWEL OSCO    UPTOWN  MULTIPOINT (1168837.980 1929246.962)
+
+
+    To include the distances:
+
+    >>> groceries_w_communities = geopandas.sjoin_nearest(groceries, chicago, \
+distance_col="distances")
+    >>> groceries_w_communities[["Chain", "community", \
+"distances"]].head(2)  # doctest: +SKIP
+                    Chain community  distances
+    0   VIET HOA PLAZA    UPTOWN        0.0
+    87      JEWEL OSCO    UPTOWN        0.0
+
+    In the following example, we get multiple groceries for Uptown because all
+    results are equidistant (in this case zero because they intersect).
+    In fact, we get 4 results in total:
+
+    >>> chicago_w_groceries = geopandas.sjoin_nearest(groceries, chicago, \
+distance_col="distances", how="right")
+    >>> uptown_results = \
+chicago_w_groceries[chicago_w_groceries["community"] == "UPTOWN"]
+    >>> uptown_results[["Chain", "community"]]  # doctest: +SKIP
+                Chain community
+    30  VIET HOA PLAZA    UPTOWN
+    30      JEWEL OSCO    UPTOWN
+    30          TARGET    UPTOWN
+    30       Mariano's    UPTOWN
+
+    See also
+    --------
+    sjoin : binary predicate joins
+    GeoDataFrame.sjoin_nearest : equivalent method
+
+    Notes
+    -----
+    Since this join relies on distances, results will be inaccurate
+    if your geometries are in a geographic CRS.
+
+    Every operation in GeoPandas is planar, i.e. the potential third
+    dimension is not taken into account.
+    """
+    _basic_checks(left_df, right_df, how, lsuffix, rsuffix)
+
+    # Run the geographic-CRS check on both inputs, left first.
+    for frame in (left_df, right_df):
+        frame.geometry.values.check_geographic_crs(stacklevel=1)
+
+    want_distance = distance_col is not None
+    matches = _nearest_query(
+        left_df, right_df, max_distance, how, want_distance, exclusive
+    )
+
+    if not want_distance:
+        # No distance column was requested: drop the placeholder column
+        # before the frame-level join.
+        matches.pop("distances")
+        return _frame_join(matches, left_df, right_df, how, lsuffix, rsuffix)
+
+    matches = matches.rename(columns={"distances": distance_col})
+    joined = _frame_join(matches, left_df, right_df, how, lsuffix, rsuffix)
+
+    # move the distance column to the end of the result
+    leading = [c for c in joined.columns if c != distance_col]
+    return joined[leading + [distance_col]]