Skip to content

Providers


bear.typing.Provider

Bases: Protocol

Provider Protocol

Provides an interface for functions a module should implement to be considered a Provider.

Methods:

Name Description
conform

Apply the provider's conform expressions to a polars LazyFrame

epsg

Retrieve the EPSG code for this provider's data

read

Read provider data via a pyarrow.RecordBatch generator

schema

Optionally retrieve the schema of this provider's data

Source code in src/bear/typing/__init__.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
@runtime_checkable
class Provider(Protocol):
    """Provider Protocol

    Provides an interface for functions a module should implement
    to be considered a Provider. The interface consists of four
    class-level callables: ``epsg``, ``schema``, ``read`` and ``conform``.

    Notes
    -----
    The protocol is decorated with ``typing.runtime_checkable``, so it can
    be used in ``isinstance`` checks. Such runtime checks only verify that
    the members exist; they do not validate signatures or return types.
    """

    @classmethod
    def epsg(cls) -> int:
        """Retrieve the EPSG code for this provider's data

        Returns
        -------
        int
            EPSG code for the underlying data.
        """
        ...

    @classmethod
    def schema(cls) -> Optional[ArrowSchema]:
        """Optionally retrieve the schema of this provider's data

        Returns
        -------
        Optional[ArrowSchema]
            If supported, returns the schema of the provider's data.
            NOTE(review): presumably ``None`` signals "no schema
            available" -- confirm against callers.
        """
        ...

    @classmethod
    def read(cls, county: USCounty, *args, **kwargs) -> ArrowBatchGenerator:
        """Read provider data via a pyarrow.RecordBatch generator

        Parameters
        ----------
        county : USCounty
            The area of interest (AOI) to pull record batches from.
        *args
            Positional arguments passed to implementation.
        **kwargs
            Keyword arguments passed to implementation.

        Returns
        -------
        ArrowBatchGenerator
            A generator yielding pyarrow.RecordBatch objects.
        """
        ...

    @classmethod
    def conform(cls, lf: LazyFrame, *args, **kwargs) -> LazyFrame:
        """Apply the provider's conform expressions to a lazy frame

        Parameters
        ----------
        lf : polars.LazyFrame
            Deferred evaluation polars data frame to apply
            the implemented conform function against.
        *args
            Positional arguments passed to implementation.
        **kwargs
            Keyword arguments passed to implementation.

        Returns
        -------
        polars.LazyFrame
            A new lazy frame with the conform expressions applied.
        """
        ...

conform(lf, *args, **kwargs) classmethod

Apply the provider's conform expressions to a polars LazyFrame

Parameters:

Name Type Description Default
lf LazyFrame

Deferred evaluation polars data frame to apply the implemented conform function against.

required
*args

Positional arguments passed to implementation.

()
**kwargs

Keyword arguments passed to implementation.

{}

Returns:

Type Description
LazyFrame

A new lazy frame with the conform expressions applied.

Source code in src/bear/typing/__init__.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
@classmethod
def conform(cls, lf: LazyFrame, *args, **kwargs) -> LazyFrame:
    """Apply the provider's conform expressions to a lazy frame

    Parameters
    ----------
    lf : polars.LazyFrame
        Deferred evaluation polars data frame to apply
        the implemented conform function against.
    *args
        Positional arguments passed to implementation.
    **kwargs
        Keyword arguments passed to implementation.

    Returns
    -------
    polars.LazyFrame
        A new lazy frame with the conform expressions applied.
    """
    ...

epsg() classmethod

Retrieve the EPSG code for this provider's data

Returns:

Type Description
int

EPSG code for the underlying data.

Source code in src/bear/typing/__init__.py
27
28
29
30
31
32
33
34
35
36
@classmethod
def epsg(cls) -> int:
    """Retrieve the EPSG code for this provider's data

    This is a protocol stub; implementations supply the actual value.

    Returns
    -------
    int
        EPSG code for the underlying data.
    """
    ...

read(county, *args, **kwargs) classmethod

Read provider data via a pyarrow.RecordBatch generator

Parameters:

Name Type Description Default
county USCounty

The area of interest (AOI) to pull record batches from.

required
*args

Positional arguments passed to implementation.

()
**kwargs

Keyword arguments passed to implementation.

{}

Returns:

Type Description
ArrowBatchGenerator

A generator yielding pyarrow.RecordBatch objects.

Source code in src/bear/typing/__init__.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
@classmethod
def read(cls, county: USCounty, *args, **kwargs) -> ArrowBatchGenerator:
    """Read provider data via a pyarrow.RecordBatch generator

    This is a protocol stub; implementations supply the actual reader.

    Parameters
    ----------
    county : USCounty
        The area of interest (AOI) to pull record batches from.
    *args
        Positional arguments passed to implementation.
    **kwargs
        Keyword arguments passed to implementation.

    Returns
    -------
    ArrowBatchGenerator
        A generator yielding pyarrow.RecordBatch objects.
    """
    ...

schema() classmethod

Optionally retrieve the schema of this provider's data

Returns:

Type Description
Optional[Schema]

If supported, returns the schema of the provider's data.

Source code in src/bear/typing/__init__.py
38
39
40
41
42
43
44
45
46
47
@classmethod
def schema(cls) -> Optional[ArrowSchema]:
    """Optionally retrieve the schema of this provider's data

    This is a protocol stub; implementations supply the actual schema.

    Returns
    -------
    Optional[ArrowSchema]
        If supported, returns the schema of the provider's data.
        NOTE(review): presumably ``None`` signals "no schema
        available" -- confirm against callers.
    """
    ...

Implementations

Bases: Provider

Microsoft Building Footprints Provider

This data is licensed by Microsoft under the Open Data Commons Open Database License (ODbL).

Source code in src/bear/providers/provider_microsoft.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
@register_provider("microsoft")
class MicrosoftProvider(Provider):
    """[Microsoft Building Footprints](https://github.com/microsoft/GlobalMLBuildingFootprints) Provider

    This data is licensed by Microsoft under the [Open Data Commons
    Open Database License (ODbL)](https://opendatacommons.org/licenses/odbl/).
    """

    @classmethod
    def epsg(cls) -> int:
        """Return the EPSG code of the source data (4326)."""
        return 4326

    @classmethod
    def schema(cls) -> Optional[pa.Schema]:
        """Return the pyarrow schema of the source data.

        Returns
        -------
        Optional[pa.Schema]
            Schema with float64 ``height`` and ``confidence`` columns
            and a binary ``geometry`` column.
        """
        return pa.schema(
            {
                "height": pa.float64(),
                "confidence": pa.float64(),
                "geometry": pa.binary(),
            }
        )

    @classmethod
    def read(cls, county: USCounty, *args, **kwargs) -> ArrowBatchGenerator:
        """Not implemented for this provider.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError()

    @classmethod
    def conform(cls, lf: pl.LazyFrame, *args, **kwargs) -> pl.LazyFrame:
        """Project the source columns onto the common output columns.

        Parameters
        ----------
        lf : polars.LazyFrame
            Lazy frame holding at least ``geometry`` and ``height``.
        *args
            Unused.
        **kwargs
            Unused.

        Returns
        -------
        polars.LazyFrame
            Frame with the columns ``id``, ``classification``, ``address``,
            ``height``, ``levels`` and ``geometry``, in that order.
        """
        source_height = pl.col("height")

        return lf.select(
            # The id is derived from the geometry: a SHA-256 digest of its
            # base64 encoding.
            id=plh.col("geometry").bin.encode("base64").chash.sha256(),  # type: ignore
            classification=expr.NULL,
            address=expr.NULL,
            # Negative heights are replaced by null.
            height=(
                pl.when(source_height < 0)
                .then(expr.NULL)
                .otherwise(source_height)
            ),
            levels=expr.NULL,
            geometry=pl.col("geometry"),
        )

Bases: Provider

National Address Database (NAD) Provider

This data is a work of the federal government and is not subject to copyright protection in accordance with 17 U.S.C. § 105. It is available for re-use without limitation or restriction. See the NAD disclaimer for more details.

Source code in src/bear/providers/provider_nad.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
@register_provider("nad")
class NADProvider(Provider):
    """[National Address Database (NAD)](https://www.transportation.gov/gis/national-address-database) Provider

    This data is a work of the federal government and is not subject to copyright protection
    in accordance with 17 U.S.C. § 105. It is available for re-use without limitation or restriction.
    See the [NAD disclaimer](https://www.transportation.gov/mission/open/gis/national-address-database/national-address-database-nad-disclaimer)
    for more details.
    """

    @classmethod
    def epsg(cls) -> int:
        """Not implemented for this provider."""
        raise NotImplementedError()

    @classmethod
    def schema(cls) -> Optional[pa.Schema]:
        """Not implemented for this provider."""
        raise NotImplementedError()

    @classmethod
    def read(cls, county: USCounty, *args, **kwargs) -> ArrowBatchGenerator:
        """Not implemented for this provider."""
        raise NotImplementedError()

    @classmethod
    def conform(cls, lf: pl.LazyFrame, *args, **kwargs) -> pl.LazyFrame:
        """Project NAD columns onto the common output columns.

        Parameters
        ----------
        lf : polars.LazyFrame
            Lazy frame holding the NAD columns referenced below.
        *args
            Unused.
        **kwargs
            Unused.

        Returns
        -------
        polars.LazyFrame
            Frame with the columns ``id``, ``classification``, ``address``,
            ``height``, ``levels`` and ``geometry``, in that order.
        """
        uuid = pl.col("UUID")
        addr_type = pl.col("Addr_Type")

        # Fallback identifier: SHA-256 digest of the base64-encoded geometry.
        geometry_digest = (
            plh.col("geometry").bin.encode("base64").chash.sha256()  # type: ignore
        )

        # Rows whose UUID equals expr.NULL_UUID get the geometry digest;
        # every other row keeps its UUID.
        record_id = (
            pl.when(uuid.eq(expr.NULL_UUID))
            .then(geometry_digest)
            .otherwise(uuid)
        )

        # "Unknown" and "Other" address types are mapped to null.
        classification = (
            pl.when(addr_type.is_in(["Unknown", "Other"]))
            .then(expr.NULL)
            .otherwise(addr_type)
        )

        # Space-joined address parts (nulls skipped), then normalized.
        joined_address = pl.concat_str(
            pl.col("AddNo_Full"),
            pl.col("StNam_Full"),
            pl.col("SubAddress"),
            separator=" ",
            ignore_nulls=True,
        )
        address = joined_address.pipe(expr.normalize_str)

        return lf.select(
            id=record_id,
            classification=classification,
            address=address,
            height=expr.NULL,
            levels=expr.NULL,
            geometry=pl.col("geometry"),
        )

Bases: Provider

OpenAddresses Provider

The datasets provided by OpenAddresses are individually licensed. Most are available under open licenses, but there is no guarantee.

Source code in src/bear/providers/provider_openaddresses.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
@register_provider("openaddresses")
class OpenAddressesProvider(Provider):
    """[OpenAddresses](https://openaddresses.io/) Provider

    The datasets provided by OpenAddresses are individually licensed.
    Most are available under open licenses, but there is no guarantee.
    """

    @classmethod
    def epsg(cls) -> int:
        """Not implemented for this provider."""
        raise NotImplementedError()

    @classmethod
    def schema(cls) -> Optional[pa.Schema]:
        """Not implemented for this provider."""
        raise NotImplementedError()

    @classmethod
    def read(cls, county: USCounty, *args, **kwargs) -> ArrowBatchGenerator:
        """Not implemented for this provider."""
        raise NotImplementedError()

    @classmethod
    def conform(cls, lf: pl.LazyFrame, *args, **kwargs) -> pl.LazyFrame:
        """Deduplicate address points and project them onto the common columns.

        Rows are split into "singles" (an ``X``/``Y``/``number``/``street``
        combination that occurs once) and "multis" (a combination that
        occurs more than once). Multis are collapsed to one row per
        ``X``/``Y`` location before both sets are concatenated.

        Parameters
        ----------
        lf : polars.LazyFrame
            Lazy frame holding at least ``id``, ``region``, ``hash``,
            ``number``, ``street`` and ``geometry``.
        *args
            Unused.
        **kwargs
            Unused.

        Returns
        -------
        polars.LazyFrame
            Frame with the columns ``id``, ``classification``, ``address``,
            ``height``, ``levels`` and ``geometry``, in that order.
        """
        lf = (
            lf.drop("id", "region")
            .unique()
            .with_columns(X=centroid_x("geometry"), Y=centroid_y("geometry"))
            .with_columns(
                # Per-group row count. The aggregation must come BEFORE
                # ``over``: ``col.over(...).count()`` counts the whole
                # column and broadcasts one frame-wide total, which would
                # send every row of a multi-row frame into ``multis``.
                count=pl.col("hash")
                .count()
                .over(["X", "Y", "number", "street"]),
                # Dense rank of the (X, Y) pair serves as a location id.
                group=pl.struct("X", "Y").rank("dense"),
                address=pl.concat_str(
                    pl.col("number"),
                    pl.col("street"),
                    # pl.col("unit"),
                    separator=" ",
                    ignore_nulls=True,
                ).pipe(expr.normalize_str),
            )
            .filter(
                pl.col("address").is_not_null().and_(pl.col("address") != "0")
            )
        )

        # Address/location combinations that occur exactly once keep their
        # own hash as the key.
        singles = lf.filter(pl.col("count") == 1).with_columns(
            unit_count=1, key_id=pl.col("hash")
        )

        # Repeated combinations: back-fill the first six columns within each
        # location group, then keep the first row per group.
        multis = (
            lf.filter(pl.col("count") > 1)
            .with_columns(
                pl.selectors.by_index(range(6)).backward_fill().over("group"),
                unit_count=pl.col("group").count().over("group"),
                key_id=pl.col("hash").first().over("group"),
            )
            .group_by("group")
            .first()
        )

        return (
            pl.concat([singles, multis], how="diagonal_relaxed")
            .drop("hash", "group", "count")
            .select(
                id=pl.col("key_id"),
                classification=expr.NULL,
                address=pl.col("address"),
                height=expr.NULL,
                levels=expr.NULL,
                geometry=explode_multipoint("geometry"),
            )
        )

Bases: Provider

OpenStreetMap Provider

This data is licensed by the OpenStreetMap Foundation under the Open Data Commons Open Database License (ODbL).

Source code in src/bear/providers/provider_openstreetmap.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
@register_provider("openstreetmap")
class OpenStreetMapProvider(Provider):
    """[OpenStreetMap](https://www.openstreetmap.org) Provider

    This data is licensed by the OpenStreetMap Foundation under the [Open Data Commons
    Open Database License (ODbL)](https://opendatacommons.org/licenses/odbl/).
    """

    @classmethod
    def epsg(cls) -> int:
        """Not implemented for this provider."""
        raise NotImplementedError()

    @classmethod
    def schema(cls) -> Optional[pa.Schema]:
        """Not implemented for this provider."""
        raise NotImplementedError()

    @classmethod
    def read(cls, county: USCounty, *args, **kwargs) -> ArrowBatchGenerator:
        """Not implemented for this provider."""
        raise NotImplementedError()

    @classmethod
    def conform(cls, lf: pl.LazyFrame, *args, **kwargs) -> pl.LazyFrame:
        """Clean OSM building features and project them onto the common columns.

        Steps, in order: keep rows with a non-null ``building`` value,
        derive ``id``/``classification``/``address``/``levels``, drop
        parking features and rows whose ``dataset`` is ``"UniversityPly"``,
        parse the free-text ``height`` and ``levels`` values into numbers,
        and null out negative heights and level counts above 110.

        Parameters
        ----------
        lf : polars.LazyFrame
            Lazy frame holding the OSM columns referenced below.
        *args
            Unused.
        **kwargs
            Unused.

        Returns
        -------
        polars.LazyFrame
            Frame with the columns ``id``, ``classification``, ``address``,
            ``height``, ``levels`` and ``geometry``, in that order.
        """
        # Feet-to-metres factor applied to heights given in feet.
        FT_TO_M = 0.3048

        # Retrieve only features with building key
        lf = lf.filter(pl.col("building").is_not_null())

        # Initial conformance
        lf = lf.with_columns(
            id=pl.coalesce(["osm_id", "osm_way_id"]),
            # A bare "yes" carries no classification, so it becomes null.
            classification=pl.coalesce(
                ["building", "amenity", "leisure"]
            ).replace("yes", None),
            address=pl.concat_str(
                ["name", "addr_housenumber", "addr_street", "addr_unit"],
                separator=" ",
                ignore_nulls=True,
            ),
            # Strip stray characters from the raw level value before parsing.
            levels=pl.col("building_levels").str.replace_all(
                "`|''|\\+|(PK)|\\>|±", ""
            ),
        ).filter(
            # (classification is null OR not a parking feature)
            # AND (dataset is null OR dataset != "UniversityPly")
            pl.col("classification")
            .is_null()
            .or_(
                pl.col("classification")
                .is_in(["parking", "parking_space"])
                .not_()
            )
            .and_(
                pl.col("dataset")
                .is_null()
                .or_(pl.col("dataset") != "UniversityPly")
            )
        )

        # Handle height
        lf = lf.with_columns(
            height=pl.when(pl.col("height").is_in(["0", "0.0"]))
            .then(expr.NULL)
            # "a;b;c": take the largest parseable value.
            .when(pl.col("height").str.contains(";", literal=True))
            .then(
                pl.col("height")
                .str.split(";")
                .list.eval(pl.element().cast(pl.Float64, strict=False))
                .list.max()
            )
            # Values in feet ("30 ft", "30ft."): strip the unit and convert.
            .when(pl.col("height").str.contains("ft", literal=True))
            .then(
                pl.col("height")
                # Regex, not literal: with ``literal=True`` the former
                # pattern "[ft\\.]" was searched verbatim, never matched,
                # and every "ft" height was cast to null.
                .str.replace(r"ft\.?", "")
                .str.strip_chars()
                .cast(pl.Float64, strict=False)
                .mul(FT_TO_M)
            )
            # Values with an "m" suffix: strip the unit.
            .when(pl.col("height").str.contains("m", literal=True))
            .then(
                pl.col("height")
                .str.replace("m", "", literal=True)
                .str.strip_chars()
                .cast(pl.Float64, strict=False)
            )
            # Values ending in an apostrophe are treated as feet.
            .when(pl.col("height").str.ends_with("'"))
            .then(
                pl.col("height")
                .str.strip_suffix("'")
                .str.strip_chars()
                .cast(pl.Float64, strict=False)
                .mul(FT_TO_M)
            )
            # Plain numbers; anything unparseable becomes null.
            .otherwise(
                pl.col("height")
                .str.strip_chars()
                .cast(pl.Float64, strict=False)
            )
        )

        # Handle levels
        lf = lf.with_columns(
            levels=pl.when(pl.col("levels").is_in(["0", "Default"]))
            .then(expr.NULL)
            .when(pl.col("levels").is_in(["Bi-Level", "Split"]))
            .then(2)
            # Comma-separated values on schools are discarded.
            .when(
                pl.col("levels").str.contains(",", literal=True)
                & pl.col("classification").eq("school")
            )
            .then(expr.NULL)
            # Half levels ("2.5") round up to the next whole level.
            .when(pl.col("levels").str.contains(".5", literal=True))
            .then(
                pl.col("levels")
                .str.replace("\\.5.*", "")
                .cast(pl.Int32, strict=False)
                .add(1)
            )
            # Half levels written as "2 1/2" round up as well.
            .when(pl.col("levels").str.contains("1/2"))
            .then(
                pl.col("levels")
                .str.replace("1/2", "", literal=True)
                .str.strip_chars()
                .cast(pl.Int32, strict=False)
                .add(1)
            )
            # Remaining comma-separated values: the number of entries.
            .when(pl.col("levels").str.contains(",", literal=True))
            .then(pl.col("levels").str.split(",").list.len().cast(pl.Int32))
            # "a;b" and "a-b": take the largest parseable value.
            .when(pl.col("levels").str.contains(";", literal=True))
            .then(
                pl.col("levels")
                .str.split(";")
                .list.eval(pl.element().cast(pl.Int32, strict=False))
                .list.max()
            )
            .when(pl.col("levels").str.contains("-", literal=True))
            .then(
                pl.col("levels")
                .str.split("-")
                .list.eval(pl.element().cast(pl.Int32, strict=False))
                .list.max()
            )
            .otherwise(pl.col("levels").cast(pl.Int32, strict=False)),
        )

        # Finalize
        return lf.with_columns(
            id=pl.col("id"),
            # Negative heights are replaced by null.
            height=pl.when(pl.col("height") < 0)
            .then(expr.NULL)
            .otherwise(pl.col("height")),
            # Level counts above 110 are replaced by null.
            levels=pl.when(pl.col("levels") > 110)
            .then(expr.NULL)
            .otherwise(pl.col("levels")),
        ).select(
            [
                "id",
                "classification",
                "address",
                "height",
                "levels",
                "geometry",
            ]
        )

Bases: Provider

USA Structures Provider

The data is licensed under the Creative Commons By Attribution (CC BY 4.0) license.

Source code in src/bear/providers/provider_usa_structures.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
@register_provider("usa_structures")
class USAStructuresProvider(Provider):
    """[USA Structures](https://gis-fema.hub.arcgis.com/pages/usa-structures) Provider

    The data is licensed under the Creative Commons By Attribution (CC BY 4.0) license.
    """

    @classmethod
    def epsg(cls) -> int:
        """Not implemented for this provider."""
        raise NotImplementedError()

    @classmethod
    def schema(cls) -> Optional[pa.Schema]:
        """Not implemented for this provider."""
        raise NotImplementedError()

    @classmethod
    def read(cls, county: USCounty, *args, **kwargs) -> ArrowBatchGenerator:
        """Not implemented for this provider."""
        raise NotImplementedError()

    @classmethod
    def conform(cls, lf: pl.LazyFrame, *args, **kwargs) -> pl.LazyFrame:
        """Project USA Structures columns onto the common output columns.

        Parameters
        ----------
        lf : polars.LazyFrame
            Lazy frame holding ``UUID``, ``OCC_CLS``, ``PROP_ADDR``,
            ``HEIGHT`` and ``geometry``.
        *args
            Unused.
        **kwargs
            Unused.

        Returns
        -------
        polars.LazyFrame
            Frame with the columns ``id``, ``classification``, ``address``,
            ``height``, ``levels`` and ``geometry``, in that order.
        """
        # Occupancy class: normalize first, then apply null_if_empty_str.
        occupancy = expr.null_if_empty_str(
            expr.normalize_str(pl.col("OCC_CLS"))
        )
        street_address = expr.normalize_str(pl.col("PROP_ADDR"))

        return lf.select(
            id=pl.col("UUID"),
            classification=occupancy,
            address=street_address,
            height=pl.col("HEIGHT"),
            levels=expr.NULL,
            geometry=pl.col("geometry"),
        )