From 0e3dba27f3b6d4cd4c1cba751901866bb9884248 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Mon, 2 Aug 2021 16:44:04 -0400 Subject: [PATCH 01/40] Removed unused method --- tests/system/test_client.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/system/test_client.py b/tests/system/test_client.py index baa2b6ad8..abe3e335d 100644 --- a/tests/system/test_client.py +++ b/tests/system/test_client.py @@ -2330,9 +2330,6 @@ def test_create_table_rows_fetch_nested_schema(self): self.assertEqual(found[7], e_favtime) self.assertEqual(found[8], decimal.Decimal(expected["FavoriteNumber"])) - def _fetch_dataframe(self, query): - return Config.CLIENT.query(query).result().to_dataframe() - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @unittest.skipIf( bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" From ff6c4ff6588f9e279c878a63be64ccf0b7d97722 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 3 Aug 2021 10:55:23 -0400 Subject: [PATCH 02/40] Implemented geography_as_object argument to to_dataframe and implemented to_geodataframe --- google/cloud/bigquery/job/query.py | 88 ++++++++++++++ google/cloud/bigquery/table.py | 178 ++++++++++++++++++++++++++++ setup.py | 1 + tests/system/test_pandas.py | 53 +++++++++ tests/unit/job/test_query_pandas.py | 75 ++++++++++-- tests/unit/test_table.py | 122 +++++++++++++++++++ 6 files changed, 505 insertions(+), 12 deletions(-) diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 2cb7ee28e..eb7027131 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1423,6 +1423,7 @@ def to_dataframe( create_bqstorage_client: bool = True, date_as_object: bool = True, max_results: Optional[int] = None, + geography_as_object: bool = False, ) -> "pandas.DataFrame": """Return a pandas DataFrame from a QueryJob @@ -1489,6 +1490,93 @@ def to_dataframe( progress_bar_type=progress_bar_type, create_bqstorage_client=create_bqstorage_client, 
date_as_object=date_as_object, + geography_as_object=geography_as_object, + ) + + # If changing the signature of this method, make sure to apply the same + # changes to table.RowIterator.to_dataframe(), except for the max_results parameter + # that should only exist here in the QueryJob method. + def to_geodataframe( + self, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + dtypes: Dict[str, Any] = None, + progress_bar_type: str = None, + create_bqstorage_client: bool = True, + date_as_object: bool = True, + max_results: Optional[int] = None, + geography_column: Optional[str] = None, + ) -> "pandas.DataFrame": + """Return a pandas DataFrame from a QueryJob + + Args: + bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]): + A BigQuery Storage API client. If supplied, use the faster + BigQuery Storage API to fetch rows from BigQuery. This + API is a billable API. + + This method requires the ``fastavro`` and + ``google-cloud-bigquery-storage`` libraries. + + Reading from a specific partition or snapshot is not + currently supported by this method. + + dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]): + A dictionary of column names pandas ``dtype``s. The provided + ``dtype`` is used when constructing the series for the column + specified. Otherwise, the default pandas behavior is used. + + progress_bar_type (Optional[str]): + If set, use the `tqdm `_ library to + display a progress bar while the data downloads. Install the + ``tqdm`` package to use this feature. + + See + :func:`~google.cloud.bigquery.table.RowIterator.to_dataframe` + for details. + + .. versionadded:: 1.11.0 + create_bqstorage_client (Optional[bool]): + If ``True`` (default), create a BigQuery Storage API client + using the default API settings. The BigQuery Storage API + is a faster way to fetch rows from BigQuery. See the + ``bqstorage_client`` parameter for more information. 
+ + This argument does nothing if ``bqstorage_client`` is supplied. + + .. versionadded:: 1.24.0 + + date_as_object (Optional[bool]): + If ``True`` (default), cast dates to objects. If ``False``, convert + to datetime64[ns] dtype. + + .. versionadded:: 1.26.0 + + max_results (Optional[int]): + Maximum number of rows to include in the result. No limit by default. + + .. versionadded:: 2.21.0 + + geography_column (Optional[str]): + If there are more than one GEOGRAPHY columns, which one to use + to construct a geopandas GeoDataFrame. This option can be ommitted + if there's only one GEOGRAPHY column. + + Returns: + A :class:`~pandas.DataFrame` populated with row data and column + headers from the query results. The column headers are derived + from the destination table's schema. + + Raises: + ValueError: If the `pandas` library cannot be imported. + """ + query_result = wait_for_query(self, progress_bar_type, max_results=max_results) + return query_result.to_geodataframe( + bqstorage_client=bqstorage_client, + dtypes=dtypes, + progress_bar_type=progress_bar_type, + create_bqstorage_client=create_bqstorage_client, + date_as_object=date_as_object, + geography_column=geography_column, ) def __iter__(self): diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index daade1ac6..06f08f181 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -30,6 +30,18 @@ except ImportError: # pragma: NO COVER pandas = None +try: + import geopandas +except ImportError: + geopandas = None +else: + CRS = 'EPSG:4326' + +try: + from shapely import wkt +except ImportError: + wkt = None + try: import pyarrow except ImportError: # pragma: NO COVER @@ -61,6 +73,14 @@ "The pandas library is not installed, please install " "pandas to use the to_dataframe() function." ) +_NO_GEOPANDAS_ERROR = ( + "The geopandas library is not installed, please install " + "geopandas to use the to_geodataframe() function." 
+) +_NO_SHAPELY_ERROR = ( + "The shapely library is not installed, please install " + "shapely to use the geography_as_object option." +) _NO_PYARROW_ERROR = ( "The pyarrow library is not installed, please install " "pyarrow to use the to_arrow() function." @@ -1841,6 +1861,7 @@ def to_dataframe( progress_bar_type: str = None, create_bqstorage_client: bool = True, date_as_object: bool = True, + geography_as_object: bool = False, ) -> "pandas.DataFrame": """Create a pandas DataFrame by loading all pages of a query. @@ -1896,6 +1917,12 @@ def to_dataframe( .. versionadded:: 1.26.0 + geography_as_object (Optional[bool]): + If ``True``, convert geography data to shapely objects. + If ``False`` (default), don't cast geography data to shapely objects. + + .. versionadded:: ??? + Returns: pandas.DataFrame: A :class:`~pandas.DataFrame` populated with row data and column @@ -1911,6 +1938,9 @@ def to_dataframe( """ if pandas is None: raise ValueError(_NO_PANDAS_ERROR) + if geography_as_object and wkt is None: + raise ValueError(_NO_SHAPELY_ERROR) + if dtypes is None: dtypes = {} @@ -1951,8 +1981,130 @@ def to_dataframe( for column in dtypes: df[column] = pandas.Series(df[column], dtype=dtypes[column]) + if geography_as_object: + for field in self.schema: + if field.field_type.upper() == 'GEOGRAPHY': + df[field.name] = df[field.name].dropna().apply(wkt.loads) + return df + # If changing the signature of this method, make sure to apply the same + # changes to job.QueryJob.to_geodataframe() + def to_geodataframe( + self, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + dtypes: Dict[str, Any] = None, + progress_bar_type: str = None, + create_bqstorage_client: bool = True, + date_as_object: bool = True, + geography_column: Optional[str] = None, + ) -> "pandas.DataFrame": + """Create a pandas DataFrame by loading all pages of a query. + + Args: + bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]): + A BigQuery Storage API client. 
If supplied, use the faster + BigQuery Storage API to fetch rows from BigQuery. + + This method requires the ``pyarrow`` and + ``google-cloud-bigquery-storage`` libraries. + + This method only exposes a subset of the capabilities of the + BigQuery Storage API. For full access to all features + (projections, filters, snapshots) use the Storage API directly. + + dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]): + A dictionary of column names pandas ``dtype``s. The provided + ``dtype`` is used when constructing the series for the column + specified. Otherwise, the default pandas behavior is used. + progress_bar_type (Optional[str]): + If set, use the `tqdm `_ library to + display a progress bar while the data downloads. Install the + ``tqdm`` package to use this feature. + + Possible values of ``progress_bar_type`` include: + + ``None`` + No progress bar. + ``'tqdm'`` + Use the :func:`tqdm.tqdm` function to print a progress bar + to :data:`sys.stderr`. + ``'tqdm_notebook'`` + Use the :func:`tqdm.tqdm_notebook` function to display a + progress bar as a Jupyter notebook widget. + ``'tqdm_gui'`` + Use the :func:`tqdm.tqdm_gui` function to display a + progress bar as a graphical dialog box. + + create_bqstorage_client (Optional[bool]): + If ``True`` (default), create a BigQuery Storage API client + using the default API settings. The BigQuery Storage API + is a faster way to fetch rows from BigQuery. See the + ``bqstorage_client`` parameter for more information. + + This argument does nothing if ``bqstorage_client`` is supplied. + + date_as_object (Optional[bool]): + If ``True`` (default), cast dates to objects. If ``False``, convert + to datetime64[ns] dtype. + + geography_column (Optional[str]): + If there are more than one GEOGRAPHY columns, which one to use + to construct a geopandas GeoDataFrame. This option can be ommitted + if there's only one GEOGRAPHY column. 
+ + Returns: + geopandas.GeoDataFrame: + A :class:`~pandas.DataFrame` populated with row data and column + headers from the query results. The column headers are derived + from the destination table's schema. + + Raises: + ValueError: + If the :mod:`pandas` library cannot be imported, or the + :mod:`google.cloud.bigquery_storage_v1` module is + required but cannot be imported. + + + .. versionadded:: ??? + + """ + if geopandas is None: + raise ValueError(_NO_GEOPANDAS_ERROR) + + geography_columns = set( + field.name + for field in self.schema + if field.field_type.upper() == 'GEOGRAPHY' + ) + if not geography_columns: + raise TypeError("There must be at least one GEOGRAPHY column" + " to create a GeoDataFrame") + + if geography_column: + if geography_column not in geography_columns: + raise ValueError( + f"The given geography column, {geography_column} doesn't name" + f" a GEOGRAPHY column in the result.") + elif len(geography_columns) == 1: + [geography_column] = geography_columns + else: + raise ValueError( + "There is more than one GEOGRAPHY column in the result. " + "The geography_column argument must be used to specify which " + "one to use to create a GeoDataFrame") + + df = self.to_dataframe( + bqstorage_client, + dtypes, + progress_bar_type, + create_bqstorage_client, + date_as_object, + geography_as_object=True, + ) + + return geopandas.GeoDataFrame(df, crs=CRS, geometry=geography_column) + class _EmptyRowIterator(RowIterator): """An empty row iterator. @@ -2005,6 +2157,7 @@ def to_dataframe( progress_bar_type=None, create_bqstorage_client=True, date_as_object=True, + geography_as_object=False, ) -> "pandas.DataFrame": """Create an empty dataframe. 
@@ -2022,6 +2175,31 @@ def to_dataframe( raise ValueError(_NO_PANDAS_ERROR) return pandas.DataFrame() + def to_geodataframe( + self, + bqstorage_client=None, + dtypes=None, + progress_bar_type=None, + create_bqstorage_client=True, + date_as_object=True, + geography_column: Optional[str] = None, + ) -> "pandas.DataFrame": + """Create an empty dataframe. + + Args: + bqstorage_client (Any): Ignored. Added for compatibility with RowIterator. + dtypes (Any): Ignored. Added for compatibility with RowIterator. + progress_bar_type (Any): Ignored. Added for compatibility with RowIterator. + create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator. + date_as_object (bool): Ignored. Added for compatibility with RowIterator. + + Returns: + pandas.DataFrame: An empty :class:`~pandas.DataFrame`. + """ + if geopandas is None: + raise ValueError(_NO_GEOPANDAS_ERROR) + return geopandas.GeoDataFrame() + def to_dataframe_iterable( self, bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, diff --git a/setup.py b/setup.py index e9deaf117..d01f3dcf6 100644 --- a/setup.py +++ b/setup.py @@ -57,6 +57,7 @@ "pyarrow >= 1.0.0, < 6.0dev", ], "pandas": ["pandas>=0.23.0", "pyarrow >= 1.0.0, < 6.0dev"], + "geopandas": ["geopandas>=0.6.0"], "bignumeric_type": ["pyarrow >= 3.0.0, < 6.0dev"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 821b375e1..9bbdac642 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -804,3 +804,56 @@ def test_list_rows_max_results_w_bqstorage(bigquery_client): dataframe = row_iterator.to_dataframe(bqstorage_client=bqstorage_client) assert len(dataframe.index) == 100 + + +def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): + bigquery_client.query( + f"create table {dataset_id}.lake (name string, geog geography)" + ).result() + bigquery_client.query( + f""" + insert into {dataset_id}.lake (name, 
geog) values + ('foo', st_geogfromtext('point(0 0)')), + ('bar', st_geogfromtext('point(0 1)')), + ('baz', null) + """ + ).result() + df = (bigquery_client + .query(f"select * from {dataset_id}.lake order by name") + .to_dataframe(geography_as_object=True) + ) + assert str(df) == ( + ' name geog\n' + '0 bar POINT (0 1)\n' + '1 baz NaN\n' + '2 foo POINT (0 0)') + assert [v.__class__.__name__ for v in df['geog']] == ['Point', 'float', 'Point'] + + +def test_to_geodataframe(bigquery_client, dataset_id): + bigquery_client.query( + f"create table {dataset_id}.lake (name string, geog geography)" + ).result() + bigquery_client.query( + f""" + insert into {dataset_id}.lake (name, geog) values + ('foo', st_geogfromtext('point(0 0)')), + ('bar', st_geogfromtext('polygon((0 0, 1 1, 1 0, 0 0))')), + ('baz', null) + """ + ).result() + df = (bigquery_client + .query(f"select * from {dataset_id}.lake order by name") + .to_geodataframe() + ) + assert [v.__class__.__name__ for v in df['geog']] == [ + 'Polygon', 'NoneType', 'Point'] + assert df.__class__.__name__ == 'GeoDataFrame' + assert df['geog'].__class__.__name__ == 'GeoSeries' + assert list(map(str, df['geog'])) == [ + 'POLYGON ((1 0, 1 1, 0 0, 1 0))', 'None', 'POINT (0 0)'] + assert str(df.area) == ( + '0 0.5\n' + '1 NaN\n' + '2 0.0\n' + 'dtype: float64') diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index c537802f4..f973fb7ee 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -23,6 +23,14 @@ import pandas except (ImportError, AttributeError): # pragma: NO COVER pandas = None +try: + import shapely +except (ImportError, AttributeError): # pragma: NO COVER + shapely = None +try: + import geopandas +except (ImportError, AttributeError): # pragma: NO COVER + geopandas = None try: import pyarrow except (ImportError, AttributeError): # pragma: NO COVER @@ -425,29 +433,26 @@ def test_to_arrow_w_tqdm_wo_query_plan(): 
result_patch_tqdm.assert_called() -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") -def test_to_dataframe(): +def _make_job(schema=(), rows=()): from google.cloud.bigquery.job import QueryJob as target_class begun_resource = _make_job_resource(job_type="query") query_resource = { "jobComplete": True, "jobReference": begun_resource["jobReference"], - "totalRows": "4", + "totalRows": str(len(rows)), "schema": { "fields": [ - {"name": "name", "type": "STRING", "mode": "NULLABLE"}, - {"name": "age", "type": "INTEGER", "mode": "NULLABLE"}, - ] + dict(name=field[0], type=field[1], mode=field[2]) + for field in schema + ] }, } tabledata_resource = { "rows": [ - {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, - {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, - {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]}, - {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]}, - ] + {"f": [{"v": v} for v in row]} + for row in rows + ] } done_resource = copy.deepcopy(begun_resource) done_resource["status"] = {"state": "DONE"} @@ -455,8 +460,18 @@ def test_to_dataframe(): begun_resource, query_resource, done_resource, tabledata_resource ) client = _make_client(connection=connection) - job = target_class.from_api_repr(begun_resource, client) + return target_class.from_api_repr(begun_resource, client) + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_to_dataframe(): + job = _make_job( + (("name", "STRING", "NULLABLE"), ("age", "INTEGER", "NULLABLE")), + (("Phred Phlyntstone", "32"), + ("Bharney Rhubble", "33"), + ("Wylma Phlyntstone", "29"), + ("Bhettye Rhubble", "27"), + )) df = job.to_dataframe(create_bqstorage_client=False) assert isinstance(df, pandas.DataFrame) @@ -868,3 +883,39 @@ def test_to_dataframe_w_tqdm_max_results(): result_patch_tqdm.assert_called_with( timeout=_PROGRESS_BAR_UPDATE_INTERVAL, max_results=3 ) + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(shapely is None, reason="Requires 
`shapely`") +def test_to_dataframe_geography_as_object(): + job = _make_job( + (("name", "STRING", "NULLABLE"), ("geog", "GEOGRAPHY", "NULLABLE")), + (("Phred Phlyntstone", "Point(0 0)"), + ("Bharney Rhubble", "Point(0 1)"), + ("Wylma Phlyntstone", None), + )) + df = job.to_dataframe(create_bqstorage_client=False, geography_as_object=True) + + assert isinstance(df, pandas.DataFrame) + assert len(df) == 3 # verify the number of rows + assert list(df) == ["name", "geog"] # verify the column names + assert [v.__class__.__name__ for v in df.geog] == [ + 'Point', 'Point', 'float'] # float because nan + + +@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") +def test_to_geodataframe(): + job = _make_job( + (("name", "STRING", "NULLABLE"), ("geog", "GEOGRAPHY", "NULLABLE")), + (("Phred Phlyntstone", "Point(0 0)"), + ("Bharney Rhubble", "Point(0 1)"), + ("Wylma Phlyntstone", None), + )) + df = job.to_geodataframe(create_bqstorage_client=False) + + assert isinstance(df, geopandas.GeoDataFrame) + assert len(df) == 3 # verify the number of rows + assert list(df) == ["name", "geog"] # verify the column names + assert [v.__class__.__name__ for v in df.geog] == [ + 'Point', 'Point', 'NoneType'] # float because nan + assert isinstance(df.geog, geopandas.GeoSeries) diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 4b1fd833b..e09e85774 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -41,6 +41,11 @@ except (ImportError, AttributeError): # pragma: NO COVER pandas = None +try: + import geopandas +except (ImportError, AttributeError): # pragma: NO COVER + geopandas = None + try: import pyarrow import pyarrow.types @@ -1665,6 +1670,13 @@ def test_to_dataframe_iterable(self): self.assertEqual(len(df), 0) # Verify the number of rows. 
self.assertEqual(len(df.columns), 0) + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + def test_to_geodataframe(self): + row_iterator = self._make_one() + df = row_iterator.to_geodataframe(create_bqstorage_client=False) + self.assertIsInstance(df, geopandas.GeoDataFrame) + self.assertEqual(len(df), 0) # verify the number of rows + class TestRowIterator(unittest.TestCase): def _class_under_test(self): @@ -1702,6 +1714,18 @@ def _make_one( client, api_request, path, schema, table=table, **kwargs ) + def _make_one_from_data(self, schema=(), rows=()): + from google.cloud.bigquery.schema import SchemaField + schema = [SchemaField(*a) for a in schema] + rows = [ + {"f": [{"v": v} for v in row]} + for row in rows + ] + + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + return self._make_one(_mock_client(), api_request, path, schema) + def test_constructor(self): from google.cloud.bigquery.table import _item_to_row from google.cloud.bigquery.table import _rows_page_start @@ -2996,6 +3020,13 @@ def test_to_dataframe_error_if_pandas_is_none(self): with self.assertRaises(ValueError): row_iterator.to_dataframe() + @mock.patch("google.cloud.bigquery.table.wkt", new=None) + def test_to_dataframe_error_if_shapely_wkt_is_none(self): + with self.assertRaises(ValueError): + # No can do if no shapely + self._make_one_from_data().to_dataframe(geography_as_object=True) + + @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_max_results_w_bqstorage_warning(self): from google.cloud.bigquery.schema import SchemaField @@ -3753,6 +3784,97 @@ def test_to_dataframe_concat_categorical_dtype_w_pyarrow(self): # Don't close the client if it was passed in. 
bqstorage_client._transport.grpc_channel.close.assert_not_called() + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + def test_to_dataframe_geography_as_object(self): + row_iterator = self._make_one_from_data( + (("name", "STRING"), ("geog", "GEOGRAPHY")), + (("foo", "Point(0 0)"), + ("bar", None), + ("baz", "Polygon((0 0, 0 1, 1 0, 0 0))"), + )) + df = row_iterator.to_dataframe( + create_bqstorage_client=False, geography_as_object=True, + ) + self.assertIsInstance(df, pandas.DataFrame) + self.assertEqual(len(df), 3) # verify the number of rows + self.assertEqual(list(df), ["name", "geog"]) # verify the column names + self.assertEqual(df.name.dtype.name, "object") + self.assertEqual(df.geog.dtype.name, "object") + self.assertIsInstance(df.geog, pandas.Series) + self.assertEqual([v.__class__.__name__ for v in df.geog], + ['Point', 'float', 'Polygon']) + + @mock.patch("google.cloud.bigquery.table.geopandas", new=None) + def test_to_geodataframe_error_if_geopandas_is_none(self): + with self.assertRaises(ValueError): + # No can do if no shapely + self._make_one_from_data().to_geodataframe() + + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + def test_to_geodataframe(self): + row_iterator = self._make_one_from_data( + (("name", "STRING"), ("geog", "GEOGRAPHY")), + (("foo", "Point(0 0)"), + ("bar", None), + ("baz", "Polygon((0 0, 0 1, 1 0, 0 0))"), + )) + df = row_iterator.to_geodataframe(create_bqstorage_client=False) + self.assertIsInstance(df, geopandas.GeoDataFrame) + self.assertEqual(len(df), 3) # verify the number of rows + self.assertEqual(list(df), ["name", "geog"]) # verify the column names + self.assertEqual(df.name.dtype.name, "object") + self.assertEqual(df.geog.dtype.name, "geometry") + self.assertIsInstance(df.geog, geopandas.GeoSeries) + self.assertEqual(list(map(str, df.area)), ['0.0', 'nan', '0.5']) + self.assertEqual(list(map(str, df.geog.area)), ['0.0', 'nan', '0.5']) + + @unittest.skipIf(geopandas is None, "Requires 
`geopandas`") + def test_to_geodataframe_ambiguous_geog(self): + row_iterator = self._make_one_from_data( + (("name", "STRING"), ("geog", "GEOGRAPHY"), ("geog2", "GEOGRAPHY")), + ()) + with self.assertRaises(ValueError): + row_iterator.to_geodataframe(create_bqstorage_client=False) + + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + def test_to_geodataframe_no_geog(self): + row_iterator = self._make_one_from_data( + (("name", "STRING"), ("geog", "STRING")), + ()) + with self.assertRaises(TypeError): + row_iterator.to_geodataframe(create_bqstorage_client=False) + + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + def test_to_geodataframe_w_geography_column(self): + row_iterator = self._make_one_from_data( + (("name", "STRING"), ("geog", "GEOGRAPHY"), ("geog2", "GEOGRAPHY")), + (("foo", "Point(0 0)", "Point(1 1)"), + ("bar", None, "Point(2 2)"), + ("baz", "Polygon((0 0, 0 1, 1 0, 0 0))", "Point(3 3)"), + )) + df = row_iterator.to_geodataframe( + create_bqstorage_client=False, geography_column='geog' + ) + self.assertIsInstance(df, geopandas.GeoDataFrame) + self.assertEqual(len(df), 3) # verify the number of rows + self.assertEqual(list(df), ["name", "geog", "geog2"]) # verify the column names + self.assertEqual(df.name.dtype.name, "object") + self.assertEqual(df.geog.dtype.name, "geometry") + self.assertEqual(df.geog2.dtype.name, "object") + self.assertIsInstance(df.geog, geopandas.GeoSeries) + self.assertEqual(list(map(str, df.area)), ['0.0', 'nan', '0.5']) + self.assertEqual(list(map(str, df.geog.area)), ['0.0', 'nan', '0.5']) + self.assertEqual([v.__class__.__name__ for v in df.geog], + ['Point', 'NoneType', 'Polygon']) + + # Geog2 isn't a GeoSeries, but it contains geomentries: + self.assertIsInstance(df.geog2, pandas.Series) + self.assertEqual([v.__class__.__name__ for v in df.geog2], + ['Point', 'Point', 'Point']) + # and can easily be converted to a GeoSeries + self.assertEqual(list(map(str, geopandas.GeoSeries(df.geog2).area)), + 
['0.0', '0.0', '0.0']) + class TestPartitionRange(unittest.TestCase): def _get_target_class(self): From ecff6f9c4b30b34ea58e599cff3194d207635d2b Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 3 Aug 2021 11:36:49 -0400 Subject: [PATCH 03/40] blacken --- google/cloud/bigquery/table.py | 20 +++++--- tests/system/test_pandas.py | 53 ++++++++++---------- tests/unit/job/test_query_pandas.py | 54 ++++++++++++--------- tests/unit/test_table.py | 75 ++++++++++++++++------------- 4 files changed, 110 insertions(+), 92 deletions(-) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 06f08f181..5f35a1520 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -35,7 +35,7 @@ except ImportError: geopandas = None else: - CRS = 'EPSG:4326' + CRS = "EPSG:4326" try: from shapely import wkt @@ -1983,7 +1983,7 @@ def to_dataframe( if geography_as_object: for field in self.schema: - if field.field_type.upper() == 'GEOGRAPHY': + if field.field_type.upper() == "GEOGRAPHY": df[field.name] = df[field.name].dropna().apply(wkt.loads) return df @@ -2075,24 +2075,28 @@ def to_geodataframe( geography_columns = set( field.name for field in self.schema - if field.field_type.upper() == 'GEOGRAPHY' + if field.field_type.upper() == "GEOGRAPHY" ) if not geography_columns: - raise TypeError("There must be at least one GEOGRAPHY column" - " to create a GeoDataFrame") + raise TypeError( + "There must be at least one GEOGRAPHY column" + " to create a GeoDataFrame" + ) if geography_column: if geography_column not in geography_columns: raise ValueError( f"The given geography column, {geography_column} doesn't name" - f" a GEOGRAPHY column in the result.") + f" a GEOGRAPHY column in the result." + ) elif len(geography_columns) == 1: [geography_column] = geography_columns else: raise ValueError( "There is more than one GEOGRAPHY column in the result. 
" "The geography_column argument must be used to specify which " - "one to use to create a GeoDataFrame") + "one to use to create a GeoDataFrame" + ) df = self.to_dataframe( bqstorage_client, @@ -2101,7 +2105,7 @@ def to_geodataframe( create_bqstorage_client, date_as_object, geography_as_object=True, - ) + ) return geopandas.GeoDataFrame(df, crs=CRS, geometry=geography_column) diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 9bbdac642..3b9c6cf63 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -817,17 +817,17 @@ def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): ('bar', st_geogfromtext('point(0 1)')), ('baz', null) """ - ).result() - df = (bigquery_client - .query(f"select * from {dataset_id}.lake order by name") - .to_dataframe(geography_as_object=True) - ) + ).result() + df = bigquery_client.query( + f"select * from {dataset_id}.lake order by name" + ).to_dataframe(geography_as_object=True) assert str(df) == ( - ' name geog\n' - '0 bar POINT (0 1)\n' - '1 baz NaN\n' - '2 foo POINT (0 0)') - assert [v.__class__.__name__ for v in df['geog']] == ['Point', 'float', 'Point'] + " name geog\n" + "0 bar POINT (0 1)\n" + "1 baz NaN\n" + "2 foo POINT (0 0)" + ) + assert [v.__class__.__name__ for v in df["geog"]] == ["Point", "float", "Point"] def test_to_geodataframe(bigquery_client, dataset_id): @@ -841,19 +841,20 @@ def test_to_geodataframe(bigquery_client, dataset_id): ('bar', st_geogfromtext('polygon((0 0, 1 1, 1 0, 0 0))')), ('baz', null) """ - ).result() - df = (bigquery_client - .query(f"select * from {dataset_id}.lake order by name") - .to_geodataframe() - ) - assert [v.__class__.__name__ for v in df['geog']] == [ - 'Polygon', 'NoneType', 'Point'] - assert df.__class__.__name__ == 'GeoDataFrame' - assert df['geog'].__class__.__name__ == 'GeoSeries' - assert list(map(str, df['geog'])) == [ - 'POLYGON ((1 0, 1 1, 0 0, 1 0))', 'None', 'POINT (0 0)'] - assert str(df.area) == ( - '0 0.5\n' - 
'1 NaN\n' - '2 0.0\n' - 'dtype: float64') + ).result() + df = bigquery_client.query( + f"select * from {dataset_id}.lake order by name" + ).to_geodataframe() + assert [v.__class__.__name__ for v in df["geog"]] == [ + "Polygon", + "NoneType", + "Point", + ] + assert df.__class__.__name__ == "GeoDataFrame" + assert df["geog"].__class__.__name__ == "GeoSeries" + assert list(map(str, df["geog"])) == [ + "POLYGON ((1 0, 1 1, 0 0, 1 0))", + "None", + "POINT (0 0)", + ] + assert str(df.area) == ("0 0.5\n" "1 NaN\n" "2 0.0\n" "dtype: float64") diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index f973fb7ee..7454feab2 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -443,17 +443,11 @@ def _make_job(schema=(), rows=()): "totalRows": str(len(rows)), "schema": { "fields": [ - dict(name=field[0], type=field[1], mode=field[2]) - for field in schema - ] - }, - } - tabledata_resource = { - "rows": [ - {"f": [{"v": v} for v in row]} - for row in rows + dict(name=field[0], type=field[1], mode=field[2]) for field in schema ] + }, } + tabledata_resource = {"rows": [{"f": [{"v": v} for v in row]} for row in rows]} done_resource = copy.deepcopy(begun_resource) done_resource["status"] = {"state": "DONE"} connection = _make_connection( @@ -467,11 +461,13 @@ def _make_job(schema=(), rows=()): def test_to_dataframe(): job = _make_job( (("name", "STRING", "NULLABLE"), ("age", "INTEGER", "NULLABLE")), - (("Phred Phlyntstone", "32"), - ("Bharney Rhubble", "33"), - ("Wylma Phlyntstone", "29"), - ("Bhettye Rhubble", "27"), - )) + ( + ("Phred Phlyntstone", "32"), + ("Bharney Rhubble", "33"), + ("Wylma Phlyntstone", "29"), + ("Bhettye Rhubble", "27"), + ), + ) df = job.to_dataframe(create_bqstorage_client=False) assert isinstance(df, pandas.DataFrame) @@ -890,32 +886,42 @@ def test_to_dataframe_w_tqdm_max_results(): def test_to_dataframe_geography_as_object(): job = _make_job( (("name", "STRING", "NULLABLE"), 
("geog", "GEOGRAPHY", "NULLABLE")), - (("Phred Phlyntstone", "Point(0 0)"), - ("Bharney Rhubble", "Point(0 1)"), - ("Wylma Phlyntstone", None), - )) + ( + ("Phred Phlyntstone", "Point(0 0)"), + ("Bharney Rhubble", "Point(0 1)"), + ("Wylma Phlyntstone", None), + ), + ) df = job.to_dataframe(create_bqstorage_client=False, geography_as_object=True) assert isinstance(df, pandas.DataFrame) assert len(df) == 3 # verify the number of rows assert list(df) == ["name", "geog"] # verify the column names assert [v.__class__.__name__ for v in df.geog] == [ - 'Point', 'Point', 'float'] # float because nan + "Point", + "Point", + "float", + ] # float because nan @pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") def test_to_geodataframe(): job = _make_job( (("name", "STRING", "NULLABLE"), ("geog", "GEOGRAPHY", "NULLABLE")), - (("Phred Phlyntstone", "Point(0 0)"), - ("Bharney Rhubble", "Point(0 1)"), - ("Wylma Phlyntstone", None), - )) + ( + ("Phred Phlyntstone", "Point(0 0)"), + ("Bharney Rhubble", "Point(0 1)"), + ("Wylma Phlyntstone", None), + ), + ) df = job.to_geodataframe(create_bqstorage_client=False) assert isinstance(df, geopandas.GeoDataFrame) assert len(df) == 3 # verify the number of rows assert list(df) == ["name", "geog"] # verify the column names assert [v.__class__.__name__ for v in df.geog] == [ - 'Point', 'Point', 'NoneType'] # float because nan + "Point", + "Point", + "NoneType", + ] # float because nan assert isinstance(df.geog, geopandas.GeoSeries) diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index e09e85774..8bc14a126 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -1716,11 +1716,9 @@ def _make_one( def _make_one_from_data(self, schema=(), rows=()): from google.cloud.bigquery.schema import SchemaField + schema = [SchemaField(*a) for a in schema] - rows = [ - {"f": [{"v": v} for v in row]} - for row in rows - ] + rows = [{"f": [{"v": v} for v in row]} for row in rows] path = "/foo" api_request = 
mock.Mock(return_value={"rows": rows}) @@ -3026,7 +3024,6 @@ def test_to_dataframe_error_if_shapely_wkt_is_none(self): # No can do if no shapely self._make_one_from_data().to_dataframe(geography_as_object=True) - @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_max_results_w_bqstorage_warning(self): from google.cloud.bigquery.schema import SchemaField @@ -3788,10 +3785,12 @@ def test_to_dataframe_concat_categorical_dtype_w_pyarrow(self): def test_to_dataframe_geography_as_object(self): row_iterator = self._make_one_from_data( (("name", "STRING"), ("geog", "GEOGRAPHY")), - (("foo", "Point(0 0)"), - ("bar", None), - ("baz", "Polygon((0 0, 0 1, 1 0, 0 0))"), - )) + ( + ("foo", "Point(0 0)"), + ("bar", None), + ("baz", "Polygon((0 0, 0 1, 1 0, 0 0))"), + ), + ) df = row_iterator.to_dataframe( create_bqstorage_client=False, geography_as_object=True, ) @@ -3801,8 +3800,9 @@ def test_to_dataframe_geography_as_object(self): self.assertEqual(df.name.dtype.name, "object") self.assertEqual(df.geog.dtype.name, "object") self.assertIsInstance(df.geog, pandas.Series) - self.assertEqual([v.__class__.__name__ for v in df.geog], - ['Point', 'float', 'Polygon']) + self.assertEqual( + [v.__class__.__name__ for v in df.geog], ["Point", "float", "Polygon"] + ) @mock.patch("google.cloud.bigquery.table.geopandas", new=None) def test_to_geodataframe_error_if_geopandas_is_none(self): @@ -3814,10 +3814,12 @@ def test_to_geodataframe_error_if_geopandas_is_none(self): def test_to_geodataframe(self): row_iterator = self._make_one_from_data( (("name", "STRING"), ("geog", "GEOGRAPHY")), - (("foo", "Point(0 0)"), - ("bar", None), - ("baz", "Polygon((0 0, 0 1, 1 0, 0 0))"), - )) + ( + ("foo", "Point(0 0)"), + ("bar", None), + ("baz", "Polygon((0 0, 0 1, 1 0, 0 0))"), + ), + ) df = row_iterator.to_geodataframe(create_bqstorage_client=False) self.assertIsInstance(df, geopandas.GeoDataFrame) self.assertEqual(len(df), 3) # verify the number of rows @@ -3825,22 +3827,22 @@ 
def test_to_geodataframe(self): self.assertEqual(df.name.dtype.name, "object") self.assertEqual(df.geog.dtype.name, "geometry") self.assertIsInstance(df.geog, geopandas.GeoSeries) - self.assertEqual(list(map(str, df.area)), ['0.0', 'nan', '0.5']) - self.assertEqual(list(map(str, df.geog.area)), ['0.0', 'nan', '0.5']) + self.assertEqual(list(map(str, df.area)), ["0.0", "nan", "0.5"]) + self.assertEqual(list(map(str, df.geog.area)), ["0.0", "nan", "0.5"]) @unittest.skipIf(geopandas is None, "Requires `geopandas`") def test_to_geodataframe_ambiguous_geog(self): row_iterator = self._make_one_from_data( - (("name", "STRING"), ("geog", "GEOGRAPHY"), ("geog2", "GEOGRAPHY")), - ()) + (("name", "STRING"), ("geog", "GEOGRAPHY"), ("geog2", "GEOGRAPHY")), () + ) with self.assertRaises(ValueError): row_iterator.to_geodataframe(create_bqstorage_client=False) @unittest.skipIf(geopandas is None, "Requires `geopandas`") def test_to_geodataframe_no_geog(self): row_iterator = self._make_one_from_data( - (("name", "STRING"), ("geog", "STRING")), - ()) + (("name", "STRING"), ("geog", "STRING")), () + ) with self.assertRaises(TypeError): row_iterator.to_geodataframe(create_bqstorage_client=False) @@ -3848,12 +3850,14 @@ def test_to_geodataframe_no_geog(self): def test_to_geodataframe_w_geography_column(self): row_iterator = self._make_one_from_data( (("name", "STRING"), ("geog", "GEOGRAPHY"), ("geog2", "GEOGRAPHY")), - (("foo", "Point(0 0)", "Point(1 1)"), - ("bar", None, "Point(2 2)"), - ("baz", "Polygon((0 0, 0 1, 1 0, 0 0))", "Point(3 3)"), - )) + ( + ("foo", "Point(0 0)", "Point(1 1)"), + ("bar", None, "Point(2 2)"), + ("baz", "Polygon((0 0, 0 1, 1 0, 0 0))", "Point(3 3)"), + ), + ) df = row_iterator.to_geodataframe( - create_bqstorage_client=False, geography_column='geog' + create_bqstorage_client=False, geography_column="geog" ) self.assertIsInstance(df, geopandas.GeoDataFrame) self.assertEqual(len(df), 3) # verify the number of rows @@ -3862,18 +3866,21 @@ def 
test_to_geodataframe_w_geography_column(self): self.assertEqual(df.geog.dtype.name, "geometry") self.assertEqual(df.geog2.dtype.name, "object") self.assertIsInstance(df.geog, geopandas.GeoSeries) - self.assertEqual(list(map(str, df.area)), ['0.0', 'nan', '0.5']) - self.assertEqual(list(map(str, df.geog.area)), ['0.0', 'nan', '0.5']) - self.assertEqual([v.__class__.__name__ for v in df.geog], - ['Point', 'NoneType', 'Polygon']) + self.assertEqual(list(map(str, df.area)), ["0.0", "nan", "0.5"]) + self.assertEqual(list(map(str, df.geog.area)), ["0.0", "nan", "0.5"]) + self.assertEqual( + [v.__class__.__name__ for v in df.geog], ["Point", "NoneType", "Polygon"] + ) # Geog2 isn't a GeoSeries, but it contains geomentries: self.assertIsInstance(df.geog2, pandas.Series) - self.assertEqual([v.__class__.__name__ for v in df.geog2], - ['Point', 'Point', 'Point']) + self.assertEqual( + [v.__class__.__name__ for v in df.geog2], ["Point", "Point", "Point"] + ) # and can easily be converted to a GeoSeries - self.assertEqual(list(map(str, geopandas.GeoSeries(df.geog2).area)), - ['0.0', '0.0', '0.0']) + self.assertEqual( + list(map(str, geopandas.GeoSeries(df.geog2).area)), ["0.0", "0.0", "0.0"] + ) class TestPartitionRange(unittest.TestCase): From 766be7742f85b92f9139e9dbbd9c99bc7f8c8341 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 3 Aug 2021 11:57:13 -0400 Subject: [PATCH 04/40] Need to use different table names in tests. 
--- tests/system/test_pandas.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 3b9c6cf63..029001306 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -832,18 +832,18 @@ def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): def test_to_geodataframe(bigquery_client, dataset_id): bigquery_client.query( - f"create table {dataset_id}.lake (name string, geog geography)" + f"create table {dataset_id}.geolake (name string, geog geography)" ).result() bigquery_client.query( f""" - insert into {dataset_id}.lake (name, geog) values + insert into {dataset_id}.geolake (name, geog) values ('foo', st_geogfromtext('point(0 0)')), ('bar', st_geogfromtext('polygon((0 0, 1 1, 1 0, 0 0))')), ('baz', null) """ ).result() df = bigquery_client.query( - f"select * from {dataset_id}.lake order by name" + f"select * from {dataset_id}.geolake order by name" ).to_geodataframe() assert [v.__class__.__name__ for v in df["geog"]] == [ "Polygon", From a9045042da362c1def181328e90ea97f3d3aad0f Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 3 Aug 2021 13:40:42 -0400 Subject: [PATCH 05/40] check messages in error assertions (for geo) --- tests/unit/test_table.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 8bc14a126..9f88bedb9 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -14,6 +14,7 @@ import datetime import logging +import re import time import types import unittest @@ -3020,8 +3021,13 @@ def test_to_dataframe_error_if_pandas_is_none(self): @mock.patch("google.cloud.bigquery.table.wkt", new=None) def test_to_dataframe_error_if_shapely_wkt_is_none(self): - with self.assertRaises(ValueError): - # No can do if no shapely + with self.assertRaisesRegex( + ValueError, + re.escape( + "The shapely library is not installed, please 
install " + "shapely to use the geography_as_object option." + ), + ): self._make_one_from_data().to_dataframe(geography_as_object=True) @unittest.skipIf(pandas is None, "Requires `pandas`") @@ -3806,8 +3812,12 @@ def test_to_dataframe_geography_as_object(self): @mock.patch("google.cloud.bigquery.table.geopandas", new=None) def test_to_geodataframe_error_if_geopandas_is_none(self): - with self.assertRaises(ValueError): - # No can do if no shapely + with self.assertRaisesRegex( + ValueError, + re.escape( + "The geopandas library is not installed, please install " + "geopandas to use the to_geodataframe() function."), + ): self._make_one_from_data().to_geodataframe() @unittest.skipIf(geopandas is None, "Requires `geopandas`") @@ -3835,7 +3845,13 @@ def test_to_geodataframe_ambiguous_geog(self): row_iterator = self._make_one_from_data( (("name", "STRING"), ("geog", "GEOGRAPHY"), ("geog2", "GEOGRAPHY")), () ) - with self.assertRaises(ValueError): + with self.assertRaisesRegex( + ValueError, + re.escape( + "There is more than one GEOGRAPHY column in the result. 
" + "The geography_column argument must be used to specify which " + "one to use to create a GeoDataFrame"), + ): row_iterator.to_geodataframe(create_bqstorage_client=False) @unittest.skipIf(geopandas is None, "Requires `geopandas`") @@ -3843,7 +3859,11 @@ def test_to_geodataframe_no_geog(self): row_iterator = self._make_one_from_data( (("name", "STRING"), ("geog", "STRING")), () ) - with self.assertRaises(TypeError): + with self.assertRaisesRegex( + TypeError, + re.escape("There must be at least one GEOGRAPHY column" + " to create a GeoDataFrame"), + ): row_iterator.to_geodataframe(create_bqstorage_client=False) @unittest.skipIf(geopandas is None, "Requires `geopandas`") From bc985f95ceffef6f2e1b5e0ae8940fe614f5c264 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 3 Aug 2021 13:49:48 -0400 Subject: [PATCH 06/40] added missing comma --- google/cloud/bigquery/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 5f35a1520..490613b6d 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2086,7 +2086,7 @@ def to_geodataframe( if geography_column: if geography_column not in geography_columns: raise ValueError( - f"The given geography column, {geography_column} doesn't name" + f"The given geography column, {geography_column}, doesn't name" f" a GEOGRAPHY column in the result." ) elif len(geography_columns) == 1: From fb6eafdb0ddda51c9570374fa14fa382ce7015d5 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 3 Aug 2021 13:50:07 -0400 Subject: [PATCH 07/40] Added missing tests. --- tests/unit/test_table.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 9f88bedb9..93a9aee4c 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -1671,6 +1671,18 @@ def test_to_dataframe_iterable(self): self.assertEqual(len(df), 0) # Verify the number of rows. 
self.assertEqual(len(df.columns), 0) + @mock.patch("google.cloud.bigquery.table.geopandas", new=None) + def test_to_geodataframe_if_geopandas_is_none(self): + row_iterator = self._make_one() + with self.assertRaisesRegex( + ValueError, + re.escape( + "The geopandas library is not installed, please install " + "geopandas to use the to_geodataframe() function." + ), + ): + row_iterator.to_geodataframe(create_bqstorage_client=False) + @unittest.skipIf(geopandas is None, "Requires `geopandas`") def test_to_geodataframe(self): row_iterator = self._make_one() @@ -3854,6 +3866,20 @@ def test_to_geodataframe_ambiguous_geog(self): ): row_iterator.to_geodataframe(create_bqstorage_client=False) + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + def test_to_geodataframe_bad_geography_column(self): + row_iterator = self._make_one_from_data( + (("name", "STRING"), ("geog", "GEOGRAPHY"), ("geog2", "GEOGRAPHY")), () + ) + with self.assertRaisesRegex( + ValueError, + re.escape( + "The given geography column, xxx, doesn't name" + " a GEOGRAPHY column in the result."), + ): + row_iterator.to_geodataframe(create_bqstorage_client=False, + geography_column="xxx") + @unittest.skipIf(geopandas is None, "Requires `geopandas`") def test_to_geodataframe_no_geog(self): row_iterator = self._make_one_from_data( From 0547801dad52504085bb365646305ec2dcd0cfff Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 3 Aug 2021 13:55:57 -0400 Subject: [PATCH 08/40] blacken --- tests/unit/test_table.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 93a9aee4c..ea61d3868 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -1679,7 +1679,7 @@ def test_to_geodataframe_if_geopandas_is_none(self): re.escape( "The geopandas library is not installed, please install " "geopandas to use the to_geodataframe() function." 
- ), + ), ): row_iterator.to_geodataframe(create_bqstorage_client=False) @@ -3038,7 +3038,7 @@ def test_to_dataframe_error_if_shapely_wkt_is_none(self): re.escape( "The shapely library is not installed, please install " "shapely to use the geography_as_object option." - ), + ), ): self._make_one_from_data().to_dataframe(geography_as_object=True) @@ -3828,7 +3828,8 @@ def test_to_geodataframe_error_if_geopandas_is_none(self): ValueError, re.escape( "The geopandas library is not installed, please install " - "geopandas to use the to_geodataframe() function."), + "geopandas to use the to_geodataframe() function." + ), ): self._make_one_from_data().to_geodataframe() @@ -3862,7 +3863,8 @@ def test_to_geodataframe_ambiguous_geog(self): re.escape( "There is more than one GEOGRAPHY column in the result. " "The geography_column argument must be used to specify which " - "one to use to create a GeoDataFrame"), + "one to use to create a GeoDataFrame" + ), ): row_iterator.to_geodataframe(create_bqstorage_client=False) @@ -3875,10 +3877,12 @@ def test_to_geodataframe_bad_geography_column(self): ValueError, re.escape( "The given geography column, xxx, doesn't name" - " a GEOGRAPHY column in the result."), + " a GEOGRAPHY column in the result." 
+ ), ): - row_iterator.to_geodataframe(create_bqstorage_client=False, - geography_column="xxx") + row_iterator.to_geodataframe( + create_bqstorage_client=False, geography_column="xxx" + ) @unittest.skipIf(geopandas is None, "Requires `geopandas`") def test_to_geodataframe_no_geog(self): @@ -3887,8 +3891,10 @@ def test_to_geodataframe_no_geog(self): ) with self.assertRaisesRegex( TypeError, - re.escape("There must be at least one GEOGRAPHY column" - " to create a GeoDataFrame"), + re.escape( + "There must be at least one GEOGRAPHY column" + " to create a GeoDataFrame" + ), ): row_iterator.to_geodataframe(create_bqstorage_client=False) From e0dde7c85c4b8ff43ed26ce687c76eaaa052ae10 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 3 Aug 2021 15:10:19 -0400 Subject: [PATCH 09/40] Added tests showing delegation in some to_geodataframe implementations --- tests/unit/job/test_query_pandas.py | 45 +++++++++++++++++++++++ tests/unit/test_table.py | 56 +++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index 7454feab2..0bb5b8122 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -925,3 +925,48 @@ def test_to_geodataframe(): "NoneType", ] # float because nan assert isinstance(df.geog, geopandas.GeoSeries) + + +@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") +@mock.patch("google.cloud.bigquery.job.query.wait_for_query") +def test_query_job_to_geodataframe_delegation(wait_for_query): + """ + QueryJob.to_geodataframe just delegates to RowIterator.to_geodataframe. + + This test just demonstrates that. We don't need to test all the + variations, which are tested for RowIterator. 
+ """ + import numpy + job = _make_job() + bqstorage_client = object() + dtypes = dict(xxx=numpy.dtype('int64')) + progress_bar_type = 'normal' + create_bqstorage_client = False + date_as_object = False + max_results = 42 + geography_column = 'g' + + df = job.to_geodataframe( + bqstorage_client=bqstorage_client, + dtypes=dtypes, + progress_bar_type=progress_bar_type, + create_bqstorage_client=create_bqstorage_client, + date_as_object=date_as_object, + max_results=max_results, + geography_column=geography_column, + ) + + wait_for_query.assert_called_once_with( + job, progress_bar_type, max_results=max_results + ) + row_iterator = wait_for_query.return_value + row_iterator.to_geodataframe.assert_called_once_with( + bqstorage_client=bqstorage_client, + dtypes=dtypes, + progress_bar_type=progress_bar_type, + create_bqstorage_client=create_bqstorage_client, + date_as_object=date_as_object, + geography_column=geography_column, + ) + assert df is row_iterator.to_geodataframe.return_value + diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index ea61d3868..a045fd920 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -3934,6 +3934,62 @@ def test_to_geodataframe_w_geography_column(self): list(map(str, geopandas.GeoSeries(df.geog2).area)), ["0.0", "0.0", "0.0"] ) + @unittest.skipIf(geopandas is None, "Requires `geopandas`") + @mock.patch("google.cloud.bigquery.table.RowIterator.to_dataframe") + def test_rowiterator_to_geodataframe_delegation(self, to_dataframe): + """ + RowIterator.to_geodataframe just delegates to RowIterator.to_dataframe. + + This test just demonstrates that. We don't need to test all the + variations, which are tested for to_dataframe. 
+ """ + import numpy + import shapely + + row_iterator = self._make_one_from_data( + (("name", "STRING"), ("g", "GEOGRAPHY")) + ) + bqstorage_client = object() + dtypes = dict(xxx=numpy.dtype('int64')) + progress_bar_type = 'normal' + create_bqstorage_client = False + date_as_object = False + geography_column = 'g' + + to_dataframe.return_value = pandas.DataFrame( + dict(name=["foo"], + g=[shapely.wkt.loads('point(0 0)')], + )) + + df = row_iterator.to_geodataframe( + bqstorage_client=bqstorage_client, + dtypes=dtypes, + progress_bar_type=progress_bar_type, + create_bqstorage_client=create_bqstorage_client, + date_as_object=date_as_object, + geography_column=geography_column, + ) + + to_dataframe.assert_called_once_with( + bqstorage_client, + dtypes, + progress_bar_type, + create_bqstorage_client, + date_as_object, + geography_as_object=True) + + self.assertIsInstance(df, geopandas.GeoDataFrame) + self.assertEqual(len(df), 1) # verify the number of rows + self.assertEqual(list(df), ["name", "g"]) # verify the column names + self.assertEqual(df.name.dtype.name, "object") + self.assertEqual(df.g.dtype.name, "geometry") + self.assertIsInstance(df.g, geopandas.GeoSeries) + self.assertEqual(list(map(str, df.area)), ["0.0"]) + self.assertEqual(list(map(str, df.g.area)), ["0.0"]) + self.assertEqual( + [v.__class__.__name__ for v in df.g], ["Point"] + ) + class TestPartitionRange(unittest.TestCase): def _get_target_class(self): From 177b099b3650f977a1bda011b2c56181055c99e0 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 3 Aug 2021 15:19:58 -0400 Subject: [PATCH 10/40] Added a missing skip --- tests/unit/test_table.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index a045fd920..f4e38da14 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -3031,6 +3031,7 @@ def test_to_dataframe_error_if_pandas_is_none(self): with self.assertRaises(ValueError): row_iterator.to_dataframe() + 
@unittest.skipIf(pandas is None, "Requires `pandas`") @mock.patch("google.cloud.bigquery.table.wkt", new=None) def test_to_dataframe_error_if_shapely_wkt_is_none(self): with self.assertRaisesRegex( From 86820e29ff79bd7fa18e871fb18d09c26ddd2f12 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 3 Aug 2021 16:17:18 -0400 Subject: [PATCH 11/40] blacken --- tests/unit/job/test_query_pandas.py | 12 ++++++------ tests/unit/test_table.py | 20 +++++++++----------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index 0bb5b8122..b5af90c0b 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -937,14 +937,15 @@ def test_query_job_to_geodataframe_delegation(wait_for_query): variations, which are tested for RowIterator. """ import numpy + job = _make_job() bqstorage_client = object() - dtypes = dict(xxx=numpy.dtype('int64')) - progress_bar_type = 'normal' + dtypes = dict(xxx=numpy.dtype("int64")) + progress_bar_type = "normal" create_bqstorage_client = False date_as_object = False max_results = 42 - geography_column = 'g' + geography_column = "g" df = job.to_geodataframe( bqstorage_client=bqstorage_client, @@ -954,7 +955,7 @@ def test_query_job_to_geodataframe_delegation(wait_for_query): date_as_object=date_as_object, max_results=max_results, geography_column=geography_column, - ) + ) wait_for_query.assert_called_once_with( job, progress_bar_type, max_results=max_results @@ -967,6 +968,5 @@ def test_query_job_to_geodataframe_delegation(wait_for_query): create_bqstorage_client=create_bqstorage_client, date_as_object=date_as_object, geography_column=geography_column, - ) + ) assert df is row_iterator.to_geodataframe.return_value - diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index f4e38da14..7c0057404 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -3951,16 +3951,15 @@ def 
test_rowiterator_to_geodataframe_delegation(self, to_dataframe): (("name", "STRING"), ("g", "GEOGRAPHY")) ) bqstorage_client = object() - dtypes = dict(xxx=numpy.dtype('int64')) - progress_bar_type = 'normal' + dtypes = dict(xxx=numpy.dtype("int64")) + progress_bar_type = "normal" create_bqstorage_client = False date_as_object = False - geography_column = 'g' + geography_column = "g" to_dataframe.return_value = pandas.DataFrame( - dict(name=["foo"], - g=[shapely.wkt.loads('point(0 0)')], - )) + dict(name=["foo"], g=[shapely.wkt.loads("point(0 0)")],) + ) df = row_iterator.to_geodataframe( bqstorage_client=bqstorage_client, @@ -3969,7 +3968,7 @@ def test_rowiterator_to_geodataframe_delegation(self, to_dataframe): create_bqstorage_client=create_bqstorage_client, date_as_object=date_as_object, geography_column=geography_column, - ) + ) to_dataframe.assert_called_once_with( bqstorage_client, @@ -3977,7 +3976,8 @@ def test_rowiterator_to_geodataframe_delegation(self, to_dataframe): progress_bar_type, create_bqstorage_client, date_as_object, - geography_as_object=True) + geography_as_object=True, + ) self.assertIsInstance(df, geopandas.GeoDataFrame) self.assertEqual(len(df), 1) # verify the number of rows @@ -3987,9 +3987,7 @@ def test_rowiterator_to_geodataframe_delegation(self, to_dataframe): self.assertIsInstance(df.g, geopandas.GeoSeries) self.assertEqual(list(map(str, df.area)), ["0.0"]) self.assertEqual(list(map(str, df.g.area)), ["0.0"]) - self.assertEqual( - [v.__class__.__name__ for v in df.g], ["Point"] - ) + self.assertEqual([v.__class__.__name__ for v in df.g], ["Point"]) class TestPartitionRange(unittest.TestCase): From c298fbdd4b59748cafe7396d0561db833c68fa2a Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Wed, 4 Aug 2021 09:08:25 -0400 Subject: [PATCH 12/40] doc tweaks --- google/cloud/bigquery/job/query.py | 48 ++++++++++++++++++++++-------- google/cloud/bigquery/table.py | 43 +++++++++++++------------- 2 files changed, 58 insertions(+), 33 deletions(-) 
diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index eb7027131..c3c280462 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1475,13 +1475,27 @@ def to_dataframe( .. versionadded:: 2.21.0 + geography_as_object (Optional[bool]): + If ``True``, convert GEOGRAPHY data to :mod:`shapely` + geometry objects. If ``False`` (default), don't cast + geography data to :mod:`shapely` geometry objects. + + .. versionadded:: 2.24.0 + Returns: - A :class:`~pandas.DataFrame` populated with row data and column - headers from the query results. The column headers are derived - from the destination table's schema. + `pandas.DataFrame`: + A :class:`~pandas.DataFrame` populated with row data + and column headers from the query results. The column + headers are derived from the destination table's + schema. Raises: - ValueError: If the `pandas` library cannot be imported. + ValueError: + If the :mod:`pandas` library cannot be imported, or + the :mod:`google.cloud.bigquery_storage_v1` module is + required but cannot be imported. Also if + `geography_as_object` is `True`, but the + :mod:`shapely` library cannot be imported. """ query_result = wait_for_query(self, progress_bar_type, max_results=max_results) return query_result.to_dataframe( @@ -1505,8 +1519,8 @@ def to_geodataframe( date_as_object: bool = True, max_results: Optional[int] = None, geography_column: Optional[str] = None, - ) -> "pandas.DataFrame": - """Return a pandas DataFrame from a QueryJob + ) -> "geopandas.GeoDataFrame": + """Return a GeoPandas GeoDataFrame from a QueryJob Args: bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]): @@ -1557,17 +1571,25 @@ def to_geodataframe( .. versionadded:: 2.21.0 geography_column (Optional[str]): - If there are more than one GEOGRAPHY columns, which one to use - to construct a geopandas GeoDataFrame. This option can be ommitted - if there's only one GEOGRAPHY column. 
+ If there are more than one GEOGRAPHY column, + identifies which one to use to construct a GeoPandas + GeoDataFrame. This option can be ommitted if there's + only one GEOGRAPHY column. Returns: - A :class:`~pandas.DataFrame` populated with row data and column - headers from the query results. The column headers are derived - from the destination table's schema. + `geopandas.GeoDataFrame`: + A :class:`geopandas.GeoDataFrame` populated with row + data and column headers from the query results. The + column headers are derived from the destination + table's schema. Raises: - ValueError: If the `pandas` library cannot be imported. + ValueError: + If the :mod:`geopandas` library cannot be imported, or the + :mod:`google.cloud.bigquery_storage_v1` module is + required but cannot be imported. + + .. versionadded:: 2.24.0 """ query_result = wait_for_query(self, progress_bar_type, max_results=max_results) return query_result.to_geodataframe( diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 490613b6d..b626defb1 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1918,22 +1918,25 @@ def to_dataframe( .. versionadded:: 1.26.0 geography_as_object (Optional[bool]): - If ``True``, convert geography data to shapely objects. - If ``False`` (default), don't cast geography data to shapely objects. + If ``True``, convert GEOGRAPHY data to :mod:`shapely` + geometry objects. If ``False`` (default), don't cast + geography data to :mod:`shapely` geometry objects. - .. versionadded:: ??? + .. versionadded:: 2.24.0 Returns: - pandas.DataFrame: + `pandas.DataFrame`: A :class:`~pandas.DataFrame` populated with row data and column headers from the query results. The column headers are derived from the destination table's schema. Raises: ValueError: - If the :mod:`pandas` library cannot be imported, or the - :mod:`google.cloud.bigquery_storage_v1` module is - required but cannot be imported. 
+ If the :mod:`pandas` library cannot be imported, or + the :mod:`google.cloud.bigquery_storage_v1` module is + required but cannot be imported. Also if + `geography_as_object` is `True`, but the + :mod:`shapely` library cannot be imported. """ if pandas is None: @@ -1998,8 +2001,8 @@ def to_geodataframe( create_bqstorage_client: bool = True, date_as_object: bool = True, geography_column: Optional[str] = None, - ) -> "pandas.DataFrame": - """Create a pandas DataFrame by loading all pages of a query. + ) -> "geopandas.GeoDataFrame": + """Create a GeoPandas GeoDataFrame by loading all pages of a query. Args: bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]): @@ -2049,25 +2052,25 @@ def to_geodataframe( to datetime64[ns] dtype. geography_column (Optional[str]): - If there are more than one GEOGRAPHY columns, which one to use - to construct a geopandas GeoDataFrame. This option can be ommitted - if there's only one GEOGRAPHY column. + If there are more than one GEOGRAPHY column, + identifies which one to use to construct a geopandas + GeoDataFrame. This option can be ommitted if there's + only one GEOGRAPHY column. Returns: - geopandas.GeoDataFrame: - A :class:`~pandas.DataFrame` populated with row data and column - headers from the query results. The column headers are derived - from the destination table's schema. + `geopandas.GeoDataFrame`: + A :class:`geopandas.GeoDataFrame` populated with row + data and column headers from the query results. The + column headers are derived from the destination + table's schema. Raises: ValueError: - If the :mod:`pandas` library cannot be imported, or the + If the :mod:`geopandas` library cannot be imported, or the :mod:`google.cloud.bigquery_storage_v1` module is required but cannot be imported. - - .. versionadded:: ??? - + .. 
versionadded:: 2.24.0 """ if geopandas is None: raise ValueError(_NO_GEOPANDAS_ERROR) From e9cc38f91830a09772a84888ebb7e95ca6a50e0a Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Wed, 4 Aug 2021 11:09:52 -0400 Subject: [PATCH 13/40] Updated dataframe_to_bq_schema to recognize geography data --- google/cloud/bigquery/_pandas_helpers.py | 15 ++++++++++++++ tests/unit/test__pandas_helpers.py | 25 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index b381fa5f7..89aef4422 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -27,6 +27,11 @@ except ImportError: # pragma: NO COVER pandas = None +try: + from shapely.geometry.base import BaseGeometry +except ImportError: # pragma: NO COVER + BaseGeometry = type(None) + try: import pyarrow import pyarrow.parquet @@ -71,6 +76,7 @@ "uint8": "INTEGER", "uint16": "INTEGER", "uint32": "INTEGER", + "geometry": "GEOGRAPHY", } @@ -328,6 +334,15 @@ def dataframe_to_bq_schema(dataframe, bq_schema): # Otherwise, try to automatically determine the type based on the # pandas dtype. bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name) + if bq_type is None: + column_data = dataframe[column] + first_valid_index = column_data.first_valid_index() + if first_valid_index is not None: + sample_data = column_data.at[first_valid_index] + if (isinstance(sample_data, BaseGeometry) + and not sample_data is None # Paranoia + ): + bq_type = 'GEOGRAPHY' bq_field = schema.SchemaField(column, bq_type) bq_schema_out.append(bq_field) diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 0ba671cd9..a48aecbf6 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -36,6 +36,11 @@ # Mock out pyarrow when missing, because methods from pyarrow.types are # used in test parameterization. 
pyarrow = mock.Mock() +try: + import geopandas +except ImportError: # pragma: NO COVER + geopandas = None + import pytest import pytz @@ -1175,6 +1180,26 @@ def test_dataframe_to_bq_schema_pyarrow_fallback_fails(module_under_test): assert "struct_field" in str(expected_warnings[0]) +@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") +def test_dataframe_to_bq_schema_geography(module_under_test): + from shapely import wkt + + df = geopandas.GeoDataFrame( + pandas.DataFrame( + dict(name=['foo', 'bar'], + geo1=[None, None], + geo2=[None, wkt.loads('Point(1 1)')]) + ), + geometry='geo1', + ) + bq_schema = module_under_test.dataframe_to_bq_schema(df, []) + assert bq_schema == ( + schema.SchemaField('name', 'STRING'), + schema.SchemaField('geo1', 'GEOGRAPHY'), + schema.SchemaField('geo2', 'GEOGRAPHY'), + ) + + @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_augment_schema_type_detection_succeeds(module_under_test): From 5f866ff6dbe4c2cf92761baa28cae7ed3f999f3f Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Wed, 4 Aug 2021 16:34:13 -0400 Subject: [PATCH 14/40] Make load_from_pandas work with GeoSeries, shapely data, and binary wkb data --- google/cloud/bigquery/_pandas_helpers.py | 60 ++++++++++++---- tests/system/test_pandas.py | 91 ++++++++++++++++++++++++ tests/unit/test__pandas_helpers.py | 70 ++++++++++++++++++ 3 files changed, 208 insertions(+), 13 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 89aef4422..a3b872575 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -26,11 +26,23 @@ import pandas except ImportError: # pragma: NO COVER pandas = None +else: + import numpy try: - from shapely.geometry.base import BaseGeometry + from shapely.geometry.base import BaseGeometry as _BaseGeometry except ImportError: # pragma: NO 
COVER - BaseGeometry = type(None) + _BaseGeometry = type(None) +else: + if pandas is not None: + def _to_wkb(): + from shapely.geos import WKBWriter, lgeos + write = WKBWriter(lgeos).write + notnull = pandas.notnull + def _to_wkb(v): + return write(v) if notnull(v) else v + return _to_wkb + _to_wkb = _to_wkb() try: import pyarrow @@ -130,6 +142,7 @@ def pyarrow_timestamp(): "STRING": pyarrow.string, "TIME": pyarrow_time, "TIMESTAMP": pyarrow_timestamp, + "GEOGRAPHY": pyarrow.string, # Unless we have ninary data } ARROW_SCALAR_IDS_TO_BQ = { # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes @@ -208,14 +221,16 @@ def bq_to_arrow_data_type(field): return data_type_constructor() -def bq_to_arrow_field(bq_field): +def bq_to_arrow_field(bq_field, array_type=None): """Return the Arrow field, corresponding to a given BigQuery column. Returns: None: if the Arrow type cannot be determined. """ arrow_type = bq_to_arrow_data_type(bq_field) - if arrow_type: + if arrow_type is not None: + if array_type is not None: + arrow_type = array_type # For GEOGRAPHY, at least initially is_nullable = bq_field.mode.upper() == "NULLABLE" return pyarrow.field(bq_field.name, arrow_type, nullable=is_nullable) @@ -240,7 +255,24 @@ def bq_to_arrow_schema(bq_schema): def bq_to_arrow_array(series, bq_field): - arrow_type = bq_to_arrow_data_type(bq_field) + if bq_field.field_type.upper() == 'GEOGRAPHY': + arrow_type = None + first = _first_valid(series) + if first is not None: + if series.dtype.name == 'geometry' or isinstance(first, _BaseGeometry): + arrow_type = pyarrow.binary() + # Convert shapey geometry to WKB binary format: + series = series.apply(_to_wkb) + elif isinstance(first, bytes): + arrow_type = pyarrow.binary() + elif series.dtype.name == 'geometry': + # We have a GeoSeries containing all nulls, convert it to a pandas series + series = pandas.Series(numpy.array(series)) + + if arrow_type is None: + arrow_type = bq_to_arrow_data_type(bq_field) + else: + arrow_type = 
bq_to_arrow_data_type(bq_field) field_type_upper = bq_field.field_type.upper() if bq_field.field_type else "" @@ -294,6 +326,11 @@ def list_columns_and_indexes(dataframe): return columns_and_indexes +def _first_valid(series): + first_valid_index = series.first_valid_index() + if first_valid_index is not None: + return series.at[first_valid_index] + def dataframe_to_bq_schema(dataframe, bq_schema): """Convert a pandas DataFrame schema to a BigQuery schema. @@ -335,14 +372,11 @@ def dataframe_to_bq_schema(dataframe, bq_schema): # pandas dtype. bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name) if bq_type is None: - column_data = dataframe[column] - first_valid_index = column_data.first_valid_index() - if first_valid_index is not None: - sample_data = column_data.at[first_valid_index] - if (isinstance(sample_data, BaseGeometry) - and not sample_data is None # Paranoia + sample_data = _first_valid(dataframe[column]) + if (isinstance(sample_data, _BaseGeometry) + and not sample_data is None # Paranoia ): - bq_type = 'GEOGRAPHY' + bq_type = 'GEOGRAPHY' bq_field = schema.SchemaField(column, bq_type) bq_schema_out.append(bq_field) @@ -474,11 +508,11 @@ def dataframe_to_arrow(dataframe, bq_schema): arrow_names = [] arrow_fields = [] for bq_field in bq_schema: - arrow_fields.append(bq_to_arrow_field(bq_field)) arrow_names.append(bq_field.name) arrow_arrays.append( bq_to_arrow_array(get_column_or_index(dataframe, bq_field.name), bq_field) ) + arrow_fields.append(bq_to_arrow_field(bq_field, arrow_arrays[-1].type)) if all((field is not None for field in arrow_fields)): return pyarrow.Table.from_arrays( diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 029001306..d4bcd0f54 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -807,6 +807,7 @@ def test_list_rows_max_results_w_bqstorage(bigquery_client): def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): + pytest.importorskip("shapely") bigquery_client.query( 
f"create table {dataset_id}.lake (name string, geog geography)" ).result() @@ -831,6 +832,7 @@ def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): def test_to_geodataframe(bigquery_client, dataset_id): + pytest.importorskip("geopandas") bigquery_client.query( f"create table {dataset_id}.geolake (name string, geog geography)" ).result() @@ -858,3 +860,92 @@ def test_to_geodataframe(bigquery_client, dataset_id): "POINT (0 0)", ] assert str(df.area) == ("0 0.5\n" "1 NaN\n" "2 0.0\n" "dtype: float64") + + +def test_load_geodataframe(bigquery_client, dataset_id): + geopandas = pytest.importorskip("geopandas") + import pandas + from shapely import wkt + from google.cloud.bigquery.schema import SchemaField + + df = geopandas.GeoDataFrame( + pandas.DataFrame( + dict(name=['foo', 'bar'], + geo1=[None, None], + geo2=[None, wkt.loads('Point(1 1)')]) + ), + geometry='geo1', + ) + + table_id = f"{dataset_id}.lake_from_gp" + bigquery_client.load_table_from_dataframe(df, table_id).result() + + table = bigquery_client.get_table(table_id) + assert table.schema == [ + SchemaField('name', 'STRING', 'NULLABLE'), + SchemaField('geo1', 'GEOGRAPHY', 'NULLABLE'), + SchemaField('geo2', 'GEOGRAPHY', 'NULLABLE'), + ] + assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ + ['bar', None, 'POINT(1 1)'], + ['foo', None, None], + ] + + +def test_load_dataframe_w_shapely(bigquery_client, dataset_id): + pandas = pytest.importorskip("pandas") + wkt = pytest.importorskip("shapely.wkt") + from google.cloud.bigquery.schema import SchemaField + + df = pandas.DataFrame( + dict(name=['foo', 'bar'], + geo=[None, wkt.loads('Point(1 1)')]) + ) + + table_id = f"{dataset_id}.lake_from_shapes" + bigquery_client.load_table_from_dataframe(df, table_id).result() + + table = bigquery_client.get_table(table_id) + assert table.schema == [ + SchemaField('name', 'STRING', 'NULLABLE'), + SchemaField('geo', 'GEOGRAPHY', 'NULLABLE'), + ] + assert sorted(map(list, 
bigquery_client.list_rows(table_id))) == [ + ['bar', 'POINT(1 1)'], + ['foo', None], + ] + + bigquery_client.load_table_from_dataframe(df, table_id).result() + assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ + ['bar', 'POINT(1 1)'], + ['bar', 'POINT(1 1)'], + ['foo', None], + ['foo', None], + ] + + +def test_load_dataframe_w_wkb(bigquery_client, dataset_id): + pandas = pytest.importorskip("pandas") + wkt = pytest.importorskip("shapely.wkt") + from shapely import wkb + from google.cloud.bigquery.schema import SchemaField + + df = pandas.DataFrame( + dict(name=['foo', 'bar'], + geo=[None, wkb.dumps(wkt.loads('Point(1 1)'))]) + ) + + table_id = f"{dataset_id}.lake_from_wkb" + # We create the table first, to inform the interpretation of the wkb data + bigquery_client.query(f"create table {table_id} (name string, geo GEOGRAPHY)") + bigquery_client.load_table_from_dataframe(df, table_id).result() + + table = bigquery_client.get_table(table_id) + assert table.schema == [ + SchemaField('name', 'STRING', 'NULLABLE'), + SchemaField('geo', 'GEOGRAPHY', 'NULLABLE'), + ] + assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ + ['bar', 'POINT(1 1)'], + ['foo', None], + ] diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index a48aecbf6..f66886804 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -604,6 +604,60 @@ def test_bq_to_arrow_array_w_special_floats(module_under_test): assert roundtrip[3] is None +@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +def test_bq_to_arrow_array_w_geography_dtype(module_under_test): + from shapely import wkb, wkt + + bq_field = schema.SchemaField("field_name", "GEOGRAPHY") + + series = geopandas.GeoSeries([None, wkt.loads('point(0 0)')]) + array = module_under_test.bq_to_arrow_array(series, bq_field) + # The result is binary, because 
we use wkb format + assert array.type == pyarrow.binary() + assert array.to_pylist() == [None, wkb.dumps(series[1])] + + # All na: + series = geopandas.GeoSeries([None, None]) + array = module_under_test.bq_to_arrow_array(series, bq_field) + assert array.type == pyarrow.string() + assert array.to_pylist() == list(series) + + +@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +def test_bq_to_arrow_array_w_geography_type_shapely_data(module_under_test): + from shapely import wkb, wkt + + bq_field = schema.SchemaField("field_name", "GEOGRAPHY") + + series = pandas.Series([None, wkt.loads('point(0 0)')]) + array = module_under_test.bq_to_arrow_array(series, bq_field) + # The result is binary, because we use wkb format + assert array.type == pyarrow.binary() + assert array.to_pylist() == [None, wkb.dumps(series[1])] + + # All na: + series = pandas.Series([None, None]) + array = module_under_test.bq_to_arrow_array(series, bq_field) + assert array.type == pyarrow.string() + assert array.to_pylist() == list(series) + + +@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +def test_bq_to_arrow_array_w_geography_type_wkb_data(module_under_test): + from shapely import wkb, wkt + + bq_field = schema.SchemaField("field_name", "GEOGRAPHY") + + series = pandas.Series([None, wkb.dumps(wkt.loads('point(0 0)'))]) + array = module_under_test.bq_to_arrow_array(series, bq_field) + # The result is binary, because we use wkb format + assert array.type == pyarrow.binary() + assert array.to_pylist() == list(series) + + @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_bq_to_arrow_schema_w_unknown_type(module_under_test): fields = ( @@ -1602,3 +1656,19 @@ def test_download_dataframe_row_iterator_dict_sequence_schema(module_under_test) def 
test_table_data_listpage_to_dataframe_skips_stop_iteration(module_under_test): dataframe = module_under_test._row_iterator_page_to_dataframe([], [], {}) assert isinstance(dataframe, pandas.DataFrame) + + +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +def test_bq_to_arrow_field_type_override(module_under_test): + # When loading pandas data, we may need to override the type + # decision based on data contents, because GEOGRAPHY data can be + # stored as either text or binary. + + assert module_under_test.bq_to_arrow_field( + schema.SchemaField('g', 'GEOGRAPHY') + ).type == pyarrow.string() + + assert module_under_test.bq_to_arrow_field( + schema.SchemaField('g', 'GEOGRAPHY'), + pyarrow.binary(), + ).type == pyarrow.binary() From e23b83012b37f7960034210174d4f4be5fa22486 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Wed, 4 Aug 2021 16:43:49 -0400 Subject: [PATCH 15/40] Small (14% for small polygons) optimization for reading wkt data --- google/cloud/bigquery/table.py | 10 ++++++---- tests/unit/test_table.py | 8 ++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index b626defb1..ee5d615a3 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -38,9 +38,11 @@ CRS = "EPSG:4326" try: - from shapely import wkt + import shapely.geos except ImportError: - wkt = None + shapely = None +else: + _read_wkt = shapely.geos.WKTReader(shapely.geos.lgeos).read try: import pyarrow @@ -1941,7 +1943,7 @@ def to_dataframe( """ if pandas is None: raise ValueError(_NO_PANDAS_ERROR) - if geography_as_object and wkt is None: + if geography_as_object and shapely is None: raise ValueError(_NO_SHAPELY_ERROR) if dtypes is None: @@ -1987,7 +1989,7 @@ def to_dataframe( if geography_as_object: for field in self.schema: if field.field_type.upper() == "GEOGRAPHY": - df[field.name] = df[field.name].dropna().apply(wkt.loads) + df[field.name] = 
df[field.name].dropna().apply(_read_wkt) return df diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 7c0057404..b8bb62faf 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -3032,8 +3032,8 @@ def test_to_dataframe_error_if_pandas_is_none(self): row_iterator.to_dataframe() @unittest.skipIf(pandas is None, "Requires `pandas`") - @mock.patch("google.cloud.bigquery.table.wkt", new=None) - def test_to_dataframe_error_if_shapely_wkt_is_none(self): + @mock.patch("google.cloud.bigquery.table.shapely", new=None) + def test_to_dataframe_error_if_shapely_is_none(self): with self.assertRaisesRegex( ValueError, re.escape( @@ -3945,7 +3945,7 @@ def test_rowiterator_to_geodataframe_delegation(self, to_dataframe): variations, which are tested for to_dataframe. """ import numpy - import shapely + from shapely import wkt row_iterator = self._make_one_from_data( (("name", "STRING"), ("g", "GEOGRAPHY")) @@ -3958,7 +3958,7 @@ def test_rowiterator_to_geodataframe_delegation(self, to_dataframe): geography_column = "g" to_dataframe.return_value = pandas.DataFrame( - dict(name=["foo"], g=[shapely.wkt.loads("point(0 0)")],) + dict(name=["foo"], g=[wkt.loads("point(0 0)")],) ) df = row_iterator.to_geodataframe( From 950d957428afef407b9c7d3e927a3aad112d09ab Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Wed, 4 Aug 2021 16:48:35 -0400 Subject: [PATCH 16/40] blacken/lint --- google/cloud/bigquery/_pandas_helpers.py | 22 +++++---- google/cloud/bigquery/job/query.py | 1 + tests/system/test_pandas.py | 58 ++++++++++++------------ tests/unit/test__pandas_helpers.py | 43 ++++++++++-------- 4 files changed, 68 insertions(+), 56 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index a3b872575..636167511 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -35,13 +35,18 @@ _BaseGeometry = type(None) else: if pandas is not None: + def _to_wkb(): from 
shapely.geos import WKBWriter, lgeos + write = WKBWriter(lgeos).write notnull = pandas.notnull + def _to_wkb(v): return write(v) if notnull(v) else v + return _to_wkb + _to_wkb = _to_wkb() try: @@ -142,7 +147,6 @@ def pyarrow_timestamp(): "STRING": pyarrow.string, "TIME": pyarrow_time, "TIMESTAMP": pyarrow_timestamp, - "GEOGRAPHY": pyarrow.string, # Unless we have ninary data } ARROW_SCALAR_IDS_TO_BQ = { # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes @@ -255,17 +259,17 @@ def bq_to_arrow_schema(bq_schema): def bq_to_arrow_array(series, bq_field): - if bq_field.field_type.upper() == 'GEOGRAPHY': + if bq_field.field_type.upper() == "GEOGRAPHY": arrow_type = None first = _first_valid(series) if first is not None: - if series.dtype.name == 'geometry' or isinstance(first, _BaseGeometry): + if series.dtype.name == "geometry" or isinstance(first, _BaseGeometry): arrow_type = pyarrow.binary() # Convert shapey geometry to WKB binary format: series = series.apply(_to_wkb) elif isinstance(first, bytes): arrow_type = pyarrow.binary() - elif series.dtype.name == 'geometry': + elif series.dtype.name == "geometry": # We have a GeoSeries containing all nulls, convert it to a pandas series series = pandas.Series(numpy.array(series)) @@ -331,6 +335,7 @@ def _first_valid(series): if first_valid_index is not None: return series.at[first_valid_index] + def dataframe_to_bq_schema(dataframe, bq_schema): """Convert a pandas DataFrame schema to a BigQuery schema. 
@@ -373,10 +378,11 @@ def dataframe_to_bq_schema(dataframe, bq_schema): bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name) if bq_type is None: sample_data = _first_valid(dataframe[column]) - if (isinstance(sample_data, _BaseGeometry) - and not sample_data is None # Paranoia - ): - bq_type = 'GEOGRAPHY' + if ( + isinstance(sample_data, _BaseGeometry) + and sample_data is not None # Paranoia + ): + bq_type = "GEOGRAPHY" bq_field = schema.SchemaField(column, bq_type) bq_schema_out.append(bq_field) diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index c3c280462..07e88dd60 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -53,6 +53,7 @@ # Assumption: type checks are only used by library developers and CI environments # that have all optional dependencies installed, thus no conditional imports. import pandas + import geopandas import pyarrow from google.api_core import retry as retries from google.cloud import bigquery_storage diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index d4bcd0f54..d79553c7e 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -870,25 +870,27 @@ def test_load_geodataframe(bigquery_client, dataset_id): df = geopandas.GeoDataFrame( pandas.DataFrame( - dict(name=['foo', 'bar'], - geo1=[None, None], - geo2=[None, wkt.loads('Point(1 1)')]) - ), - geometry='geo1', - ) + dict( + name=["foo", "bar"], + geo1=[None, None], + geo2=[None, wkt.loads("Point(1 1)")], + ) + ), + geometry="geo1", + ) table_id = f"{dataset_id}.lake_from_gp" bigquery_client.load_table_from_dataframe(df, table_id).result() table = bigquery_client.get_table(table_id) assert table.schema == [ - SchemaField('name', 'STRING', 'NULLABLE'), - SchemaField('geo1', 'GEOGRAPHY', 'NULLABLE'), - SchemaField('geo2', 'GEOGRAPHY', 'NULLABLE'), + SchemaField("name", "STRING", "NULLABLE"), + SchemaField("geo1", "GEOGRAPHY", "NULLABLE"), + SchemaField("geo2", "GEOGRAPHY", 
"NULLABLE"), ] assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ - ['bar', None, 'POINT(1 1)'], - ['foo', None, None], + ["bar", None, "POINT(1 1)"], + ["foo", None, None], ] @@ -898,29 +900,28 @@ def test_load_dataframe_w_shapely(bigquery_client, dataset_id): from google.cloud.bigquery.schema import SchemaField df = pandas.DataFrame( - dict(name=['foo', 'bar'], - geo=[None, wkt.loads('Point(1 1)')]) - ) + dict(name=["foo", "bar"], geo=[None, wkt.loads("Point(1 1)")]) + ) table_id = f"{dataset_id}.lake_from_shapes" bigquery_client.load_table_from_dataframe(df, table_id).result() table = bigquery_client.get_table(table_id) assert table.schema == [ - SchemaField('name', 'STRING', 'NULLABLE'), - SchemaField('geo', 'GEOGRAPHY', 'NULLABLE'), + SchemaField("name", "STRING", "NULLABLE"), + SchemaField("geo", "GEOGRAPHY", "NULLABLE"), ] assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ - ['bar', 'POINT(1 1)'], - ['foo', None], + ["bar", "POINT(1 1)"], + ["foo", None], ] bigquery_client.load_table_from_dataframe(df, table_id).result() assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ - ['bar', 'POINT(1 1)'], - ['bar', 'POINT(1 1)'], - ['foo', None], - ['foo', None], + ["bar", "POINT(1 1)"], + ["bar", "POINT(1 1)"], + ["foo", None], + ["foo", None], ] @@ -931,9 +932,8 @@ def test_load_dataframe_w_wkb(bigquery_client, dataset_id): from google.cloud.bigquery.schema import SchemaField df = pandas.DataFrame( - dict(name=['foo', 'bar'], - geo=[None, wkb.dumps(wkt.loads('Point(1 1)'))]) - ) + dict(name=["foo", "bar"], geo=[None, wkb.dumps(wkt.loads("Point(1 1)"))]) + ) table_id = f"{dataset_id}.lake_from_wkb" # We create the table first, to inform the interpretation of the wkb data @@ -942,10 +942,10 @@ def test_load_dataframe_w_wkb(bigquery_client, dataset_id): table = bigquery_client.get_table(table_id) assert table.schema == [ - SchemaField('name', 'STRING', 'NULLABLE'), - SchemaField('geo', 'GEOGRAPHY', 'NULLABLE'), + 
SchemaField("name", "STRING", "NULLABLE"), + SchemaField("geo", "GEOGRAPHY", "NULLABLE"), ] assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ - ['bar', 'POINT(1 1)'], - ['foo', None], + ["bar", "POINT(1 1)"], + ["foo", None], ] diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index f66886804..e3635e52e 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -611,7 +611,7 @@ def test_bq_to_arrow_array_w_geography_dtype(module_under_test): bq_field = schema.SchemaField("field_name", "GEOGRAPHY") - series = geopandas.GeoSeries([None, wkt.loads('point(0 0)')]) + series = geopandas.GeoSeries([None, wkt.loads("point(0 0)")]) array = module_under_test.bq_to_arrow_array(series, bq_field) # The result is binary, because we use wkb format assert array.type == pyarrow.binary() @@ -631,7 +631,7 @@ def test_bq_to_arrow_array_w_geography_type_shapely_data(module_under_test): bq_field = schema.SchemaField("field_name", "GEOGRAPHY") - series = pandas.Series([None, wkt.loads('point(0 0)')]) + series = pandas.Series([None, wkt.loads("point(0 0)")]) array = module_under_test.bq_to_arrow_array(series, bq_field) # The result is binary, because we use wkb format assert array.type == pyarrow.binary() @@ -651,7 +651,7 @@ def test_bq_to_arrow_array_w_geography_type_wkb_data(module_under_test): bq_field = schema.SchemaField("field_name", "GEOGRAPHY") - series = pandas.Series([None, wkb.dumps(wkt.loads('point(0 0)'))]) + series = pandas.Series([None, wkb.dumps(wkt.loads("point(0 0)"))]) array = module_under_test.bq_to_arrow_array(series, bq_field) # The result is binary, because we use wkb format assert array.type == pyarrow.binary() @@ -1240,17 +1240,19 @@ def test_dataframe_to_bq_schema_geography(module_under_test): df = geopandas.GeoDataFrame( pandas.DataFrame( - dict(name=['foo', 'bar'], - geo1=[None, None], - geo2=[None, wkt.loads('Point(1 1)')]) - ), - geometry='geo1', - ) + dict( + name=["foo", 
"bar"], + geo1=[None, None], + geo2=[None, wkt.loads("Point(1 1)")], + ) + ), + geometry="geo1", + ) bq_schema = module_under_test.dataframe_to_bq_schema(df, []) assert bq_schema == ( - schema.SchemaField('name', 'STRING'), - schema.SchemaField('geo1', 'GEOGRAPHY'), - schema.SchemaField('geo2', 'GEOGRAPHY'), + schema.SchemaField("name", "STRING"), + schema.SchemaField("geo1", "GEOGRAPHY"), + schema.SchemaField("geo2", "GEOGRAPHY"), ) @@ -1664,11 +1666,14 @@ def test_bq_to_arrow_field_type_override(module_under_test): # decision based on data contents, because GEOGRAPHY data can be # stored as either text or binary. - assert module_under_test.bq_to_arrow_field( - schema.SchemaField('g', 'GEOGRAPHY') - ).type == pyarrow.string() + assert ( + module_under_test.bq_to_arrow_field(schema.SchemaField("g", "GEOGRAPHY")).type + == pyarrow.string() + ) - assert module_under_test.bq_to_arrow_field( - schema.SchemaField('g', 'GEOGRAPHY'), - pyarrow.binary(), - ).type == pyarrow.binary() + assert ( + module_under_test.bq_to_arrow_field( + schema.SchemaField("g", "GEOGRAPHY"), pyarrow.binary(), + ).type + == pyarrow.binary() + ) From 0493acfa1349309c151775a0fba5efc7572854a1 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Wed, 4 Aug 2021 17:13:58 -0400 Subject: [PATCH 17/40] wait for it ... 
--- tests/system/test_pandas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index d79553c7e..4508302df 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -937,7 +937,8 @@ def test_load_dataframe_w_wkb(bigquery_client, dataset_id): table_id = f"{dataset_id}.lake_from_wkb" # We create the table first, to inform the interpretation of the wkb data - bigquery_client.query(f"create table {table_id} (name string, geo GEOGRAPHY)") + bigquery_client.query( + f"create table {table_id} (name string, geo GEOGRAPHY)").result() bigquery_client.load_table_from_dataframe(df, table_id).result() table = bigquery_client.get_table(table_id) From 5d986be6616e0b6421436f7a3b531075cd3ff681 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Thu, 5 Aug 2021 09:17:36 -0400 Subject: [PATCH 18/40] geopandas needs a slightly newer (but still old) version of pandas. --- setup.py | 4 ++-- testing/constraints-3.6.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index d01f3dcf6..b79cc2940 100644 --- a/setup.py +++ b/setup.py @@ -56,8 +56,8 @@ "grpcio >= 1.38.1, < 2.0dev", "pyarrow >= 1.0.0, < 6.0dev", ], - "pandas": ["pandas>=0.23.0", "pyarrow >= 1.0.0, < 6.0dev"], - "geopandas": ["geopandas>=0.6.0"], + "pandas": ["pandas>=0.24.2", "pyarrow >= 1.0.0, < 6.0dev"], + "geopandas": ["geopandas>=0.6.0, <1.0dev"], "bignumeric_type": ["pyarrow >= 3.0.0, < 6.0dev"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt index af6e82efd..8d119d5ea 100644 --- a/testing/constraints-3.6.txt +++ b/testing/constraints-3.6.txt @@ -13,7 +13,7 @@ grpcio==1.38.1 opentelemetry-api==0.11b0 opentelemetry-instrumentation==0.11b0 opentelemetry-sdk==0.11b0 -pandas==0.23.0 +pandas==0.24.2 proto-plus==1.10.0 protobuf==3.12.0 pyarrow==1.0.0 From f5e2f621c1f37036df555907f616bdd23b5f09ae Mon Sep 17 00:00:00 
2001 From: Jim Fulton Date: Thu, 5 Aug 2021 09:19:35 -0400 Subject: [PATCH 19/40] skip coverage on section of code that isn't reachable by tests --- google/cloud/bigquery/_pandas_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 636167511..9e3021ed0 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -34,7 +34,7 @@ except ImportError: # pragma: NO COVER _BaseGeometry = type(None) else: - if pandas is not None: + if pandas is not None: # pragma: NO COVER def _to_wkb(): from shapely.geos import WKBWriter, lgeos From 1084c4e51d2d6fbe3818c7bab608cfe1b8626765 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Thu, 5 Aug 2021 09:42:03 -0400 Subject: [PATCH 20/40] assuage pytype --- google/cloud/bigquery/table.py | 1 + 1 file changed, 1 insertion(+) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index ee5d615a3..09ac44787 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -67,6 +67,7 @@ # Unconditionally import optional dependencies again to tell pytype that # they are not None, avoiding false "no attribute" errors. 
import pandas + import geopandas import pyarrow from google.cloud import bigquery_storage From cdb821d6fa3b7cdad0475fba4c56611badbcfb8d Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Thu, 5 Aug 2021 09:51:56 -0400 Subject: [PATCH 21/40] blacken --- tests/system/test_pandas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 4508302df..0477799ff 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -938,7 +938,8 @@ def test_load_dataframe_w_wkb(bigquery_client, dataset_id): table_id = f"{dataset_id}.lake_from_wkb" # We create the table first, to inform the interpretation of the wkb data bigquery_client.query( - f"create table {table_id} (name string, geo GEOGRAPHY)").result() + f"create table {table_id} (name string, geo GEOGRAPHY)" + ).result() bigquery_client.load_table_from_dataframe(df, table_id).result() table = bigquery_client.get_table(table_id) From 054a85761dba1c149742f8dcba1dccd2cd1228cf Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Thu, 5 Aug 2021 10:34:07 -0400 Subject: [PATCH 22/40] Added geopandas doc snippit --- docs/snippets.py | 24 ++++++++++++++++++++++++ docs/usage/pandas.rst | 15 +++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/docs/snippets.py b/docs/snippets.py index 3f9b9a88c..e3a6746dc 100644 --- a/docs/snippets.py +++ b/docs/snippets.py @@ -30,6 +30,10 @@ import pandas except (ImportError, AttributeError): pandas = None +try: + import geopandas +except (ImportError, AttributeError): + geopandas = None try: import pyarrow except (ImportError, AttributeError): @@ -934,5 +938,25 @@ def test_list_rows_as_dataframe(client): assert len(df) == table.num_rows # verify the number of rows +@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") +def test_query_results_as_geodataframe(client): + # [START bigquery_query_results_geodataframe] + # from google.cloud import bigquery + # client = bigquery.Client() + + sql 
= """ + SELECT created_date, complaint_description, + ST_GEOGPOINT(longitude, latitude) as location + FROM bigquery-public-data.austin_311.311_service_requests + LIMIT 10 + """ + + df = client.query(sql).to_geodataframe() + # [END bigquery_query_results_geodataframe] + assert isinstance(df, geopandas.GeoDataFrame) + assert len(list(df)) == 3 # verify the number of columns + assert len(df) == 10 # verify the number of rows + + if __name__ == "__main__": pytest.main() diff --git a/docs/usage/pandas.rst b/docs/usage/pandas.rst index 9db98dfbb..658ee627c 100644 --- a/docs/usage/pandas.rst +++ b/docs/usage/pandas.rst @@ -37,6 +37,21 @@ To retrieve table rows as a :class:`pandas.DataFrame`: :start-after: [START bigquery_list_rows_dataframe] :end-before: [END bigquery_list_rows_dataframe] + +Retrieve BigQuery GEOGRAPHY data as a GeoPandas GeoDataFrame +------------------------------------------------------------ + +`GeoPandas `_ adds geospatial analytics +capabilities to Pandas. To retrieve query results containing +GEOGRAPHY data as a :class:`geopandas.GeoDataFrame`: + +.. 
literalinclude:: ../snippets.py + :language: python + :dedent: 4 + :start-after: [START bigquery_query_results_geodataframe] + :end-before: [END bigquery_query_results_geodataframe] + + Load a Pandas DataFrame to a BigQuery Table ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 0924a034c69ac5eb9207680fe4fcce679f413566 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Thu, 5 Aug 2021 10:52:09 -0400 Subject: [PATCH 23/40] make sure coordinate system is set correctly --- google/cloud/bigquery/table.py | 2 +- tests/system/test_pandas.py | 4 ++++ tests/unit/test_table.py | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 09ac44787..1bc3574b3 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2208,7 +2208,7 @@ def to_geodataframe( """ if geopandas is None: raise ValueError(_NO_GEOPANDAS_ERROR) - return geopandas.GeoDataFrame() + return geopandas.GeoDataFrame(crs=CRS) def to_dataframe_iterable( self, diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 0477799ff..4324fa519 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -860,6 +860,10 @@ def test_to_geodataframe(bigquery_client, dataset_id): "POINT (0 0)", ] assert str(df.area) == ("0 0.5\n" "1 NaN\n" "2 0.0\n" "dtype: float64") + assert df.crs.srs == 'EPSG:4326' + assert df.crs.name == 'WGS 84' + assert df.geog.crs.srs == 'EPSG:4326' + assert df.geog.crs.name == 'WGS 84' def test_load_geodataframe(bigquery_client, dataset_id): diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index b8bb62faf..b9b44ce7c 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -1689,6 +1689,8 @@ def test_to_geodataframe(self): df = row_iterator.to_geodataframe(create_bqstorage_client=False) self.assertIsInstance(df, geopandas.GeoDataFrame) self.assertEqual(len(df), 0) # verify the number of rows + self.assertEqual(df.crs.srs, 
'EPSG:4326') + self.assertEqual(df.crs.name, 'WGS 84') class TestRowIterator(unittest.TestCase): @@ -3853,6 +3855,10 @@ def test_to_geodataframe(self): self.assertIsInstance(df.geog, geopandas.GeoSeries) self.assertEqual(list(map(str, df.area)), ["0.0", "nan", "0.5"]) self.assertEqual(list(map(str, df.geog.area)), ["0.0", "nan", "0.5"]) + self.assertEqual(df.crs.srs, 'EPSG:4326') + self.assertEqual(df.crs.name, 'WGS 84') + self.assertEqual(df.geog.crs.srs, 'EPSG:4326') + self.assertEqual(df.geog.crs.name, 'WGS 84') @unittest.skipIf(geopandas is None, "Requires `geopandas`") def test_to_geodataframe_ambiguous_geog(self): From c9f8ab6798633d2cebe7fa4bdbc941c93ff0b4b7 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Thu, 5 Aug 2021 11:02:32 -0400 Subject: [PATCH 24/40] We need at least geopandas 0.9.0, which happens to be the current version --- setup.py | 2 +- testing/constraints-3.6.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b79cc2940..174962625 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,7 @@ "pyarrow >= 1.0.0, < 6.0dev", ], "pandas": ["pandas>=0.24.2", "pyarrow >= 1.0.0, < 6.0dev"], - "geopandas": ["geopandas>=0.6.0, <1.0dev"], + "geopandas": ["geopandas>=0.9.0, <1.0dev"], "bignumeric_type": ["pyarrow >= 3.0.0, < 6.0dev"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt index 8d119d5ea..b4aa8cf3a 100644 --- a/testing/constraints-3.6.txt +++ b/testing/constraints-3.6.txt @@ -5,6 +5,7 @@ # # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev", # Then this file should have foo==1.14.0 +geopandas=0.9.0 google-api-core==1.29.0 google-cloud-bigquery-storage==2.0.0 google-cloud-core==1.4.1 From b61bebc75a794983185659f60212062dd357d8cb Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Thu, 5 Aug 2021 11:11:59 -0400 Subject: [PATCH 25/40] Set version bounds for shapely and fixed constraint for geopandas --- setup.py | 2 +- 
testing/constraints-3.6.txt | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 174962625..62e926b76 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,7 @@ "pyarrow >= 1.0.0, < 6.0dev", ], "pandas": ["pandas>=0.24.2", "pyarrow >= 1.0.0, < 6.0dev"], - "geopandas": ["geopandas>=0.9.0, <1.0dev"], + "geopandas": ["geopandas>=0.9.0, <1.0dev", "shapely>=1.6.0, <2.0dev"], "bignumeric_type": ["pyarrow >= 3.0.0, < 6.0dev"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt index b4aa8cf3a..7571817d0 100644 --- a/testing/constraints-3.6.txt +++ b/testing/constraints-3.6.txt @@ -5,7 +5,7 @@ # # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev", # Then this file should have foo==1.14.0 -geopandas=0.9.0 +geopandas==0.9.0 google-api-core==1.29.0 google-cloud-bigquery-storage==2.0.0 google-cloud-core==1.4.1 @@ -19,5 +19,6 @@ proto-plus==1.10.0 protobuf==3.12.0 pyarrow==1.0.0 requests==2.18.0 +shapely==1.6.0 six==1.13.0 tqdm==4.7.4 From edbea72ae173060821de85fc0a6e3b8a4c4eeab0 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Thu, 5 Aug 2021 12:07:50 -0400 Subject: [PATCH 26/40] blacken --- tests/system/test_pandas.py | 8 ++++---- tests/unit/test_table.py | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 4324fa519..48233506d 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -860,10 +860,10 @@ def test_to_geodataframe(bigquery_client, dataset_id): "POINT (0 0)", ] assert str(df.area) == ("0 0.5\n" "1 NaN\n" "2 0.0\n" "dtype: float64") - assert df.crs.srs == 'EPSG:4326' - assert df.crs.name == 'WGS 84' - assert df.geog.crs.srs == 'EPSG:4326' - assert df.geog.crs.name == 'WGS 84' + assert df.crs.srs == "EPSG:4326" + assert df.crs.name == "WGS 84" + assert df.geog.crs.srs == "EPSG:4326" + assert df.geog.crs.name == "WGS 84" def 
test_load_geodataframe(bigquery_client, dataset_id): diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index b9b44ce7c..8965fff18 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -1689,8 +1689,8 @@ def test_to_geodataframe(self): df = row_iterator.to_geodataframe(create_bqstorage_client=False) self.assertIsInstance(df, geopandas.GeoDataFrame) self.assertEqual(len(df), 0) # verify the number of rows - self.assertEqual(df.crs.srs, 'EPSG:4326') - self.assertEqual(df.crs.name, 'WGS 84') + self.assertEqual(df.crs.srs, "EPSG:4326") + self.assertEqual(df.crs.name, "WGS 84") class TestRowIterator(unittest.TestCase): @@ -3855,10 +3855,10 @@ def test_to_geodataframe(self): self.assertIsInstance(df.geog, geopandas.GeoSeries) self.assertEqual(list(map(str, df.area)), ["0.0", "nan", "0.5"]) self.assertEqual(list(map(str, df.geog.area)), ["0.0", "nan", "0.5"]) - self.assertEqual(df.crs.srs, 'EPSG:4326') - self.assertEqual(df.crs.name, 'WGS 84') - self.assertEqual(df.geog.crs.srs, 'EPSG:4326') - self.assertEqual(df.geog.crs.name, 'WGS 84') + self.assertEqual(df.crs.srs, "EPSG:4326") + self.assertEqual(df.crs.name, "WGS 84") + self.assertEqual(df.geog.crs.srs, "EPSG:4326") + self.assertEqual(df.geog.crs.name, "WGS 84") @unittest.skipIf(geopandas is None, "Requires `geopandas`") def test_to_geodataframe_ambiguous_geog(self): From b6f19c998014b195a888523d2a60f3119a25cf8d Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Mon, 16 Aug 2021 15:32:50 -0400 Subject: [PATCH 27/40] rename CRS to _COORDINATE_REFERENCE_SYSTEM --- google/cloud/bigquery/table.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 1bc3574b3..bf1943b14 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -35,7 +35,7 @@ except ImportError: geopandas = None else: - CRS = "EPSG:4326" + _COORDINATE_REFERENCE_SYSTEM = "EPSG:4326" try: import shapely.geos @@ 
-2113,7 +2113,9 @@ def to_geodataframe( geography_as_object=True, ) - return geopandas.GeoDataFrame(df, crs=CRS, geometry=geography_column) + return geopandas.GeoDataFrame( + df, crs=_COORDINATE_REFERENCE_SYSTEM, geometry=geography_column + ) class _EmptyRowIterator(RowIterator): @@ -2208,7 +2210,7 @@ def to_geodataframe( """ if geopandas is None: raise ValueError(_NO_GEOPANDAS_ERROR) - return geopandas.GeoDataFrame(crs=CRS) + return geopandas.GeoDataFrame(crs=_COORDINATE_REFERENCE_SYSTEM) def to_dataframe_iterable( self, From 30435857e308e353d3851292deff8f97ccdde2cd Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Mon, 16 Aug 2021 15:58:55 -0400 Subject: [PATCH 28/40] Fixed intersphinx links --- docs/conf.py | 2 ++ google/cloud/bigquery/job/query.py | 4 ++-- google/cloud/bigquery/table.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index cb347160d..4249a8281 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -365,6 +365,8 @@ "grpc": ("https://grpc.github.io/grpc/python/", None), "proto-plus": ("https://proto-plus-python.readthedocs.io/en/latest/", None), "protobuf": ("https://googleapis.dev/python/protobuf/latest/", None), + "pandas": ('http://pandas.pydata.org/pandas-docs/dev', None), + "geopandas": ("https://geopandas.org/", None) } diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 07e88dd60..11cec75bd 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1484,7 +1484,7 @@ def to_dataframe( .. versionadded:: 2.24.0 Returns: - `pandas.DataFrame`: + pandas.DataFrame: A :class:`~pandas.DataFrame` populated with row data and column headers from the query results. The column headers are derived from the destination table's @@ -1578,7 +1578,7 @@ def to_geodataframe( only one GEOGRAPHY column. 
Returns: - `geopandas.GeoDataFrame`: + geopandas.GeoDataFrame: A :class:`geopandas.GeoDataFrame` populated with row data and column headers from the query results. The column headers are derived from the destination diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index c9cb844b8..807dc1888 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1966,7 +1966,7 @@ def to_dataframe( .. versionadded:: 2.24.0 Returns: - `pandas.DataFrame`: + pandas.DataFrame: A :class:`~pandas.DataFrame` populated with row data and column headers from the query results. The column headers are derived from the destination table's schema. @@ -2099,7 +2099,7 @@ def to_geodataframe( only one GEOGRAPHY column. Returns: - `geopandas.GeoDataFrame`: + geopandas.GeoDataFrame: A :class:`geopandas.GeoDataFrame` populated with row data and column headers from the query results. The column headers are derived from the destination From fe8ea8a2a0ca8a1ab47157e3ee5d1497de4d005b Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Mon, 16 Aug 2021 16:06:20 -0400 Subject: [PATCH 29/40] capitalize shapely --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 62e926b76..37d72bf4f 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,7 @@ "pyarrow >= 1.0.0, < 6.0dev", ], "pandas": ["pandas>=0.24.2", "pyarrow >= 1.0.0, < 6.0dev"], - "geopandas": ["geopandas>=0.9.0, <1.0dev", "shapely>=1.6.0, <2.0dev"], + "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], "bignumeric_type": ["pyarrow >= 3.0.0, < 6.0dev"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ From 2d3c37212f28fcee93bce3605a7f09dfbd96ef49 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 17 Aug 2021 09:40:16 -0400 Subject: [PATCH 30/40] blacken ...whimper --- docs/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 1958b6fce..59a2d8fb3 100644 --- a/docs/conf.py 
+++ b/docs/conf.py @@ -366,8 +366,8 @@ "grpc": ("https://grpc.github.io/grpc/python/", None), "proto-plus": ("https://proto-plus-python.readthedocs.io/en/latest/", None), "protobuf": ("https://googleapis.dev/python/protobuf/latest/", None), - "pandas": ('http://pandas.pydata.org/pandas-docs/dev', None), - "geopandas": ("https://geopandas.org/", None) + "pandas": ("http://pandas.pydata.org/pandas-docs/dev", None), + "geopandas": ("https://geopandas.org/", None), } From 1a977fb327594c64c9c78f6fd3dfae8f3bc563de Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Tue, 17 Aug 2021 18:07:12 +0000 Subject: [PATCH 31/40] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- docs/conf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 59a2d8fb3..09f7ea414 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -366,8 +366,6 @@ "grpc": ("https://grpc.github.io/grpc/python/", None), "proto-plus": ("https://proto-plus-python.readthedocs.io/en/latest/", None), "protobuf": ("https://googleapis.dev/python/protobuf/latest/", None), - "pandas": ("http://pandas.pydata.org/pandas-docs/dev", None), - "geopandas": ("https://geopandas.org/", None), } From 7dd9f5edddd1c277700aa0378ae902939b9d82a4 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 17 Aug 2021 14:23:48 -0400 Subject: [PATCH 32/40] tell owlbot about pandas and geopandas intersphinx --- docs/conf.py | 2 ++ owlbot.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 09f7ea414..59a2d8fb3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -366,6 +366,8 @@ "grpc": ("https://grpc.github.io/grpc/python/", None), "proto-plus": ("https://proto-plus-python.readthedocs.io/en/latest/", None), "protobuf": ("https://googleapis.dev/python/protobuf/latest/", None), + "pandas": 
("http://pandas.pydata.org/pandas-docs/dev", None), + "geopandas": ("https://geopandas.org/", None), } diff --git a/owlbot.py b/owlbot.py index 09845480a..ea9904cdb 100644 --- a/owlbot.py +++ b/owlbot.py @@ -97,6 +97,10 @@ samples=True, microgenerator=True, split_system_tests=True, + intersphinx_dependencies={ + "pandas": 'http://pandas.pydata.org/pandas-docs/dev', + "geopandas": "https://geopandas.org/", + } ) # BigQuery has a custom multiprocessing note From 3c257d9940f7f741826ab20f97d7fda160a034ef Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 17 Aug 2021 15:21:31 -0400 Subject: [PATCH 33/40] Explain the _to_wkb hijinks --- google/cloud/bigquery/_pandas_helpers.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 4ed9cd279..f72bf0731 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -35,6 +35,12 @@ if pandas is not None: # pragma: NO COVER def _to_wkb(): + # Create a closure that: + # - Adds a not-null check. This allows the returned function to + # be used directly with apply, unlike `shapely.wkb.dumps`. + # - Avoid extra work done by `shapely.wkb.dumps` that we don't need. + # - Caches the WKBWriter (and write method lookup :) ) + # - Avoids adding WKBWriter, lgeos, and notnull to the module namespace. 
from shapely.geos import WKBWriter, lgeos write = WKBWriter(lgeos).write From c73b85683372a322ad957b2af7fa2767f7fbd68b Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 17 Aug 2021 15:37:32 -0400 Subject: [PATCH 34/40] moar comments --- google/cloud/bigquery/_pandas_helpers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index f72bf0731..ab58b1729 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -28,8 +28,10 @@ import numpy try: + # _BaseGeometry is used to detect shapely objects in `bq_to_arrow_array` from shapely.geometry.base import BaseGeometry as _BaseGeometry except ImportError: # pragma: NO COVER + # No shapely, use NoneType for _BaseGeometry as a placeholder. _BaseGeometry = type(None) else: if pandas is not None: # pragma: NO COVER From 5988052944a034c1a2ecd586bcd6eaa32aa4e9e5 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 17 Aug 2021 16:24:26 -0400 Subject: [PATCH 35/40] Don't rely on string representations in assertions --- tests/system/test_pandas.py | 39 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 3a210b3a2..febb6c97f 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -801,7 +801,7 @@ def test_list_rows_max_results_w_bqstorage(bigquery_client): def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): - pytest.importorskip("shapely") + wkt = pytest.importorskip("shapely.wkt") bigquery_client.query( f"create table {dataset_id}.lake (name string, geog geography)" ).result() @@ -816,17 +816,15 @@ def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): df = bigquery_client.query( f"select * from {dataset_id}.lake order by name" ).to_dataframe(geography_as_object=True) - assert str(df) == ( - " name geog\n" - "0 bar POINT (0 
1)\n" - "1 baz NaN\n" - "2 foo POINT (0 0)" - ) - assert [v.__class__.__name__ for v in df["geog"]] == ["Point", "float", "Point"] + assert list(df['name']) == ['bar', 'baz', 'foo'] + assert df['geog'][0] == wkt.loads('point(0 1)') + assert pandas.isna(df['geog'][1]) + assert df['geog'][2] == wkt.loads('point(0 0)') def test_to_geodataframe(bigquery_client, dataset_id): - pytest.importorskip("geopandas") + geopandas = pytest.importorskip("geopandas") + from shapely import wkt bigquery_client.query( f"create table {dataset_id}.geolake (name string, geog geography)" ).result() @@ -834,26 +832,21 @@ def test_to_geodataframe(bigquery_client, dataset_id): f""" insert into {dataset_id}.geolake (name, geog) values ('foo', st_geogfromtext('point(0 0)')), - ('bar', st_geogfromtext('polygon((0 0, 1 1, 1 0, 0 0))')), + ('bar', st_geogfromtext('polygon((0 0, 1 0, 1 1, 0 0))')), ('baz', null) """ ).result() df = bigquery_client.query( f"select * from {dataset_id}.geolake order by name" ).to_geodataframe() - assert [v.__class__.__name__ for v in df["geog"]] == [ - "Polygon", - "NoneType", - "Point", - ] - assert df.__class__.__name__ == "GeoDataFrame" - assert df["geog"].__class__.__name__ == "GeoSeries" - assert list(map(str, df["geog"])) == [ - "POLYGON ((1 0, 1 1, 0 0, 1 0))", - "None", - "POINT (0 0)", - ] - assert str(df.area) == ("0 0.5\n" "1 NaN\n" "2 0.0\n" "dtype: float64") + assert df["geog"][0] == wkt.loads('polygon((0 0, 1 0, 1 1, 0 0))') + assert pandas.isna(df['geog'][1]) + assert df["geog"][2] == wkt.loads('point(0 0)') + assert isinstance(df, geopandas.GeoDataFrame) + assert isinstance(df["geog"], geopandas.GeoSeries) + assert df.area[0] == 0.5 + assert pandas.isna(df.area[1]) + assert df.area[2] == 0.0 assert df.crs.srs == "EPSG:4326" assert df.crs.name == "WGS 84" assert df.geog.crs.srs == "EPSG:4326" From 3207e578638766da7e3fa48ea0be72db7ab39fc7 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 17 Aug 2021 17:26:37 -0400 Subject: [PATCH 36/40] move 
to_geodataframe snippet to samples --- docs/snippets.py | 24 ------------- docs/usage/pandas.rst | 2 +- samples/geography/requirements.txt | 44 +++++++++++++++++++++++ samples/geography/to_geodataframe.py | 32 +++++++++++++++++ samples/geography/to_geodataframe_test.py | 25 +++++++++++++ 5 files changed, 102 insertions(+), 25 deletions(-) create mode 100644 samples/geography/to_geodataframe.py create mode 100644 samples/geography/to_geodataframe_test.py diff --git a/docs/snippets.py b/docs/snippets.py index dd1f07e80..c62001fc0 100644 --- a/docs/snippets.py +++ b/docs/snippets.py @@ -30,10 +30,6 @@ import pandas except (ImportError, AttributeError): pandas = None -try: - import geopandas -except (ImportError, AttributeError): - geopandas = None try: import pyarrow except (ImportError, AttributeError): @@ -939,25 +935,5 @@ def test_list_rows_as_dataframe(client): assert len(df) == table.num_rows # verify the number of rows -@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`") -def test_query_results_as_geodataframe(client): - # [START bigquery_query_results_geodataframe] - # from google.cloud import bigquery - # client = bigquery.Client() - - sql = """ - SELECT created_date, complaint_description, - ST_GEOGPOINT(longitude, latitude) as location - FROM bigquery-public-data.austin_311.311_service_requests - LIMIT 10 - """ - - df = client.query(sql).to_geodataframe() - # [END bigquery_query_results_geodataframe] - assert isinstance(df, geopandas.GeoDataFrame) - assert len(list(df)) == 3 # verify the number of columns - assert len(df) == 10 # verify the number of rows - - if __name__ == "__main__": pytest.main() diff --git a/docs/usage/pandas.rst b/docs/usage/pandas.rst index 658ee627c..92eee67cf 100644 --- a/docs/usage/pandas.rst +++ b/docs/usage/pandas.rst @@ -45,7 +45,7 @@ Retrieve BigQuery GEOGRAPHY data as a GeoPandas GeoDataFrame capabilities to Pandas. To retrieve query results containing GEOGRAPHY data as a :class:`geopandas.GeoDataFrame`: -.. 
literalinclude:: ../snippets.py +.. literalinclude:: ../samples/geography/to_geodataframe.py :language: python :dedent: 4 :start-after: [START bigquery_query_results_geodataframe] diff --git a/samples/geography/requirements.txt b/samples/geography/requirements.txt index dfee339d4..18a0e921f 100644 --- a/samples/geography/requirements.txt +++ b/samples/geography/requirements.txt @@ -1,4 +1,48 @@ +attrs==21.2.0 +cachetools==4.2.2 +certifi==2021.5.30 +cffi==1.14.6 +charset-normalizer==2.0.4 +click==8.0.1 +click-plugins==1.1.1 +cligj==0.7.2 +dataclasses==0.8 +Fiona==1.8.20 geojson==2.5.0 +geopandas==0.9.0 +google-api-core==1.31.2 +google-auth==1.35.0 google-cloud-bigquery==2.24.0 google-cloud-bigquery-storage==2.6.3 +google-cloud-core==1.7.2 +google-crc32c==1.1.2 +google-resumable-media==1.3.3 +googleapis-common-protos==1.53.0 +grpcio==1.39.0 +idna==3.2 +importlib-metadata==4.6.4 +libcst==0.3.20 +munch==2.5.0 +mypy-extensions==0.4.3 +numpy==1.19.5 +packaging==21.0 +pandas==1.1.5 +proto-plus==1.19.0 +protobuf==3.17.3 +pyarrow==5.0.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pycparser==2.20 +pyparsing==2.4.7 +pyproj==3.0.1 +python-dateutil==2.8.2 +pytz==2021.1 +PyYAML==5.4.1 +requests==2.26.0 +rsa==4.7.2 Shapely==1.7.1 +six==1.16.0 +typing-extensions==3.10.0.0 +typing-inspect==0.7.1 +urllib3==1.26.6 +zipp==3.5.0 diff --git a/samples/geography/to_geodataframe.py b/samples/geography/to_geodataframe.py new file mode 100644 index 000000000..fa8073fef --- /dev/null +++ b/samples/geography/to_geodataframe.py @@ -0,0 +1,32 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from google.cloud import bigquery + +client = bigquery.Client() + + +def get_austin_service_requests_as_geography(): + # [START bigquery_query_results_geodataframe] + + sql = """ + SELECT created_date, complaint_description, + ST_GEOGPOINT(longitude, latitude) as location + FROM bigquery-public-data.austin_311.311_service_requests + LIMIT 10 + """ + + df = client.query(sql).to_geodataframe() + # [END bigquery_query_results_geodataframe] + return df diff --git a/samples/geography/to_geodataframe_test.py b/samples/geography/to_geodataframe_test.py new file mode 100644 index 000000000..7a2ba6937 --- /dev/null +++ b/samples/geography/to_geodataframe_test.py @@ -0,0 +1,25 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from .to_geodataframe import get_austin_service_requests_as_geography + + +def test_get_austin_service_requests_as_geography(): + geopandas = pytest.importorskip("geopandas") + df = get_austin_service_requests_as_geography() + assert isinstance(df, geopandas.GeoDataFrame) + assert len(list(df)) == 3 # verify the number of columns + assert len(df) == 10 # verify the number of rows From 27071a09546a81079e85ac7375c204e80a93f837 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Tue, 17 Aug 2021 17:33:30 -0400 Subject: [PATCH 37/40] Removed pointless importskips --- tests/system/test_pandas.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index febb6c97f..fadf6d684 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -886,7 +886,6 @@ def test_load_geodataframe(bigquery_client, dataset_id): def test_load_dataframe_w_shapely(bigquery_client, dataset_id): - pandas = pytest.importorskip("pandas") wkt = pytest.importorskip("shapely.wkt") from google.cloud.bigquery.schema import SchemaField @@ -917,7 +916,6 @@ def test_load_dataframe_w_shapely(bigquery_client, dataset_id): def test_load_dataframe_w_wkb(bigquery_client, dataset_id): - pandas = pytest.importorskip("pandas") wkt = pytest.importorskip("shapely.wkt") from shapely import wkb from google.cloud.bigquery.schema import SchemaField From abf473e6c2539e48960c876d60e49bb1d0459197 Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Wed, 18 Aug 2021 08:29:47 -0400 Subject: [PATCH 38/40] blacken --- tests/system/test_pandas.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index fadf6d684..836f93210 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -816,15 +816,16 @@ def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): df = bigquery_client.query( f"select * from {dataset_id}.lake order 
by name" ).to_dataframe(geography_as_object=True) - assert list(df['name']) == ['bar', 'baz', 'foo'] - assert df['geog'][0] == wkt.loads('point(0 1)') - assert pandas.isna(df['geog'][1]) - assert df['geog'][2] == wkt.loads('point(0 0)') + assert list(df["name"]) == ["bar", "baz", "foo"] + assert df["geog"][0] == wkt.loads("point(0 1)") + assert pandas.isna(df["geog"][1]) + assert df["geog"][2] == wkt.loads("point(0 0)") def test_to_geodataframe(bigquery_client, dataset_id): geopandas = pytest.importorskip("geopandas") from shapely import wkt + bigquery_client.query( f"create table {dataset_id}.geolake (name string, geog geography)" ).result() @@ -839,9 +840,9 @@ def test_to_geodataframe(bigquery_client, dataset_id): df = bigquery_client.query( f"select * from {dataset_id}.geolake order by name" ).to_geodataframe() - assert df["geog"][0] == wkt.loads('polygon((0 0, 1 0, 1 1, 0 0))') - assert pandas.isna(df['geog'][1]) - assert df["geog"][2] == wkt.loads('point(0 0)') + assert df["geog"][0] == wkt.loads("polygon((0 0, 1 0, 1 1, 0 0))") + assert pandas.isna(df["geog"][1]) + assert df["geog"][2] == wkt.loads("point(0 0)") assert isinstance(df, geopandas.GeoDataFrame) assert isinstance(df["geog"], geopandas.GeoSeries) assert df.area[0] == 0.5 From 88b0653ec9a6b46b9ffc4944a467572d11d4adde Mon Sep 17 00:00:00 2001 From: Jim Fulton Date: Wed, 18 Aug 2021 08:34:09 -0400 Subject: [PATCH 39/40] use older dataclasses to try to deal with pip error in CI --- samples/geography/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/geography/requirements.txt b/samples/geography/requirements.txt index 18a0e921f..b1e1f3b75 100644 --- a/samples/geography/requirements.txt +++ b/samples/geography/requirements.txt @@ -6,7 +6,7 @@ charset-normalizer==2.0.4 click==8.0.1 click-plugins==1.1.1 cligj==0.7.2 -dataclasses==0.8 +dataclasses==0.6 Fiona==1.8.20 geojson==2.5.0 geopandas==0.9.0 From 1b6b1d76b08061b6b3128c201f43668b62251e9c Mon Sep 17 00:00:00 2001 
From: Jim Fulton Date: Mon, 23 Aug 2021 14:59:19 -0600 Subject: [PATCH 40/40] Don't need dataclasses after Python 3.6 It's built in. --- samples/geography/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/geography/requirements.txt b/samples/geography/requirements.txt index 3b902da80..7a76b4033 100644 --- a/samples/geography/requirements.txt +++ b/samples/geography/requirements.txt @@ -6,7 +6,7 @@ charset-normalizer==2.0.4 click==8.0.1 click-plugins==1.1.1 cligj==0.7.2 -dataclasses==0.6 +dataclasses==0.6; python_version < '3.7' Fiona==1.8.20 geojson==2.5.0 geopandas==0.9.0