fix!: use nullable Int64 and boolean dtypes in to_dataframe (#786)
To override this behavior, specify the types for the desired columns with the
`dtypes` argument.

BREAKING CHANGE: uses Int64 type by default to avoid loss-of-precision in results with large integer values
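
A minimal sketch of the new default behavior and of the override (the query and
column names are illustrative; the mapping is passed through the `dtypes`
argument shown in the `table.py` change below):

    from google.cloud import bigquery

    client = bigquery.Client()

    # New default: nullable INT64 / BOOL columns come back as "Int64" / "boolean".
    df = client.query("SELECT 1 AS n, TRUE AS flag").to_dataframe()

    # To restore the previous NumPy dtypes for specific columns, pass an explicit
    # mapping (NumPy's int64 cannot represent NULL values, so this only suits
    # columns without NULLs).
    df = client.query("SELECT 1 AS n, TRUE AS flag").to_dataframe(
        dtypes={"n": "int64", "flag": "bool"}
    )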

Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes https://issuetracker.google.com/144712110 🦕
Fixes #793
tswast committed Aug 16, 2021
1 parent 66014c3 commit dcd78c7
Showing 11 changed files with 340 additions and 39 deletions.
1 change: 1 addition & 0 deletions docs/conf.py
@@ -110,6 +110,7 @@
# directories to ignore when looking for source files.
exclude_patterns = [
"_build",
"**/.nox/**/*",
"samples/AUTHORING_GUIDE.md",
"samples/CONTRIBUTING.md",
"samples/snippets/README.rst",
27 changes: 24 additions & 3 deletions docs/usage/pandas.rst
@@ -14,12 +14,12 @@ First, ensure that the :mod:`pandas` library is installed by running:
pip install --upgrade pandas
Alternatively, you can install the BigQuery python client library with
Alternatively, you can install the BigQuery Python client library with
:mod:`pandas` by running:

.. code-block:: bash
pip install --upgrade google-cloud-bigquery[pandas]
pip install --upgrade 'google-cloud-bigquery[pandas]'
To retrieve query results as a :class:`pandas.DataFrame`:

@@ -37,6 +37,27 @@ To retrieve table rows as a :class:`pandas.DataFrame`:
:start-after: [START bigquery_list_rows_dataframe]
:end-before: [END bigquery_list_rows_dataframe]

The following data types are used when creating a pandas DataFrame.

.. list-table:: Pandas Data Type Mapping
:header-rows: 1

* - BigQuery
- pandas
- Notes
* - BOOL
- boolean
-
* - DATETIME
- datetime64[ns], object
- object is used when values fall outside the range representable by a pandas nanosecond-precision timestamp
* - FLOAT64
- float64
-
* - INT64
- Int64
-
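
As a small, self-contained illustration (plain :mod:`pandas`, no BigQuery call
involved), the nullable ``Int64`` and ``boolean`` dtypes store missing values as
``pandas.NA`` instead of coercing the column to ``float64``, so large integers
keep their exact value:

.. code-block:: python

   import pandas

   ints = pandas.Series([1, None, 2 ** 53 + 1], dtype="Int64")
   bools = pandas.Series([True, None, False], dtype="boolean")

   print(ints.dtype)   # Int64
   print(bools.dtype)  # boolean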

Load a Pandas DataFrame to a BigQuery Table
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -45,7 +66,7 @@ As of version 1.3.0, you can use the
to load data from a :class:`pandas.DataFrame` to a
:class:`~google.cloud.bigquery.table.Table`. To use this function, in addition
to :mod:`pandas`, you will need to install the :mod:`pyarrow` library. You can
install the BigQuery python client library with :mod:`pandas` and
install the BigQuery Python client library with :mod:`pandas` and
:mod:`pyarrow` by running:

.. code-block:: bash
37 changes: 32 additions & 5 deletions google/cloud/bigquery/_pandas_helpers.py
@@ -18,6 +18,7 @@
import functools
import logging
import queue
from typing import Dict, Sequence
import warnings

try:
@@ -42,15 +43,19 @@

_LOGGER = logging.getLogger(__name__)

_NO_BQSTORAGE_ERROR = (
"The google-cloud-bigquery-storage library is not installed, "
"please install google-cloud-bigquery-storage to use bqstorage features."
)

_PROGRESS_INTERVAL = 0.2 # Maximum time between download status checks, in seconds.

_MAX_QUEUE_SIZE_DEFAULT = object() # max queue size sentinel for BQ Storage downloads

# If you update the default dtypes, also update the docs at docs/usage/pandas.rst.
_BQ_TO_PANDAS_DTYPE_NULLSAFE = {
"BOOL": "boolean",
"BOOLEAN": "boolean",
"FLOAT": "float64",
"FLOAT64": "float64",
"INT64": "Int64",
"INTEGER": "Int64",
}
_PANDAS_DTYPE_TO_BQ = {
"bool": "BOOLEAN",
"datetime64[ns, UTC]": "TIMESTAMP",
@@ -217,6 +222,28 @@ def bq_to_arrow_schema(bq_schema):
return pyarrow.schema(arrow_fields)


def bq_schema_to_nullsafe_pandas_dtypes(
bq_schema: Sequence[schema.SchemaField],
) -> Dict[str, str]:
"""Return the default dtypes to use for columns in a BigQuery schema.
Only returns default dtypes that are safe for columns containing NULL values.
This includes Int64, which represents missing values as pandas.NA and does not
lose precision.
Returns:
A mapping from column names to pandas dtypes.
"""
dtypes = {}
for bq_field in bq_schema:
if bq_field.mode.upper() not in {"NULLABLE", "REQUIRED"}:
continue
field_type = bq_field.field_type.upper()
if field_type in _BQ_TO_PANDAS_DTYPE_NULLSAFE:
dtypes[bq_field.name] = _BQ_TO_PANDAS_DTYPE_NULLSAFE[field_type]
return dtypes
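
A hedged usage sketch of this helper (the module is internal to the client
library, so the import path and the field names below are illustrative only):

    from google.cloud.bigquery import SchemaField
    from google.cloud.bigquery import _pandas_helpers

    schema = [
        SchemaField("age", "INT64", mode="NULLABLE"),
        SchemaField("active", "BOOL", mode="REQUIRED"),
        SchemaField("scores", "FLOAT64", mode="REPEATED"),  # skipped: repeated fields get no default
        SchemaField("name", "STRING"),  # skipped: no null-safe default dtype for STRING
    ]

    dtypes = _pandas_helpers.bq_schema_to_nullsafe_pandas_dtypes(schema)
    # Expected, per the mapping above: {"age": "Int64", "active": "boolean"}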


def bq_to_arrow_array(series, bq_field):
arrow_type = bq_to_arrow_data_type(bq_field)

11 changes: 10 additions & 1 deletion google/cloud/bigquery/table.py
@@ -1933,6 +1933,13 @@ def to_dataframe(
bqstorage_client=bqstorage_client,
create_bqstorage_client=create_bqstorage_client,
)
default_dtypes = _pandas_helpers.bq_schema_to_nullsafe_pandas_dtypes(
self.schema
)

# Let the user-defined dtypes override the default ones.
# https://stackoverflow.com/a/26853961/101923
dtypes = {**default_dtypes, **dtypes}

# When converting timestamp values to nanosecond precision, the result
# can be out of pyarrow bounds. To avoid the error when converting to
@@ -1954,7 +1961,9 @@

extra_kwargs = {"timestamp_as_object": timestamp_as_object}

df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs)
df = record_batch.to_pandas(
date_as_object=date_as_object, integer_object_nulls=True, **extra_kwargs
)

for column in dtypes:
df[column] = pandas.Series(df[column], dtype=dtypes[column])
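
A minimal sketch of the merge semantics used here: when the same column appears
in both mappings, the later (user-supplied) entry wins, as described in the
linked StackOverflow answer. The column names are illustrative:

    default_dtypes = {"age": "Int64", "active": "boolean"}  # derived from the schema
    user_dtypes = {"age": "float64"}                         # passed by the caller
    dtypes = {**default_dtypes, **user_dtypes}
    # dtypes == {"age": "float64", "active": "boolean"}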
2 changes: 1 addition & 1 deletion setup.py
@@ -50,7 +50,7 @@
# Keep the no-op bqstorage extra for backward compatibility.
# See: https://github.com/googleapis/python-bigquery/issues/757
"bqstorage": [],
"pandas": ["pandas>=0.23.0"],
"pandas": ["pandas>=1.0.0"],
"tqdm": ["tqdm >= 4.7.4, <5.0.0dev"],
"opentelemetry": [
"opentelemetry-api >= 0.11b0",
2 changes: 1 addition & 1 deletion testing/constraints-3.6.txt
@@ -13,7 +13,7 @@ grpcio==1.38.1
opentelemetry-api==0.11b0
opentelemetry-instrumentation==0.11b0
opentelemetry-sdk==0.11b0
pandas==0.23.0
pandas==1.0.0
proto-plus==1.10.0
protobuf==3.12.0
pyarrow==3.0.0
5 changes: 1 addition & 4 deletions tests/system/test_arrow.py
@@ -14,12 +14,9 @@

"""System tests for Arrow connector."""

import pyarrow
import pytest

pyarrow = pytest.importorskip(
"pyarrow", minversion="3.0.0"
) # Needs decimal256 for BIGNUMERIC columns.


@pytest.mark.parametrize(
("max_results", "scalars_table_name"),
72 changes: 70 additions & 2 deletions tests/system/test_pandas.py
@@ -567,7 +567,7 @@ def test_query_results_to_dataframe(bigquery_client):
for _, row in df.iterrows():
for col in column_names:
# all the schema fields are nullable, so None is acceptable
if not row[col] is None:
if not pandas.isna(row[col]):
assert isinstance(row[col], exp_datatypes[col])


@@ -597,7 +597,7 @@ def test_query_results_to_dataframe_w_bqstorage(bigquery_client):
for index, row in df.iterrows():
for col in column_names:
# all the schema fields are nullable, so None is acceptable
if not row[col] is None:
if not pandas.isna(row[col]):
assert isinstance(row[col], exp_datatypes[col])


@@ -795,3 +795,71 @@ def test_list_rows_max_results_w_bqstorage(bigquery_client):
dataframe = row_iterator.to_dataframe(bqstorage_client=bqstorage_client)

assert len(dataframe.index) == 100


@pytest.mark.parametrize(
("max_results",), ((None,), (10,),) # Use BQ Storage API. # Use REST API.
)
def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table, max_results):
df = bigquery_client.list_rows(
scalars_table, max_results=max_results,
).to_dataframe()

assert df.dtypes["bool_col"].name == "boolean"
assert df.dtypes["datetime_col"].name == "datetime64[ns]"
assert df.dtypes["float64_col"].name == "float64"
assert df.dtypes["int64_col"].name == "Int64"
assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]"

# object is used by default, but we can use "datetime64[ns]" automatically
# when data is within the supported range.
# https://github.com/googleapis/python-bigquery/issues/861
assert df.dtypes["date_col"].name == "object"

# object is used by default, but we can use "timedelta64[ns]" automatically
# https://github.com/googleapis/python-bigquery/issues/862
assert df.dtypes["time_col"].name == "object"

# decimal.Decimal is used to avoid loss of precision.
assert df.dtypes["bignumeric_col"].name == "object"
assert df.dtypes["numeric_col"].name == "object"

# pandas uses Python string and bytes objects.
assert df.dtypes["bytes_col"].name == "object"
assert df.dtypes["string_col"].name == "object"


@pytest.mark.parametrize(
("max_results",), ((None,), (10,),) # Use BQ Storage API. # Use REST API.
)
def test_list_rows_nullable_scalars_extreme_dtypes(
bigquery_client, scalars_extreme_table, max_results
):
df = bigquery_client.list_rows(
scalars_extreme_table, max_results=max_results
).to_dataframe()

# Extreme values are out-of-bounds for pandas datetime64 values, which use
# nanosecond precision. Values before 1677-09-21 and after 2262-04-11 must
# be represented with object.
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations
assert df.dtypes["date_col"].name == "object"
assert df.dtypes["datetime_col"].name == "object"
assert df.dtypes["timestamp_col"].name == "object"

# These pandas dtypes can handle the same ranges as BigQuery.
assert df.dtypes["bool_col"].name == "boolean"
assert df.dtypes["float64_col"].name == "float64"
assert df.dtypes["int64_col"].name == "Int64"

# object is used by default, but we can use "timedelta64[ns]" automatically
# https://github.com/googleapis/python-bigquery/issues/862
assert df.dtypes["time_col"].name == "object"

# decimal.Decimal is used to avoid loss of precision.
assert df.dtypes["numeric_col"].name == "object"
assert df.dtypes["bignumeric_col"].name == "object"

# pandas uses Python string and bytes objects.
assert df.dtypes["bytes_col"].name == "object"
assert df.dtypes["string_col"].name == "object"
22 changes: 4 additions & 18 deletions tests/unit/job/test_query_pandas.py
@@ -20,11 +20,6 @@
import pyarrow
import pytest

try:
import pandas
except (ImportError, AttributeError): # pragma: NO COVER
pandas = None

from google.cloud import bigquery_storage

try:
@@ -36,6 +31,8 @@
from .helpers import _make_connection
from .helpers import _make_job_resource

pandas = pytest.importorskip("pandas")


@pytest.fixture
def table_read_options_kwarg():
@@ -78,7 +75,6 @@ def test__contains_order_by(query, expected):
assert not mut._contains_order_by(query)


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@pytest.mark.parametrize(
"query",
(
@@ -413,7 +409,6 @@ def test_to_arrow_w_tqdm_wo_query_plan():
result_patch_tqdm.assert_called()


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
def test_to_dataframe():
from google.cloud.bigquery.job import QueryJob as target_class

@@ -452,7 +447,6 @@ def test_to_dataframe():
assert list(df) == ["name", "age"] # verify the column names


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
def test_to_dataframe_ddl_query():
from google.cloud.bigquery.job import QueryJob as target_class

@@ -472,7 +466,6 @@ def test_to_dataframe_ddl_query():
assert len(df) == 0


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
def test_to_dataframe_bqstorage(table_read_options_kwarg):
from google.cloud.bigquery.job import QueryJob as target_class

@@ -522,7 +515,6 @@ def test_to_dataframe_bqstorage(table_read_options_kwarg):
)


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
def test_to_dataframe_bqstorage_no_pyarrow_compression():
from google.cloud.bigquery.job import QueryJob as target_class

@@ -565,7 +557,6 @@ def test_to_dataframe_bqstorage_no_pyarrow_compression():
)


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
def test_to_dataframe_column_dtypes():
from google.cloud.bigquery.job import QueryJob as target_class

@@ -617,15 +608,14 @@ def test_to_dataframe_column_dtypes():
assert list(df) == exp_columns # verify the column names

assert df.start_timestamp.dtype.name == "datetime64[ns, UTC]"
assert df.seconds.dtype.name == "int64"
assert df.seconds.dtype.name == "Int64"
assert df.miles.dtype.name == "float64"
assert df.km.dtype.name == "float16"
assert df.payment_type.dtype.name == "object"
assert df.complete.dtype.name == "bool"
assert df.complete.dtype.name == "boolean"
assert df.date.dtype.name == "object"


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
def test_to_dataframe_column_date_dtypes():
from google.cloud.bigquery.job import QueryJob as target_class

@@ -657,7 +647,6 @@ def test_to_dataframe_column_date_dtypes():
assert df.date.dtype.name == "datetime64[ns]"


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`")
@mock.patch("tqdm.tqdm")
def test_to_dataframe_with_progress_bar(tqdm_mock):
@@ -685,7 +674,6 @@ def test_to_dataframe_with_progress_bar(tqdm_mock):
tqdm_mock.assert_called()


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`")
def test_to_dataframe_w_tqdm_pending():
from google.cloud.bigquery import table
@@ -741,7 +729,6 @@ def test_to_dataframe_w_tqdm_pending():
)


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`")
def test_to_dataframe_w_tqdm():
from google.cloud.bigquery import table
@@ -801,7 +788,6 @@ def test_to_dataframe_w_tqdm():
)


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`")
def test_to_dataframe_w_tqdm_max_results():
from google.cloud.bigquery import table
