From d44d7004f9a60e3ab4bbae8e2a5e6986915638bd Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Wed, 30 Jun 2021 16:13:50 -0500
Subject: [PATCH 1/6] docs: pandas DataFrame samples are more standalone

---
 samples/conftest.py                            | 23 +++++++
 samples/quickstart/quickstart_test.py          |  5 --
 samples/to_dataframe/main_test.py              |  9 ++-
 samples/to_dataframe/noxfile.py                |  2 +-
 samples/to_dataframe/read_query_results.py     | 47 +++++++++++++
 .../to_dataframe/read_query_results_test.py    | 21 ++++++
 samples/to_dataframe/read_table_bigquery.py    | 42 ++++++++++++
 .../to_dataframe/read_table_bigquery_test.py   | 21 ++++++
 samples/to_dataframe/read_table_bqstorage.py   | 68 +++++++++++++++++++
 .../to_dataframe/read_table_bqstorage_test.py  | 21 ++++++
 samples/to_dataframe/requirements.txt          |  8 +--
 11 files changed, 255 insertions(+), 12 deletions(-)
 create mode 100644 samples/conftest.py
 create mode 100644 samples/to_dataframe/read_query_results.py
 create mode 100644 samples/to_dataframe/read_query_results_test.py
 create mode 100644 samples/to_dataframe/read_table_bigquery.py
 create mode 100644 samples/to_dataframe/read_table_bigquery_test.py
 create mode 100644 samples/to_dataframe/read_table_bqstorage.py
 create mode 100644 samples/to_dataframe/read_table_bqstorage_test.py

diff --git a/samples/conftest.py b/samples/conftest.py
new file mode 100644
index 00000000..25b5afae
--- /dev/null
+++ b/samples/conftest.py
@@ -0,0 +1,23 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+
+
+@pytest.fixture(scope="session")
+def project_id():
+    return os.environ["GOOGLE_CLOUD_PROJECT"]
+
diff --git a/samples/quickstart/quickstart_test.py b/samples/quickstart/quickstart_test.py
index 23f3c350..0ae6848b 100644
--- a/samples/quickstart/quickstart_test.py
+++ b/samples/quickstart/quickstart_test.py
@@ -27,11 +27,6 @@ def now_millis():
     )
 
 
-@pytest.fixture()
-def project_id():
-    return os.environ["GOOGLE_CLOUD_PROJECT"]
-
-
 def test_quickstart_wo_snapshot(capsys, project_id):
     quickstart.main(project_id)
     out, _ = capsys.readouterr()
diff --git a/samples/to_dataframe/main_test.py b/samples/to_dataframe/main_test.py
index bda6d601..6e817f09 100644
--- a/samples/to_dataframe/main_test.py
+++ b/samples/to_dataframe/main_test.py
@@ -31,7 +31,10 @@ def clients():
     )
 
     # Make clients.
-    bqclient = bigquery.Client(credentials=credentials, project=your_project_id,)
+    bqclient = bigquery.Client(
+        credentials=credentials,
+        project=your_project_id,
+    )
     bqstorageclient = bigquery_storage.BigQueryReadClient(credentials=credentials)
     # [END bigquerystorage_pandas_tutorial_create_client]
     # [END bigquerystorage_pandas_tutorial_all]
@@ -124,7 +127,9 @@ def test_session_to_dataframe(capsys, clients):
         read_options=read_options,
     )
     read_session = bqstorageclient.create_read_session(
-        parent=parent, read_session=requested_session, max_stream_count=1,
+        parent=parent,
+        read_session=requested_session,
+        max_stream_count=1,
     )
 
     # This example reads from only a single stream. Read from multiple streams
diff --git a/samples/to_dataframe/noxfile.py b/samples/to_dataframe/noxfile.py
index 160fe728..b3c8658a 100644
--- a/samples/to_dataframe/noxfile.py
+++ b/samples/to_dataframe/noxfile.py
@@ -226,7 +226,7 @@ def py(session: nox.sessions.Session) -> None:
 
 
 def _get_repo_root() -> Optional[str]:
-    """ Returns the root folder of the project. """
+    """Returns the root folder of the project."""
     # Get root of this repository. Assume we don't have directories nested deeper than 10 items.
     p = Path(os.getcwd())
     for i in range(10):
diff --git a/samples/to_dataframe/read_query_results.py b/samples/to_dataframe/read_query_results.py
new file mode 100644
index 00000000..6551eff5
--- /dev/null
+++ b/samples/to_dataframe/read_query_results.py
@@ -0,0 +1,47 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def read_query_results():
+    # [START bigquerystorage_pandas_tutorial_read_query_results]
+    from google.cloud import bigquery
+
+    bqclient = bigquery.Client()
+
+    # Download query results.
+    query_string = """
+    SELECT
+    CONCAT(
+        'https://stackoverflow.com/questions/',
+        CAST(id as STRING)) as url,
+    view_count
+    FROM `bigquery-public-data.stackoverflow.posts_questions`
+    WHERE tags like '%google-bigquery%'
+    ORDER BY view_count DESC
+    """
+
+    dataframe = (
+        bqclient.query(query_string)
+        .result()
+        .to_dataframe(
+            # Optionally, explicitly request to use the BigQuery Storage API. As of
+            # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
+            # API is used by default.
+            create_bqstorage_client=True,
+        )
+    )
+    print(dataframe.head())
+    # [END bigquerystorage_pandas_tutorial_read_query_results
+
+    return dataframe
diff --git a/samples/to_dataframe/read_query_results_test.py b/samples/to_dataframe/read_query_results_test.py
new file mode 100644
index 00000000..55b55a08
--- /dev/null
+++ b/samples/to_dataframe/read_query_results_test.py
@@ -0,0 +1,21 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import read_query_results
+
+
+def test_read_query_results(capsys):
+    read_query_results.read_query_results()
+    out, _ = capsys.readouterr()
+    assert "stackoverflow" in out
diff --git a/samples/to_dataframe/read_table_bigquery.py b/samples/to_dataframe/read_table_bigquery.py
new file mode 100644
index 00000000..82d8879b
--- /dev/null
+++ b/samples/to_dataframe/read_table_bigquery.py
@@ -0,0 +1,42 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def read_table():
+    # [START bigquerystorage_pandas_tutorial_read_table]
+    from google.cloud import bigquery
+
+    bqclient = bigquery.Client()
+
+    # Download a table.
+    table = bigquery.TableReference.from_string(
+        "bigquery-public-data.utility_us.country_code_iso"
+    )
+    rows = bqclient.list_rows(
+        table,
+        selected_fields=[
+            bigquery.SchemaField("country_name", "STRING"),
+            bigquery.SchemaField("fips_code", "STRING"),
+        ],
+    )
+    dataframe = rows.to_dataframe(
+        # Optionally, explicitly request to use the BigQuery Storage API. As of
+        # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
+        # API is used by default.
+        create_bqstorage_client=True,
+    )
+    print(dataframe.head())
+    # [END bigquerystorage_pandas_tutorial_read_table]
+
+    return dataframe
diff --git a/samples/to_dataframe/read_table_bigquery_test.py b/samples/to_dataframe/read_table_bigquery_test.py
new file mode 100644
index 00000000..c8301857
--- /dev/null
+++ b/samples/to_dataframe/read_table_bigquery_test.py
@@ -0,0 +1,21 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import read_table_bigquery
+
+
+def test_read_table(capsys):
+    read_table_bigquery.read_table()
+    out, _ = capsys.readouterr()
+    assert "country_name" in out
diff --git a/samples/to_dataframe/read_table_bqstorage.py b/samples/to_dataframe/read_table_bqstorage.py
new file mode 100644
index 00000000..be3aac9e
--- /dev/null
+++ b/samples/to_dataframe/read_table_bqstorage.py
@@ -0,0 +1,68 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def read_table(your_project_id):
+    original_your_project_id = your_project_id
+    # [START bigquerystorage_pandas_tutorial_read_session]
+    your_project_id = "project-for-read-session"
+    # [END bigquerystorage_pandas_tutorial_read_session]
+    your_project_id = original_your_project_id
+
+    # [START bigquerystorage_pandas_tutorial_read_session]
+    from google.cloud import bigquery_storage
+    from google.cloud.bigquery_storage import types
+
+    bqstorageclient = bigquery_storage.BigQueryReadClient()
+
+    project_id = "bigquery-public-data"
+    dataset_id = "new_york_trees"
+    table_id = "tree_species"
+    table = f"projects/{project_id}/datasets/{dataset_id}/tables/{table_id}"
+
+    # Select columns to read with read options. If no read options are
+    # specified, the whole table is read.
+    read_options = types.ReadSession.TableReadOptions(
+        selected_fields=["species_common_name", "fall_color"]
+    )
+
+    parent = "projects/{}".format(your_project_id)
+
+    requested_session = types.ReadSession(
+        table=table,
+        # Avro is also supported, but the Arrow data format is optimized to
+        # work well with column-oriented data structures such as pandas
+        # DataFrames.
+        data_format=types.DataFormat.ARROW,
+        read_options=read_options,
+    )
+    read_session = bqstorageclient.create_read_session(
+        parent=parent,
+        read_session=requested_session,
+        max_stream_count=1,
+    )
+
+    # This example reads from only a single stream. Read from multiple streams
+    # to fetch data faster. Note that the session may not contain any streams
+    # if there are no rows to read.
+    stream = read_session.streams[0]
+    reader = bqstorageclient.read_rows(stream.name)
+
+    # Parse all Arrow blocks and create a dataframe. This call requires a
+    # session, because the session contains the schema for the row blocks.
+    dataframe = reader.to_dataframe(read_session)
+    print(dataframe.head())
+    # [END bigquerystorage_pandas_tutorial_read_session]
+
+    return dataframe
diff --git a/samples/to_dataframe/read_table_bqstorage_test.py b/samples/to_dataframe/read_table_bqstorage_test.py
new file mode 100644
index 00000000..cc093078
--- /dev/null
+++ b/samples/to_dataframe/read_table_bqstorage_test.py
@@ -0,0 +1,21 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import read_table_bqstorage
+
+
+def test_read_table(capsys, project_id):
+    read_table_bqstorage.read_table(your_project_id=project_id)
+    out, _ = capsys.readouterr()
+    assert "species_common_name" in out
diff --git a/samples/to_dataframe/requirements.txt b/samples/to_dataframe/requirements.txt
index 455e6894..2f8dc500 100644
--- a/samples/to_dataframe/requirements.txt
+++ b/samples/to_dataframe/requirements.txt
@@ -2,7 +2,7 @@ google-auth==1.32.0
 google-cloud-bigquery-storage==2.4.0
 google-cloud-bigquery==2.20.0
 pyarrow==4.0.1
-ipython==7.10.2; python_version > '3.0'
-ipython==5.9.0; python_version < '3.0'
-pandas==0.25.3; python_version > '3.0'
-pandas==0.24.2; python_version < '3.0'
+ipython==7.24.0; python_version > '3.6'
+ipython==7.16.1; python_version <= '3.6'
+pandas==1.2.5; python_version > '3.6'
+pandas==1.1.5; python_version <= '3.6'

From 94df06394019a86da87a5514de9b9cbaf0fbe703 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Wed, 30 Jun 2021 16:18:30 -0500
Subject: [PATCH 2/6] fix region tag

---
 samples/to_dataframe/read_query_results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/to_dataframe/read_query_results.py b/samples/to_dataframe/read_query_results.py
index 6551eff5..1660ee7b 100644
--- a/samples/to_dataframe/read_query_results.py
+++ b/samples/to_dataframe/read_query_results.py
@@ -42,6 +42,6 @@ def read_query_results():
         )
     )
     print(dataframe.head())
-    # [END bigquerystorage_pandas_tutorial_read_query_results
+    # [END bigquerystorage_pandas_tutorial_read_query_result]
 
     return dataframe

From d4221828874811cc8e211da1329ad8b2e85b89e0 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Wed, 30 Jun 2021 16:19:19 -0500
Subject: [PATCH 3/6] fix region tag

---
 samples/to_dataframe/read_query_results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/to_dataframe/read_query_results.py b/samples/to_dataframe/read_query_results.py
index 1660ee7b..45bae1ea 100644
--- a/samples/to_dataframe/read_query_results.py
+++ b/samples/to_dataframe/read_query_results.py
@@ -42,6 +42,6 @@ def read_query_results():
         )
     )
     print(dataframe.head())
-    # [END bigquerystorage_pandas_tutorial_read_query_result]
+    # [END bigquerystorage_pandas_tutorial_read_query_results]
 
     return dataframe

From 87ff7fac7d10062cd865828644b85ea0fe7ed047 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Wed, 30 Jun 2021 16:38:40 -0500
Subject: [PATCH 4/6] remove unused imports

---
 samples/quickstart/quickstart_test.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/samples/quickstart/quickstart_test.py b/samples/quickstart/quickstart_test.py
index 0ae6848b..8e1e0dfd 100644
--- a/samples/quickstart/quickstart_test.py
+++ b/samples/quickstart/quickstart_test.py
@@ -13,9 +13,6 @@
 # limitations under the License.
 
 import datetime
-import os
-
-import pytest
 
 from . import quickstart
 
From fdd8b6b3932e313ef7c559cf85285dba8a5e47d9 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 1 Jul 2021 09:53:10 -0500
Subject: [PATCH 5/6] blacken

---
 samples/conftest.py                          | 1 -
 samples/to_dataframe/main_test.py            | 9 ++-------
 samples/to_dataframe/read_table_bqstorage.py | 4 +---
 3 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/samples/conftest.py b/samples/conftest.py
index 25b5afae..92068ef5 100644
--- a/samples/conftest.py
+++ b/samples/conftest.py
@@ -20,4 +20,3 @@
 @pytest.fixture(scope="session")
 def project_id():
     return os.environ["GOOGLE_CLOUD_PROJECT"]
-
diff --git a/samples/to_dataframe/main_test.py b/samples/to_dataframe/main_test.py
index 6e817f09..bda6d601 100644
--- a/samples/to_dataframe/main_test.py
+++ b/samples/to_dataframe/main_test.py
@@ -31,10 +31,7 @@ def clients():
     )
 
     # Make clients.
-    bqclient = bigquery.Client(
-        credentials=credentials,
-        project=your_project_id,
-    )
+    bqclient = bigquery.Client(credentials=credentials, project=your_project_id,)
     bqstorageclient = bigquery_storage.BigQueryReadClient(credentials=credentials)
     # [END bigquerystorage_pandas_tutorial_create_client]
     # [END bigquerystorage_pandas_tutorial_all]
@@ -127,9 +124,7 @@ def test_session_to_dataframe(capsys, clients):
         read_options=read_options,
     )
     read_session = bqstorageclient.create_read_session(
-        parent=parent,
-        read_session=requested_session,
-        max_stream_count=1,
+        parent=parent, read_session=requested_session, max_stream_count=1,
     )
 
     # This example reads from only a single stream. Read from multiple streams
diff --git a/samples/to_dataframe/read_table_bqstorage.py b/samples/to_dataframe/read_table_bqstorage.py
index be3aac9e..63914ea0 100644
--- a/samples/to_dataframe/read_table_bqstorage.py
+++ b/samples/to_dataframe/read_table_bqstorage.py
@@ -48,9 +48,7 @@ def read_table(your_project_id):
         read_options=read_options,
     )
     read_session = bqstorageclient.create_read_session(
-        parent=parent,
-        read_session=requested_session,
-        max_stream_count=1,
+        parent=parent, read_session=requested_session, max_stream_count=1,
     )
 
     # This example reads from only a single stream. Read from multiple streams

From 0b7fc64f1a4136db50fc078af0826fa053de767c Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 9 Jul 2021 16:37:08 -0500
Subject: [PATCH 6/6] remove session from call to rows/to_dataframe

---
 samples/to_dataframe/read_table_bqstorage.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/samples/to_dataframe/read_table_bqstorage.py b/samples/to_dataframe/read_table_bqstorage.py
index 63914ea0..0a3ae777 100644
--- a/samples/to_dataframe/read_table_bqstorage.py
+++ b/samples/to_dataframe/read_table_bqstorage.py
@@ -23,6 +23,7 @@ def read_table(your_project_id):
     # [START bigquerystorage_pandas_tutorial_read_session]
     from google.cloud import bigquery_storage
     from google.cloud.bigquery_storage import types
+    import pandas
 
     bqstorageclient = bigquery_storage.BigQueryReadClient()
 
@@ -57,9 +58,11 @@ def read_table(your_project_id):
     stream = read_session.streams[0]
     reader = bqstorageclient.read_rows(stream.name)
 
-    # Parse all Arrow blocks and create a dataframe. This call requires a
-    # session, because the session contains the schema for the row blocks.
-    dataframe = reader.to_dataframe(read_session)
+    # Parse all Arrow blocks and create a dataframe.
+    frames = []
+    for message in reader.rows().pages:
+        frames.append(message.to_dataframe())
+    dataframe = pandas.concat(frames)
     print(dataframe.head())
     # [END bigquerystorage_pandas_tutorial_read_session]
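
A possible follow-up, not committed in this series: the finished read_table_bqstorage.py still reads a single stream, and its comment suggests reading from multiple streams to fetch data faster. The sketch below shows one way that could look. It reuses only calls that already appear in the patches (create_read_session, read_rows, rows().pages, to_dataframe, pandas.concat); the helper name read_streams_concurrently, the max_streams parameter, and the thread-pool approach are illustrative assumptions, not part of the committed sample.

# Sketch only -- not part of patches 1-6. Assumes the same public table and
# google-cloud-bigquery-storage 2.x API surface used in the samples above.
from concurrent import futures

import pandas
from google.cloud import bigquery_storage
from google.cloud.bigquery_storage import types


def read_streams_concurrently(your_project_id, max_streams=4):
    bqstorageclient = bigquery_storage.BigQueryReadClient()
    table = "projects/bigquery-public-data/datasets/new_york_trees/tables/tree_species"

    requested_session = types.ReadSession(
        table=table,
        data_format=types.DataFormat.ARROW,
        read_options=types.ReadSession.TableReadOptions(
            selected_fields=["species_common_name", "fall_color"]
        ),
    )
    read_session = bqstorageclient.create_read_session(
        parent="projects/{}".format(your_project_id),
        read_session=requested_session,
        max_stream_count=max_streams,
    )

    # The service may return fewer streams than requested, or none at all
    # when the table has no rows.
    if not read_session.streams:
        return pandas.DataFrame()

    def download_stream(stream):
        # Each stream is read independently; every page converts to a DataFrame.
        reader = bqstorageclient.read_rows(stream.name)
        page_frames = [page.to_dataframe() for page in reader.rows().pages]
        if not page_frames:
            return pandas.DataFrame()
        return pandas.concat(page_frames, ignore_index=True)

    # Download the streams in parallel threads and stitch the results together.
    with futures.ThreadPoolExecutor(max_workers=len(read_session.streams)) as pool:
        frames = list(pool.map(download_stream, read_session.streams))
    return pandas.concat(frames, ignore_index=True)

Called as read_streams_concurrently(project_id), this would return one DataFrame with the same columns as the single-stream sample, though row order is not guaranteed across streams.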