This problem got a little easier with the introduction of dynamic SQL to BigQuery scripting in 2020. Now we can build up a query string dynamically and execute it via EXECUTE IMMEDIATE.
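For example, EXECUTE IMMEDIATE accepts any statement as a string:

EXECUTE IMMEDIATE "SELECT 1 AS x";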
Something like this would do for most circumstances where all the datasets are in region-us:
DECLARE dataset_names ARRAY<STRING>;

-- Collect the name of every dataset in the region.
SET dataset_names = (
  SELECT ARRAY_AGG(SCHEMA_NAME)
  FROM `region-us.INFORMATION_SCHEMA.SCHEMATA`
);

-- Build a single query that UNIONs every dataset's __TABLES__ view, then run it.
EXECUTE IMMEDIATE (
  SELECT STRING_AGG(
    """
    SELECT project_id, dataset_id, table_id, row_count, size_bytes
    FROM `""" || s || """.__TABLES__`""",
    " UNION ALL ")
  FROM UNNEST(dataset_names) AS s);
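To make the mechanics concrete: for two hypothetical datasets named dataset_a and dataset_b, the string handed to EXECUTE IMMEDIATE would look something like this (whitespace aside):

SELECT project_id, dataset_id, table_id, row_count, size_bytes
FROM `dataset_a.__TABLES__`
UNION ALL
SELECT project_id, dataset_id, table_id, row_count, size_bytes
FROM `dataset_b.__TABLES__`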
If there are a large number of datasets, this may return a rate limit error when BigQuery tries to read all of the metadata concurrently.
If that happens, we can fall back on a "batched" approach that is a little harder to read and slower/less efficient, but still gets the job done:
DECLARE dataset_names ARRAY<STRING>;
DECLARE batch ARRAY<STRING>;
DECLARE batch_size INT64 DEFAULT 25;

-- Temp table to accumulate the results from each batch.
CREATE TEMP TABLE results (
  project_id STRING,
  dataset_id STRING,
  table_id STRING,
  row_count INT64,
  size_bytes INT64
);

SET dataset_names = (
  SELECT ARRAY_AGG(SCHEMA_NAME)
  FROM `region-us.INFORMATION_SCHEMA.SCHEMATA`
);

LOOP
  -- ARRAY_AGG returns NULL (not an empty array) once nothing is left,
  -- so guard with IFNULL to be sure the loop terminates.
  IF IFNULL(ARRAY_LENGTH(dataset_names), 0) < 1 THEN
    LEAVE;
  END IF;

  -- Take the next batch_size dataset names.
  SET batch = (
    SELECT ARRAY_AGG(d)
    FROM UNNEST(dataset_names) AS d WITH OFFSET i
    WHERE i < batch_size);

  -- Query this batch's __TABLES__ views and append the rows to results.
  EXECUTE IMMEDIATE (
    SELECT "INSERT INTO results "
      || STRING_AGG(
        """
        SELECT project_id, dataset_id, table_id, row_count, size_bytes
        FROM `""" || s || """.__TABLES__`""",
        " UNION ALL ")
    FROM UNNEST(batch) AS s);

  -- Remove the processed datasets from the to-do list.
  SET dataset_names = (
    SELECT ARRAY_AGG(d)
    FROM UNNEST(dataset_names) AS d
    WHERE d NOT IN (SELECT * FROM UNNEST(batch)));
END LOOP;
SELECT * FROM results;
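The batch_size of 25 is just a guess at a safe value rather than a documented limit, so if you still see rate limit errors, shrink it. There's also no clean-up step to worry about: the temp table only lives for the duration of the script.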