Last active
August 20, 2019 00:08
-
-
Save relud/93ac0ef849746b0ce74d8a56c79e4826 to your computer and use it in GitHub Desktop.
Export the client_probe_counts_v1 BigQuery table to Cloud Firestore via Dataproc/PySpark.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Provision an ephemeral Dataproc cluster and run the Firestore export job.
set -e

# Cluster name is overridable via the CLUSTER environment variable.
CLUSTER="${CLUSTER:-export-to-firestore}"

# Create the cluster with the Firestore client preinstalled via the
# pip-install init action; --max-idle tears the cluster down automatically
# after 10 minutes of inactivity so it is not left running.
# BUG FIX: the original was missing the line continuation after
# `clusters create "$CLUSTER"`, so the flags below ran as separate commands.
gcloud beta dataproc clusters create "$CLUSTER" \
    --max-idle=10m \
    --metadata='PIP_PACKAGES=google-cloud-firestore==1.3.0' \
    --initialization-actions gs://dataproc-initialization-actions/python/pip-install.sh

# Submit the export job with the Spark BigQuery connector on the classpath.
gcloud dataproc jobs submit pyspark export_to_firestore.py \
    --cluster="$CLUSTER" \
    --jars=gs://spark-lib/bigquery/spark-bigquery-latest.jar
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Read a table from the BigQuery Storage API and write to Cloud Firestore.""" | |
from pyspark.sql import SparkSession | |
from google.cloud import firestore | |
import hashlib | |
def batch_load(group):
    """Write one group of rows to Firestore in a single batched commit.

    ``group`` is a ``(group_index, rows)`` pair produced by ``groupByKey``;
    the upstream pipeline keys rows by ``index // 500`` so each group holds
    at most 500 rows (500 is Firestore's per-batch write limit).
    """
    # A fresh client per group: this runs on Spark executors, and a client
    # created on the driver cannot be serialized out to them.
    client = firestore.Client()
    batch = client.batch()
    for row in group[1]:
        doc = row.asDict()
        # Collection name is derived from channel/app_version; pop removes
        # those fields so they are not duplicated inside the document.
        collection = "{}-{}".format(doc.pop("channel"), doc.pop("app_version"))
        # Deterministic document id from the remaining dimension fields so
        # that re-running the export overwrites instead of duplicating.
        doc_id = hashlib.blake2b(
            "{metric}-{build_id}-{os}-{agg_type}".format(**doc).encode()
        ).hexdigest()
        # BUG FIX: original referenced FIRESTORE, which is defined nowhere
        # in this file (NameError at runtime); use the local client.
        batch.set(client.collection(collection).document(doc_id), doc)
    batch.commit()
# Run the export: read the aggregate table from BigQuery, normalize the
# column names, bucket the rows into groups of at most 500, and hand each
# group to batch_load for a batched Firestore write.
spark = SparkSession.builder.appName("export_to_firestore").getOrCreate()

frame = (
    spark.read.format("bigquery")
    .option("table", "telemetry.client_probe_counts_v1")
    .load()
    .selectExpr(
        "app_version",
        "channel",
        "metric",
        "app_build_id AS build_id",
        "os",
        "agg_type",
        "aggregates.key_value AS aggregates",
    )
    # Rows missing either collection-key field cannot be routed to a
    # Firestore collection, so drop them up front.
    .where("app_version IS NOT NULL AND channel IS NOT NULL")
)

# Number every row, key each row by index // 500 so a group never exceeds
# 500 documents (Firestore's batch-write limit), then load group by group.
indexed = frame.rdd.zipWithIndex()
keyed = indexed.map(lambda indexed_row: (indexed_row[1] // 500, indexed_row[0]))
keyed.groupByKey().foreach(batch_load)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment