ingenieroariel · September 27, 2023 21:03
diff --git a/devenv.nix b/devenv.nix
 # devenv configuration: tools, helper scripts and shell hook for the
 # Open Buildings -> Arrow/H3 conversion workflow.
 { pkgs, ... }:

 {
   # git, the Google Cloud SDK (provides gsutil) and a Python 3.11
   # interpreter bundled with pyarrow, duckdb and h3.
   packages = with pkgs; [
     git
     google-cloud-sdk
     (python311.withPackages (ps: with ps; [
       pyarrow
       duckdb
       h3
     ]))
   ];

   scripts = {
     # google <dest>: rsync the level-4 gzip point files from the bucket into <dest>.
     google.exec = ''gsutil -m rsync -avhP gs://open-buildings-data/v3/points_s2_level_4_gzip $1'';
     # arrow <src> <dest>: run google_to_arrow.py on <src>/<dest>, timed, warnings silenced.
     arrow.exec = ''time python -W ignore google_to_arrow.py $1 $2'';
     # pstac: only echoes a fixed string.
     pstac.exec = ''echo "psych"'';
   };

   # Shell hook: print the git version.
   enterShell = ''
     git --version
   '';
 }
diff --git a/google_to_arrow.py b/google_to_arrow.py
 import os
 import sys
 import glob
 import subprocess
 import multiprocessing
 import pyarrow as pa
 import duckdb as db
 import h3.unstable.vect as vect


 def process_file(filename):
     """Convert one file of points into an Arrow file of H3 indexes.

     Reads the ``latitude`` and ``longitude`` columns of *filename* through
     DuckDB, indexes every point with ``geo_to_h3`` at resolution 15, derives
     the resolution-7 parent of each index, and writes both as ``uint64``
     columns (``h3_15`` and ``h3_7``) to ``<basename of filename>.arrow``
     inside the directory named by ``sys.argv[2]``. That directory is
     created when it does not exist yet.

     Args:
         filename: Path of the input file; it is embedded in the SQL text
             as a string literal.

     Returns:
         None. Progress is reported on stdout.
     """
     print(f"Processing {filename}")
     # The path is spliced into the query as a SQL string literal, so any
     # single quote in it must be doubled or the statement is malformed.
     quoted_path = filename.replace("'", "''")
     b = db.sql(f"SELECT latitude, longitude FROM '{quoted_path}'").arrow()
     lats = b["latitude"].combine_chunks().to_numpy()
     lons = b["longitude"].combine_chunks().to_numpy()
     h3_15 = vect.geo_to_h3(lats, lons, 15)
     h3_7 = vect.h3_to_parent(h3_15, 7)
     table = pa.table({
         'h3_15': pa.array(h3_15, type=pa.uint64()),
         'h3_7': pa.array(h3_7, type=pa.uint64()),
     })
     # The output directory comes from the command line, not from a parameter.
     out_dir = sys.argv[2]
     os.makedirs(out_dir, exist_ok=True)
     target_file = os.path.join(out_dir, os.path.basename(filename)) + ".arrow"

     with pa.OSFile(target_file, "wb") as sink:
         with pa.RecordBatchFileWriter(sink, table.schema) as writer:
             writer.write_table(table)
     print(f"Wrote {target_file}")

 if __name__ == '__main__':
     # Both arguments are needed up front: process_file reads sys.argv[2]
     # inside every worker, so a missing one would otherwise only surface there.
     if len(sys.argv) < 3:
         sys.exit(f"usage: {sys.argv[0]} <google_dir> <out_dir>")
     google_dir = sys.argv[1]
     print(f"Entering {google_dir}")
     # Keep two cores free, but never request fewer than one worker:
     # Pool raises ValueError for a process count below 1, which
     # cpu_count() - 2 yields on machines with two cores or fewer.
     worker_count = max(1, multiprocessing.cpu_count() - 2)
     with multiprocessing.Pool(worker_count) as processing_pool:
         processing_pool.map(process_file, glob.glob(f"{google_dir}/*.csv.gz"))
diff --git a/read.py b/read.py
 import duckdb as db
 import pandas as pd
 import pyarrow as pa
 from glob import glob

 # `glob` is imported as the function itself (`from glob import glob`), so it
 # is called directly; `glob.glob(...)` would raise AttributeError.
 file_paths = glob("/Users/x/data/points_arrow_h3_15_and_7/*")
 # types_mapper callable: maps each Arrow type to the pandas ArrowDtype wrapping it.
 tm = pd.ArrowDtype

 # Open and read each file, storing the tables in a list
 tables = []
 for path in file_paths:
     with pa.ipc.open_file(path) as f:
         # Keep only the h3_15 column before converting, so the other
         # columns are never materialized as pandas data.
         tables.append(f.read_all().select(["h3_15"]).to_pandas(types_mapper=tm))

 # Single frame holding the h3_15 column of every file.
 df = pd.concat(tables)
	# devenv configuration: tools, helper scripts and shell hook for the
	# Open Buildings -> Arrow/H3 conversion workflow.
	{ pkgs, ... }:

	{
	  # git, the Google Cloud SDK (provides gsutil) and a Python 3.11
	  # interpreter bundled with pyarrow, duckdb and h3.
	  packages = with pkgs; [
	    git
	    google-cloud-sdk
	    (python311.withPackages (ps: with ps; [
	      pyarrow
	      duckdb
	      h3
	    ]))
	  ];

	  scripts = {
	    # google <dest>: rsync the level-4 gzip point files from the bucket into <dest>.
	    google.exec = ''gsutil -m rsync -avhP gs://open-buildings-data/v3/points_s2_level_4_gzip $1'';
	    # arrow <src> <dest>: run google_to_arrow.py on <src>/<dest>, timed, warnings silenced.
	    arrow.exec = ''time python -W ignore google_to_arrow.py $1 $2'';
	    # pstac: only echoes a fixed string.
	    pstac.exec = ''echo "psych"'';
	  };

	  # Shell hook: print the git version.
	  enterShell = ''
	    git --version
	  '';
	}
	import os
	import sys
	import glob
	import subprocess
	import multiprocessing
	import pyarrow as pa
	import duckdb as db
	import h3.unstable.vect as vect


	def process_file(filename):
	    """Convert one file of points into an Arrow file of H3 indexes.

	    Reads the ``latitude`` and ``longitude`` columns of *filename* through
	    DuckDB, indexes every point with ``geo_to_h3`` at resolution 15, derives
	    the resolution-7 parent of each index, and writes both as ``uint64``
	    columns (``h3_15`` and ``h3_7``) to ``<basename of filename>.arrow``
	    inside the directory named by ``sys.argv[2]``. That directory is
	    created when it does not exist yet.

	    Args:
	        filename: Path of the input file; it is embedded in the SQL text
	            as a string literal.

	    Returns:
	        None. Progress is reported on stdout.
	    """
	    print(f"Processing {filename}")
	    # The path is spliced into the query as a SQL string literal, so any
	    # single quote in it must be doubled or the statement is malformed.
	    quoted_path = filename.replace("'", "''")
	    b = db.sql(f"SELECT latitude, longitude FROM '{quoted_path}'").arrow()
	    lats = b["latitude"].combine_chunks().to_numpy()
	    lons = b["longitude"].combine_chunks().to_numpy()
	    h3_15 = vect.geo_to_h3(lats, lons, 15)
	    h3_7 = vect.h3_to_parent(h3_15, 7)
	    table = pa.table({
	        'h3_15': pa.array(h3_15, type=pa.uint64()),
	        'h3_7': pa.array(h3_7, type=pa.uint64()),
	    })
	    # The output directory comes from the command line, not from a parameter.
	    out_dir = sys.argv[2]
	    os.makedirs(out_dir, exist_ok=True)
	    target_file = os.path.join(out_dir, os.path.basename(filename)) + ".arrow"

	    with pa.OSFile(target_file, "wb") as sink:
	        with pa.RecordBatchFileWriter(sink, table.schema) as writer:
	            writer.write_table(table)
	    print(f"Wrote {target_file}")

	if __name__ == '__main__':
	    # Both arguments are needed up front: process_file reads sys.argv[2]
	    # inside every worker, so a missing one would otherwise only surface there.
	    if len(sys.argv) < 3:
	        sys.exit(f"usage: {sys.argv[0]} <google_dir> <out_dir>")
	    google_dir = sys.argv[1]
	    print(f"Entering {google_dir}")
	    # Keep two cores free, but never request fewer than one worker:
	    # Pool raises ValueError for a process count below 1, which
	    # cpu_count() - 2 yields on machines with two cores or fewer.
	    worker_count = max(1, multiprocessing.cpu_count() - 2)
	    with multiprocessing.Pool(worker_count) as processing_pool:
	        processing_pool.map(process_file, glob.glob(f"{google_dir}/*.csv.gz"))
	import duckdb as db
	import pandas as pd
	import pyarrow as pa
	from glob import glob

	# `glob` is imported as the function itself (`from glob import glob`), so it
	# is called directly; `glob.glob(...)` would raise AttributeError.
	file_paths = glob("/Users/x/data/points_arrow_h3_15_and_7/*")
	# types_mapper callable: maps each Arrow type to the pandas ArrowDtype wrapping it.
	tm = pd.ArrowDtype

	# Open and read each file, storing the tables in a list
	tables = []
	for path in file_paths:
	    with pa.ipc.open_file(path) as f:
	        # Keep only the h3_15 column before converting, so the other
	        # columns are never materialized as pandas data.
	        tables.append(f.read_all().select(["h3_15"]).to_pandas(types_mapper=tm))

	# Single frame holding the h3_15 column of every file.
	df = pd.concat(tables)