Last active
August 13, 2023 03:58
-
-
Save andycasey/1106d51aaec54633eedb451437fce2b7 to your computer and use it in GitHub Desktop.
This demonstrates a way to use bitfield flags to store all targeting information for SDSS-V
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This minimum reproducible example demonstrates a way to use bitfield flags to store the cartons | |
that a source is assigned to. | |
What does it do? | |
---------------- | |
1. Creates a `Source` table and inserts 10,000 random sources. | |
2. Creates some `SimplifiedCarton` entries based on unique names currently in targetdb. | |
3. Assigns sources to random cartons (many more than what would exist in reality). | |
4. Checks that when querying for sources based on a carton, it retrieves only the sources expected. | |
How does it do it? | |
------------------ | |
A single bitfield is used to store all the primary keys of the cartons that a source is assigned. | |
The minimum length of the bitfield flag used here is as small as 34 bytes (way less than some of | |
the file paths that are stored in the database). | |
What's bad about it? | |
-------------------- | |
The length of the bytearray varies per source, and is limited by the maximum primary key that a | |
carton is assigned. In this example I have created a `SimplifiedCarton` just to make things more | |
efficient, but this could in principle be used to map directly to the primary keys in | |
`targetdb.carton.pk`. There are many gaps in primary keys in `targetdb.carton`, which makes the | |
bitfield storage less efficient. | |
Storing the carton assignments in a bitfield flag makes it a little less easy to query (than | |
compared to a normal `peewee.BitField`), but I have included some hybrid methods to make it easy | |
to query by carton primary key. | |
Caveats | |
------- | |
This example works in PostgreSQL. You should be able to store the bitfield flags in SQLite, but | |
I think you won't be able to query with them in SQLite, because SQLite does not have as many | |
bitwise string operators available (I *think*; it would be great if we can do it in SQLite too). | |
""" | |
import numpy as np | |
from peewee import ( | |
fn, | |
AutoField, | |
FloatField, | |
TextField, | |
Model, | |
BigBitField, | |
PostgresqlDatabase, | |
) | |
from playhouse.hybrid import hybrid_method | |
database = PostgresqlDatabase(...) | |
class BaseModel(Model): | |
class Meta: | |
database = database | |
schema = None | |
legacy_table_names = False | |
class SimplifiedCarton(BaseModel): | |
""" | |
A simplified carton for use with targeting flags. | |
Simplified cartons have sanitised names, and do not track versions of cartons. | |
A simplified carton is considered unique by its 'sanitised' name. The 'sanitised' | |
name is where the dashes '-' are replaced with underscores '_'. For example, the | |
cartons with names 'bhm_aqmes_bonus_bright' and 'bhm_aqmes_bonus-bright' in | |
`targetdb.carton` are considered the same simplified carton with the name | |
'bhm_aqmes_bonus_bright'. | |
If a source was assigned to a carton in `targetdb` in some late version, then | |
that will not be tracked by this `SimplifiedCarton`. This `SimplifiedCarton` | |
should only track whether or not a source was assigned to a carton, not when. | |
""" | |
id = AutoField() | |
carton = TextField(unique=True, index=True) | |
class Source(BaseModel): | |
""" An astronomical source. """ | |
# Identifiers | |
id = AutoField() | |
# Astrometry | |
ra = FloatField() | |
dec = FloatField() | |
# Targeting | |
carton_0 = TextField(null=True) | |
carton_flags = BigBitField(null=True) | |
@hybrid_method | |
def is_in_simplified_carton(self, carton_id): | |
""" Return whether this source is assigned to the given (simplified) carton. """ | |
return ( | |
(fn.length(self.carton_flags) > int(carton_id / 8)) | |
& (fn.get_bit(self.carton_flags, carton_id) > 0) | |
) | |
@hybrid_method | |
def is_in_any_simplified_carton(self, *carton_ids): | |
return fn.OR(*[self.is_in_simplified_carton(carton_id) for carton_id in carton_ids]) | |
@property | |
def simplified_carton_ids(self): | |
""" Return the primary keys of the cartons that this source is assigned. """ | |
i, cartons, cur_size = (0, [], len(self.carton_flags._buffer)) | |
while True: | |
byte_num, byte_offset = divmod(i, 8) | |
if byte_num >= cur_size: | |
break | |
if bool(self.carton_flags._buffer[byte_num] & (1 << byte_offset)): | |
cartons.append(i) | |
i += 1 | |
return cartons | |
if __name__ == "__main__": | |
import numpy as np | |
from tqdm import tqdm | |
database.drop_tables([ | |
Source, | |
SimplifiedCarton | |
]) | |
database.create_tables([ | |
Source, | |
SimplifiedCarton, | |
]) | |
carton_names = ( | |
"bhm_aqmes_bonus_bright", | |
"bhm_aqmes_bonus-bright", | |
"bhm_aqmes_bonus_core", | |
"bhm_aqmes_bonus-dark", | |
"bhm_aqmes_bonus_faint", | |
"bhm_aqmes_med", | |
"bhm_aqmes_med_faint", | |
"bhm_aqmes_med-faint", | |
"bhm_aqmes_wide2", | |
"bhm_aqmes_wide2_faint", | |
"bhm_aqmes_wide2-faint", | |
"bhm_aqmes_wide3", | |
"bhm_aqmes_wide3-faint", | |
"bhm_colr_galaxies_lsdr10", | |
"bhm_colr_galaxies_lsdr8", | |
"bhm_csc_apogee", | |
"bhm_csc_boss", | |
"bhm_csc_boss_bright", | |
"bhm_csc_boss-bright", | |
"bhm_csc_boss_dark", | |
"bhm_csc_boss-dark", | |
"bhm_gua_bright", | |
"bhm_gua_dark", | |
"bhm_rm_ancillary", | |
"bhm_rm_core", | |
"bhm_rm_known_spec", | |
"bhm_rm_known-spec", | |
"bhm_rm_var", | |
"bhm_rm_xrayqso", | |
"bhm_spiders_agn-efeds", | |
"bhm_spiders_agn_efeds_stragglers", | |
"bhm_spiders_agn_gaiadr2", | |
"bhm_spiders_agn_gaiadr3", | |
"bhm_spiders_agn_hard", | |
"bhm_spiders_agn_lsdr10", | |
"bhm_spiders_agn_lsdr8", | |
"bhm_spiders_agn_ps1dr2", | |
"bhm_spiders_agn_sep", | |
"bhm_spiders_agn_skymapperdr2", | |
"bhm_spiders_agn_supercosmos", | |
"bhm_spiders_agn_tda", | |
"bhm_spiders_clusters-efeds-erosita", | |
"bhm_spiders_clusters-efeds-hsc-redmapper", | |
"bhm_spiders_clusters-efeds-ls-redmapper", | |
"bhm_spiders_clusters-efeds-sdss-redmapper", | |
"bhm_spiders_clusters_efeds_stragglers", | |
"bhm_spiders_clusters_lsdr10", | |
"bhm_spiders_clusters_lsdr8", | |
"bhm_spiders_clusters_ps1dr2", | |
"comm_pleiades", | |
"comm_spectrophoto", | |
"manual_bhm_spiders_comm", | |
"manual_bhm_spiders_comm_lco", | |
"manual_bright_target", | |
"manual_bright_target_offsets_1", | |
"manual_bright_target_offsets_1_g13", | |
"manual_bright_target_offsets_2", | |
"manual_bright_target_offsets_2_g13", | |
"manual_bright_target_offsets_3", | |
"manual_bright_targets", | |
"manual_bright_targets_g13", | |
"manual_bright_targets_g13_offset_fixed_1", | |
"manual_bright_targets_g13_offset_fixed_2", | |
"manual_bright_targets_g13_offset_fixed_3", | |
"manual_bright_targets_g13_offset_fixed_4", | |
"manual_bright_targets_g13_offset_fixed_5", | |
"manual_bright_targets_g13_offset_fixed_6", | |
"manual_bright_targets_g13_offset_fixed_7", | |
"manual_fps_position_stars", | |
"manual_fps_position_stars_10", | |
"manual_fps_position_stars_apogee_10", | |
"manual_fps_position_stars_lco_apogee_10", | |
"manual_mwm_crosscalib_apogee", | |
"manual_mwm_crosscalib_yso_apogee", | |
"manual_mwm_crosscalib_yso_boss", | |
"manual_mwm_halo_distant_bhb", | |
"manual_mwm_halo_distant_kgiant", | |
"manual_mwm_halo_mp_bbb", | |
"manual_mwm_magcloud_massive_apogee", | |
"manual_mwm_magcloud_massive_boss", | |
"manual_mwm_magcloud_symbiotic_apogee", | |
"manual_mwm_planet_ca_legacy_v1", | |
"manual_mwm_planet_gaia_astrometry_v1", | |
"manual_mwm_planet_gpi_v1", | |
"manual_mwm_planet_harps_v1", | |
"manual_mwm_planet_known_v1", | |
"manual_mwm_planet_sophie_v1", | |
"manual_mwm_planet_sphere_v1", | |
"manual_mwm_planet_tess_eb_v1", | |
"manual_mwm_planet_tess_pc_v1", | |
"manual_mwm_planet_transiting_bd_v1", | |
"manual_mwm_tess_ob", | |
"manual_mwm_validation_cool_apogee", | |
"manual_mwm_validation_cool_boss", | |
"manual_mwm_validation_hot_apogee", | |
"manual_mwm_validation_hot_boss", | |
"manual_mwm_validation_rv", | |
"manual_nsbh_apogee", | |
"manual_nsbh_boss", | |
"manual_offset_mwmhalo_off00", | |
"manual_offset_mwmhalo_off05", | |
"manual_offset_mwmhalo_off10", | |
"manual_offset_mwmhalo_off20", | |
"manual_offset_mwmhalo_off30", | |
"manual_offset_mwmhalo_offa", | |
"manual_offset_mwmhalo_offb", | |
"manual_planet_ca_legacy_v0", | |
"manual_planet_gaia_astrometry_v0", | |
"manual_planet_gpi_v0", | |
"manual_planet_harps_v0", | |
"manual_planet_known_v0", | |
"manual_planet_sophie_v0", | |
"manual_planet_sphere_v0", | |
"manual_planet_tess_eb_v0", | |
"manual_planet_tess_pc_v0", | |
"manual_planet_transiting_bd_v0", | |
"manual_validation_apogee", | |
"manual_validation_boss", | |
"manual_validation_cool_apogee", | |
"manual_validation_cool_boss", | |
"manual_validation_rv", | |
"mwm_bin_rv_long", | |
"mwm_bin_rv_short", | |
"mwm_cb_300pc", | |
"mwm_cb_300pc_apogee", | |
"mwm_cb_300pc_boss", | |
"mwm_cb_cvcandidates", | |
"mwm_cb_cvcandidates_apogee", | |
"mwm_cb_cvcandidates_boss", | |
"mwm_cb_gaiagalex", | |
"mwm_cb_gaiagalex_apogee", | |
"mwm_cb_gaiagalex_boss", | |
"mwm_cb_uvex1", | |
"mwm_cb_uvex2", | |
"mwm_cb_uvex3", | |
"mwm_cb_uvex4", | |
"mwm_cb_uvex5", | |
"mwm_dust_core", | |
"mwm_erosita_compact", | |
"mwm_erosita_compact_deep", | |
"mwm_erosita_compact_gen", | |
"mwm_erosita_compact_var", | |
"mwm_erosita_stars", | |
"mwm_galactic_core", | |
"mwm_galactic_core_dist", | |
"mwm_gg_core", | |
"mwm_halo_bb", | |
"mwm_halo_bb_apogee", | |
"mwm_halo_bb_boss", | |
"mwm_halo_sm", | |
"mwm_halo_sm_apogee", | |
"mwm_halo_sm_boss", | |
"mwm_legacy_ir2opt", | |
"mwm_ob_cepheids", | |
"mwm_ob_core", | |
"mwm_planet_tess", | |
"mwm_rv_long_bplates", | |
"mwm_rv_long-bplates", | |
"mwm_rv_long_fps", | |
"mwm_rv_long-fps", | |
"mwm_rv_long_rm", | |
"mwm_rv_long-rm", | |
"mwm_rv_short_bplates", | |
"mwm_rv_short-bplates", | |
"mwm_rv_short_fps", | |
"mwm_rv_short-fps", | |
"mwm_rv_short_rm", | |
"mwm_rv_short-rm", | |
"mwm_snc_100pc", | |
"mwm_snc_100pc_apogee", | |
"mwm_snc_100pc_boss", | |
"mwm_snc_250pc", | |
"mwm_snc_250pc_apogee", | |
"mwm_snc_250pc_boss", | |
"mwm_tess_2min", | |
"mwm_tess_ob", | |
"mwm_tess_planet", | |
"mwm_tess_rgb", | |
"mwm_tessrgb_core", | |
"mwm_wd_core", | |
"mwm_wd_pwd", | |
"mwm_yso_cluster", | |
"mwm_yso_cluster_apogee", | |
"mwm_yso_cluster_boss", | |
"mwm_yso_cmz", | |
"mwm_yso_cmz_apogee", | |
"mwm_yso_disk_apogee", | |
"mwm_yso_disk_boss", | |
"mwm_yso_embedded_apogee", | |
"mwm_yso_nebula_apogee", | |
"mwm_yso_ob", | |
"mwm_yso_ob_apogee", | |
"mwm_yso_ob_boss", | |
"mwm_yso_pms_apogee", | |
"mwm_yso_pms_apogee_sagitta_edr3", | |
"mwm_yso_pms_apogee_zari18pms", | |
"mwm_yso_pms_boss", | |
"mwm_yso_pms_boss_sagitta_edr3", | |
"mwm_yso_pms_boss_zari18pms", | |
"mwm_yso_s1", | |
"mwm_yso_s2", | |
"mwm_yso_s2-5", | |
"mwm_yso_s3", | |
"mwm_yso_variable_apogee", | |
"mwm_yso_variable_boss", | |
"openfiberstargets_test", | |
"openfibertargets_nov2020_10", | |
"openfibertargets_nov2020_1000", | |
"openfibertargets_nov2020_1001a", | |
"openfibertargets_nov2020_1001b", | |
"openfibertargets_nov2020_11", | |
"openfibertargets_nov2020_12", | |
"openfibertargets_nov2020_14", | |
"openfibertargets_nov2020_15", | |
"openfibertargets_nov2020_17", | |
"openfibertargets_nov2020_18", | |
"openfibertargets_nov2020_19a", | |
"openfibertargets_nov2020_19b", | |
"openfibertargets_nov2020_19c", | |
"openfibertargets_nov2020_22", | |
"openfibertargets_nov2020_24", | |
"openfibertargets_nov2020_25", | |
"openfibertargets_nov2020_26", | |
"openfibertargets_nov2020_27", | |
"openfibertargets_nov2020_28a", | |
"openfibertargets_nov2020_28b", | |
"openfibertargets_nov2020_28c", | |
"openfibertargets_nov2020_29", | |
"openfibertargets_nov2020_3", | |
"openfibertargets_nov2020_30", | |
"openfibertargets_nov2020_31", | |
"openfibertargets_nov2020_32", | |
"openfibertargets_nov2020_33", | |
"openfibertargets_nov2020_34a", | |
"openfibertargets_nov2020_34b", | |
"openfibertargets_nov2020_35a", | |
"openfibertargets_nov2020_35b", | |
"openfibertargets_nov2020_35c", | |
"openfibertargets_nov2020_46", | |
"openfibertargets_nov2020_47a", | |
"openfibertargets_nov2020_47b", | |
"openfibertargets_nov2020_47c", | |
"openfibertargets_nov2020_47d", | |
"openfibertargets_nov2020_47e", | |
"openfibertargets_nov2020_5", | |
"openfibertargets_nov2020_6a", | |
"openfibertargets_nov2020_6b", | |
"openfibertargets_nov2020_6c", | |
"openfibertargets_nov2020_8", | |
"openfibertargets_nov2020_9", | |
"ops_2mass_psc_brightneighbors", | |
"ops_apogee_stds", | |
"ops_gaia_brightneighbors", | |
"ops_sky_apogee", | |
"ops_sky_apogee_best", | |
"ops_sky_apogee_good", | |
"ops_sky_boss", | |
"ops_sky_boss_best", | |
"ops_sky_boss_fallback", | |
"ops_sky_boss_good", | |
"ops_std_apogee", | |
"ops_std_boss", | |
"ops_std_boss_gdr2", | |
"ops_std_boss_lsdr10", | |
"ops_std_boss_lsdr8", | |
"ops_std_boss_ps1dr2", | |
"ops_std_boss_red", | |
"ops_std_boss-red", | |
"ops_std_boss_tic", | |
"ops_std_eboss", | |
"ops_tycho2_brightneighbors" | |
) | |
cartons = [SimplifiedCarton(carton=carton_name) for carton_name in carton_names] | |
with database.atomic(): | |
( | |
SimplifiedCarton | |
.bulk_create(cartons) | |
) | |
np.random.seed(0) | |
C = len(cartons) | |
N = 10_000 | |
N_cartons_assigned_to = np.random.randint(0, C, size=N) | |
print("Preparing sources") | |
assignments = {} | |
sources = [] | |
for i in tqdm(range(N)): | |
ra, dec = np.random.uniform(size=2) | |
source = Source.create(ra=ra, dec=dec) | |
carton_assignments = np.random.choice(cartons, size=N_cartons_assigned_to[i], replace=False) | |
for carton in carton_assignments: | |
source.carton_flags.set_bit(carton.id) | |
assignments.setdefault(carton.id, []) | |
assignments[carton.id].append(source.id) | |
source.save() | |
sources.append(source) | |
for carton_id, expected_source_ids in tqdm(assignments.items(), desc="Checking.."): | |
q = ( | |
Source | |
.select(Source.id) | |
.where(Source.is_in_simplified_carton(carton_id)) | |
.tuples() | |
) | |
actual_source_ids = [source_id for source_id, in q] | |
diff = set(expected_source_ids).symmetric_difference(actual_source_ids) | |
assert len(diff) == 0 | |
print("Done") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment