Skip to content

Instantly share code, notes, and snippets.

@bede
Last active May 26, 2023 15:49
Show Gist options
  • Save bede/c2cd27a12add680648fde39c427ae752 to your computer and use it in GitHub Desktop.
Save bede/c2cd27a12add680648fde39c427ae752 to your computer and use it in GitHub Desktop.
Pandera MWE – I want a single failure case when region_is_valid fails indicating the sample_name of the row that failed (cDNA-VOC-1-v4-1)
from io import StringIO
import pandas as pd
import pandera as pa
import pandera.extensions as extensions
from pandera.typing import Index, Series
# Sample upload: two rows keyed by sample_name, each with a country and region.
csv_string = """
sample_name,country,region
cDNA-VOC-1-v4-1,USA,Bretagne
cDNA-VOC-1-v4-2,USA,Texas
"""

# Bretagne is not in the USA, so the first row should fail the check.
# Keys are the accepted country values; each value lists that country's regions.
countries_subdivisions = {
    'USA': ['Texas'],
    'France': ['Bretagne'],
}

# Iterating a dict yields its keys, so this is the set of accepted countries.
COUNTRIES_ALPHA_3 = set(countries_subdivisions)

# Every region of every country, flattened into one set.
REGIONS = set().union(*countries_subdivisions.values())
@extensions.register_check_method()
def region_is_valid(df):
    """
    Check, row by row, that each region belongs to that row's country.

    A row passes when its ``region`` is missing (NaN) or is listed under its
    ``country`` in ``countries_subdivisions``. A row with a region whose
    country has no entry in that mapping fails the check, instead of raising
    ``TypeError`` from a membership test against ``None``.

    Args:
        df: DataFrame with ``country`` and ``region`` columns.

    Returns:
        The result of applying the row check along ``axis=1``: one boolean
        per row, ``True`` for rows that pass.
    """

    def validate_region(row):
        region = row["region"]
        # A missing region is always acceptable.
        if pd.isna(region):
            return True
        # Fall back to an empty tuple so an unknown (or NaN) country makes the
        # row invalid rather than crashing on `region in None`.
        return region in countries_subdivisions.get(row["country"], ())

    return df.apply(validate_region, axis=1)
# @extensions.register_check_method()
# def is_texas(df):
# return df['region'] == 'Texas'
class BaseSchema(pa.SchemaModel):
    """
    Schema for validating generic GPAS upload CSVs.

    The sample name is the frame's index; country and region are columns.
    """

    # Index: unique, non-null, restricted to letters, digits, '.', '_' and '-'.
    sample_name: Index[str] = pa.Field(
        nullable=False,
        unique=True,
        coerce=True,
        str_matches=r"^[A-Za-z0-9._-]+$",
    )

    # Required column; the value must be one of the known countries.
    country: Series[str] = pa.Field(
        nullable=False,
        coerce=True,
        isin=COUNTRIES_ALPHA_3,
    )

    # Optional column; when present, the value must be a known region.
    region: Series[str] = pa.Field(
        nullable=True,
        coerce=True,
        isin=REGIONS,
    )

    class Config:
        # Turns on the registered `region_is_valid` check for the whole frame;
        # the empty tuple means the check is given no extra arguments.
        region_is_valid = ()
        # is_texas = ()
def main():
    """Validate the sample CSV against BaseSchema and print any failure cases."""
    # The first CSV column (sample_name) becomes the index the schema expects.
    frame = pd.read_csv(StringIO(csv_string), index_col=0)
    try:
        # lazy=True: failures are reported together via SchemaErrors.
        BaseSchema.validate(frame, lazy=True)
    except pa.errors.SchemaErrors as err:
        print(err.failure_cases)


# Run the demonstration only when executed as a script.
if __name__ == '__main__':
    main()
@bede
Copy link
Author

bede commented Jun 3, 2022

This script outputs the following:

% python mwe.py
    schema_context   column            check  check_number failure_case            index
0  DataFrameSchema  country  region_is_valid             0          USA  cDNA-VOC-1-v4-1
1  DataFrameSchema   region  region_is_valid             0     Bretagne  cDNA-VOC-1-v4-1

I'd like to have a single failure case per failing row (here, one entry for cDNA-VOC-1-v4-1 instead of one per column) – is this possible?

@bede
Copy link
Author

bede commented Jun 3, 2022

Clearly I could do e.failure_cases.groupby('index')['check'].unique(), but perhaps there is a Right Way to do this : ) @cosmicBboy

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment