Redshift Metadata

Redshift Tables Used

pg_table_def, stl_query, stl_querytext, stl_tr_conflict, stl_explain, stl_alert_event_log, stl_ddltext, stl_scan, stl_save, stl_hashjoin, stl_hash, stl_plan_info, stl_return, and information_schema.table_constraints.

Queries to Extract Features

In the snippets below, QUERY_ID, TABLE_ID, TABLE_NAME, QUERY_START_TIME, and QUERY_END_TIME are placeholders to be bound for the query under analysis.

  • execution time
-- subtracting timestamps yields an interval, not milliseconds; use datediff
select datediff(ms, starttime, endtime) as execution_time_in_ms 
from stl_query 
where query = QUERY_ID;
  • from tables
select replace(regexp_substr(sql_body, 'from\\s+[^\\(\\s]+'), 'from ', '') 
from (select listagg(text) within group (order by sequence) as sql_body 
      from stl_querytext 
      where query = QUERY_ID) as q;
  • joined in tables
-- note: regexp_substr returns only the first match; step the fourth
-- (occurrence) argument to pull subsequent joins
select distinct replace(regexp_substr(sql_body, 'join\\s+[^\\(\\s]+'), 'join ', '') 
from (select listagg(text) within group (order by sequence) as sql_body 
      from stl_querytext 
      where query = QUERY_ID) as q;
  • from table size (gb)
-- each stv_blocklist entry is one 1 MB block, so block count / 1024 = GB
select count(tbl) / 1024.0 as gb 
from (select distinct datname
        , id
        , name 
      from stv_tbl_perm 
      join pg_database 
      on pg_database.oid = db_id) as t 
join stv_blocklist 
on tbl = t.id 
where name = TABLE_NAME;
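An alternative sketch, assuming the cluster exposes svv_table_info (its size column is reported in 1 MB blocks):
select size / 1024.0 as gb
from svv_table_info
where "table" = TABLE_NAME;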
  • filters (this needs a lot of work)
select distinct replace(regexp_substr(sql_body, '(where\\s+[^\\(]+\\s+)[^(\\s+group\\s+by|order\\s+by|having|limit)]+'), 'where ', '') 
from (select listagg(text) within group (order by sequence) as sql_body 
      from stl_querytext 
      where query = QUERY_ID) as q;
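A rough alternative sketch: grab everything after where, then cut at the first terminating clause with regexp_instr (mixed case and subqueries are still unhandled):
select case
         when regexp_instr(where_part, 'group by|order by|having|limit') > 0
         then substring(where_part, 1, regexp_instr(where_part, 'group by|order by|having|limit') - 1)
         else where_part
       end as filters
from (select replace(regexp_substr(sql_body, 'where\\s+.+'), 'where ', '') as where_part
      from (select listagg(text) within group (order by sequence) as sql_body
            from stl_querytext
            where query = QUERY_ID) as q) as w;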
  • from table locks
select count(*) 
from stl_tr_conflict 
where table_id = TABLE_ID 
and xact_start_ts >= QUERY_START_TIME 
and abort_time <= QUERY_END_TIME;
  • number of concurrent queries
select count(*) 
from stl_query 
where starttime >= QUERY_START_TIME 
and endtime <= QUERY_END_TIME;
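As written, this only counts queries that ran entirely inside the window; a sketch that instead counts any query whose execution overlaps it:
select count(*)
from stl_query
where starttime <= QUERY_END_TIME
and endtime >= QUERY_START_TIME
and query <> QUERY_ID;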
  • user_id
select userid 
from stl_query 
where query = QUERY_ID;
  • query group
select label 
from stl_query 
where query = QUERY_ID;
  • query database
select database 
from stl_query
where query = QUERY_ID;
  • query aborted
select aborted 
from stl_query
where query = QUERY_ID;
  • explain steps (this requires some text cleanup)
select nodeid
   , plannode 
from stl_explain 
where query = QUERY_ID;
  • query-planner alerts
select event 
from stl_alert_event_log 
where query = QUERY_ID;
  • query-planner solutions
select solution 
from stl_alert_event_log
where query = QUERY_ID;
  • number of concurrent ddl statements
select count(*) 
from stl_ddltext 
where starttime >= QUERY_START_TIME
and endtime <= QUERY_END_TIME;
  • from table column encoding (PostgreSQL only)
select json_object_agg(columnname, encoding) 
from table_info 
where tablename = TABLE_NAME;
  • from table column sortkey (PostgreSQL only)
select json_object_agg(columnname, sortkey) 
from table_info 
where tablename = TABLE_NAME;
  • from table column distkey (PostgreSQL only)
select json_object_agg(columnname, distkey) 
from table_info 
where tablename = TABLE_NAME;
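json_object_agg exists only in PostgreSQL, so the three queries above assume table_info is an export of pg_table_def (or a view over it) into a PostgreSQL database. To pull the same per-column metadata directly in Redshift (note pg_table_def only lists tables in the current search_path):
select "column"
  , encoding
  , sortkey
  , distkey
from pg_table_def
where tablename = TABLE_NAME;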
  • join table column encoding (same pattern as the from-table queries above, run for each joined-in table)
...
  • join table column sortkey
...
  • join table column distkey
...
  • table constraints—i.e., foreign-to-primary key mappings
-- in redshift
create view table_constraints as (
  select 
    trim(c.conname)         as constraint_name
    , trim(cn.nspname)      as constraint_schema
    , trim(cc.relname)      as constraint_table_name
    , trim(ca.attname)      as constraint_column_name
    , current_database()    as table_catalog
    , trim(fn.nspname)      as table_schema
    , trim(fc.relname)      as table_name
    , trim(fa.attname)      as column_name
    , decode(c.contype
      , 'p', 'PRIMARY KEY'
      , 'u', 'UNIQUE'
      , 'f', 'FOREIGN KEY') as constraint_type
    from pg_constraint      as c
    left join pg_attribute  as ca 
      on c.conrelid = ca.attrelid and ca.attnum = any(c.conkey)
    left join pg_attribute  as fa 
      on c.confrelid = fa.attrelid and fa.attnum = any(c.confkey)
    left join pg_class      as cc 
      on ca.attrelid = cc.oid
    left join pg_class      as fc 
      on c.confrelid = fc.oid
    left join pg_namespace  as cn 
      on c.connamespace = cn.oid
    left join pg_namespace  as fn 
      on fc.relnamespace = fn.oid
    where c.contype in ('p','u','f')
    and cc.relkind = 'r'
    and not ca.attisdropped
    order by constraint_schema
      , constraint_name
      , decode(c.contype,'p',1,'u',2,'f',3)
      , ca.attnum);

-- in postgresql
select json_object_agg(constraint_table_name || '.' || constraint_column_name, table_name || '.' || column_name)
from table_constraints 
where constraint_table_name = TABLE_NAME;
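A row-per-mapping sketch that runs directly in Redshift, with no json_object_agg, restricted to foreign keys:
select constraint_table_name || '.' || constraint_column_name as fk_column
  , table_name || '.' || column_name as referenced_column
from table_constraints
where constraint_type = 'FOREIGN KEY'
and constraint_table_name = TABLE_NAME;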
  • number of leader-node operations
-- locus = 1 marks the leader node per the STL_PLAN_INFO docs (0 = compute node)
select count(case when locus = 1 then 1 end)
from stl_plan_info 
where query = QUERY_ID;
  • percent of operations on leader node
select 1.0 * count(case when locus = 1 then 1 end) / nullif(count(*), 0) 
from stl_plan_info 
where query = QUERY_ID;
  • max number of rows processed on leader node
select max(case when locus = 1 then rows else 0 end) 
from stl_plan_info 
where query = QUERY_ID;
  • max number of bytes processed on leader node
select max(case when locus = 1 then bytes else 0 end) 
from stl_plan_info 
where query = QUERY_ID;
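The four stl_plan_info features above can also be collected in a single scan; a sketch:
select count(case when locus = 1 then 1 end) as leader_ops
  , 1.0 * count(case when locus = 1 then 1 end) / nullif(count(*), 0) as leader_pct
  , max(case when locus = 1 then rows else 0 end) as leader_max_rows
  , max(case when locus = 1 then bytes else 0 end) as leader_max_bytes
from stl_plan_info
where query = QUERY_ID;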
  • was task diskbased (PostgreSQL only)
select json_object_agg(distinct tasknum, is_diskbased) 
from stl_save 
where query = QUERY_ID;
  • working memory used for task (PostgreSQL only)
select json_object_agg(distinct step, workmem) 
from stl_save 
where query = QUERY_ID;
  • total number of steps in query
select count(*) 
from stl_explain 
where query = QUERY_ID;
  • types of steps in query
select json_object_agg(tasknum, type) 
from stl_scan 
where query = QUERY_ID;
  • impact of filter(s)
-- rows_pre_filter >= rows, so this difference is the row count removed by filtering
select json_object_agg(tasknum, (rows_pre_filter - rows)) 
from stl_scan 
where query = QUERY_ID;
  • is vacuum needed
-- rows_pre_filter - rows_pre_user_filter = deleted (ghost) rows still being scanned
select json_object_agg(tasknum, (rows_pre_filter - rows_pre_user_filter)) 
from stl_scan 
where query = QUERY_ID;
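The three stl_scan aggregations can likewise come from a single row-level pass, leaving the JSON assembly to the client; a sketch:
select tasknum
  , type
  , rows_pre_filter - rows as rows_removed_by_filters
  , rows_pre_filter - rows_pre_user_filter as deleted_rows_scanned
from stl_scan
where query = QUERY_ID;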