Demonstrates how to create impala table from parquet data
#! /bin/bash
# script that creates impala table from parquet file
# prerequisite: convert avro to parquet
mtype=default
db_name=user
table_name=$mtype_metrics
impala_daemon=impala-host.rsol.org
data_path=/data/uer/parquet
# take a smaple file for schema inference
file=" $( hdfs dfs -ls $data_path /$mtype .parquet/part-r-00000-* .gz.parquet | awk -F " " ' {print $8}' | head) "
echo using this $file for schema inference
impala-shell --ssl -k -V -i $impala_daemon << EOF
create external table $db_name .$table_name LIKE PARQUET '$file ' stored as PARQUET;
load data inpath '$data_path /$mtype .parquet' into table $db_name .$table_name ;
EOF
impala-shell --ssl -k -V -i $impala_daemon -q " describe $db_name .$table_name ;"
Create impala from Guardium csv data
echo " staging results to imapal staging directory"
hdfs dfs -cp $HDFS_RESULTS_DIR /part-00000 $IMPALA_STAGING_PATH
# Testing impala connection
impala-shell -k -i $SERVER_NAME << EOF
select count(*) from $DB_NAME .$TABLE_NAME ;
EOF
# Drop and recreate table with new data
impala-shell -k -i $SERVER_NAME << EOF
DROP TABLE IF EXISTS $DB_NAME .$TABLE_NAME ;
CREATE EXTERNAL TABLE $DB_NAME .$TABLE_NAME (
Log_Timestamp String,
ClientIP String,
OSUser String,
DBUserName String,
ServerIP String,
DBProtocol String,
NetworkProtocol String,
SourceProgram String,
ServiceName String,
ObjectName String,
SQLVerb String,
FullSql String,
DatabaseName String,
Succeeded String,
ReturnedData String,
SessionStart String,
FullSQLID String,
RecordsAffected String,
RecordsAffected_desc String,
ReturnedDataCount String,
Log_Values String,
Log_Timestamp2 String,
Log_SessionId String,
AccessId String,
ClientHostName String,
AnalyzedClientIP String,
UidChain String,
ProcessID String,
GlobalId String,
BindVariablesValues String
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
LOCATION '$IMPALA_STAGING_PATH '
tblproperties ("skip.header.line.count"="1");
select count(*) from $DB_NAME .$TABLE_NAME ;
EOF
exit