CREATE EXTERNAL TABLE IF NOT EXISTS s3_logs.mybucket(
BucketOwner STRING,
Bucket STRING,
RequestDateTime DATE,
RemoteIP STRING,
Requester STRING,
RequestID STRING,
Operation STRING,
Key STRING,
RequestURI_operation STRING,
RequestURI_key STRING,
RequestURI_httpProtoversion STRING,
HTTPstatus STRING,
ErrorCode STRING,
BytesSent STRING,
ObjectSize STRING,
TotalTime STRING,
TurnAroundTime STRING,
Referrer STRING,
UserAgent STRING,
VersionId STRING
) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
'serialization.format' = '1',
'input.regex' = '([^ ]*) ([^ ]*) \\[(.*?)\\] ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) \\\"([^ ]*) ([^ ]*) (- |[^ ]*)\\\" (-|[0-9]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) (\"[^\"]*\") ([^ ]*)$'
) LOCATION 's3://mybucket/logs/';
SELECT count(*) AS count,
operation,
useragent
FROM mybucket
WHERE requestdatetime LIKE '%/Dec/2016:%'
GROUP BY operation, useragent
ORDER BY count DESC;
パーティション区切って集計範囲指定しないとお金が 💸 でも日付がパース出来ない
https://aws.amazon.com/jp/blogs/big-data/analyzing-data-in-s3-using-amazon-athena/
https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Types#LanguageManualTypes-TimestampstimestampTimestamps