Last active
September 27, 2016 16:53
-
-
Save milimetric/301d7fec9a3234029c0f316e166301f5 to your computer and use it in GitHub Desktop.
Queries to get simple metrics from mediawiki_history
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
select substring(event_timestamp, 0, 8) day, | |
count(*) `All namespaces`, | |
sum(if( page_namespace_latest = 0 | |
,1, 0)) `Namespace Zero`, | |
sum(if( page_namespace_latest = 0 | |
and revision_deleted_timestamp is null | |
,1, 0)) `Namespace Zero not Deleted` | |
from milimetric.mediawiki_history | |
where event_entity = 'revision' | |
and event_type = 'create' | |
group by substring(event_timestamp, 0, 8) | |
order by day | |
limit 10000000 | |
; | |
select substring(event_timestamp, 0, 8) day, | |
count(*) `All namespaces`, | |
sum(if( page_namespace_latest = 0 | |
,1, 0)) `Namespace Zero`, | |
sum(if( page_namespace_latest = 0 | |
and revision_deleted_timestamp is null | |
,1, 0)) `Namespace Zero not Deleted` | |
from milimetric.mediawiki_history | |
where event_entity = 'page' | |
and event_type = 'create' | |
group by substring(event_timestamp, 0, 8) | |
order by day | |
limit 10000000 | |
; | |
CREATE EXTERNAL TABLE `mediawiki_history`( | |
wiki_db string, | |
event_entity string, | |
event_type string, | |
event_timestamp string, | |
event_comment string, | |
event_user_id bigint, | |
event_user_text string, | |
event_user_text_latest string, | |
event_user_blocks array<string>, | |
event_user_blocks_latest array<string>, | |
event_user_groups array<string>, | |
event_user_groups_latest array<string>, | |
event_user_creation_timestamp string, | |
page_id bigint, | |
page_title string, | |
page_title_latest string, | |
page_namespace int, | |
page_namespace_latest int, | |
page_creation_timestamp string, | |
user_id bigint, | |
user_text string, | |
user_text_latest string, | |
user_blocks array<string>, | |
user_blocks_latest array<string>, | |
user_groups array<string>, | |
user_groups_latest array<string>, | |
user_creation_timestamp string, | |
revision_id bigint, | |
revision_parent_id bigint, | |
revision_minor_edit boolean, | |
revision_text_bytes bigint, | |
revision_text_bytes_diff bigint, | |
revision_text_sha1 string, | |
revision_content_model string, | |
revision_content_format string, | |
revision_is_deleted boolean, | |
revision_deleted_timestamp string, | |
revision_is_identity_reverted boolean, | |
revision_most_recent_identity_revert_timestamp string, | |
revision_is_identity_revert boolean | |
) | |
ROW FORMAT SERDE | |
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' | |
STORED AS INPUTFORMAT | |
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' | |
OUTPUTFORMAT | |
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' | |
LOCATION | |
'hdfs://analytics-hadoop/user/joal/mwhist_3/denorm_2' | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment