-- original table
CREATE TABLE hackernews_copy
(
`id` String,
`doc_id` String,
`comment` String,
`vector` Array(Float32),
`node_info` Tuple(start Nullable(UInt64), end Nullable(UInt64)),
`metadata` String,
`type` Enum8('story' = 1, 'comment' = 2, 'poll' = 3, 'pollopt' = 4, 'job' = 5),
`by` LowCardinality(String),
`time` DateTime,
`title` String,
`post_score` Int32,
`dead` UInt8,
`deleted` UInt8,
`length` UInt32,
`parent` UInt32,
`kids` Array(UInt32)
)
ENGINE = MergeTree
ORDER BY (toDate(time), length, post_score)
-- hold text and by in memory for fast lookup - around 8GiB
CREATE DICTIONARY hacker_hierarchy
(
`id` UInt64,
`text` String,
`by` String
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(QUERY 'SELECT id, text, by FROM default.hackernews_copy'))
LIFETIME(MIN 0 MAX 0)
LAYOUT(HASHED_ARRAY())
-- create table
CREATE TABLE hackernews
(
`id` String,
`doc_id` String,
`comment` String,
`text` String,
`vector` Array(Float32),
`node_info` Tuple(start Nullable(UInt64), end Nullable(UInt64)),
`metadata` String,
`type` Enum8('story' = 1, 'comment' = 2, 'poll' = 3, 'pollopt' = 4, 'job' = 5),
`by` LowCardinality(String),
`time` DateTime,
`title` String,
`post_score` Int32,
`dead` UInt8,
`deleted` UInt8,
`length` UInt32,
`parent` UInt32,
`kids` Array(UInt32)
)
ENGINE = MergeTree
ORDER BY (toDate(time), length, post_score)
-- insert from copy, grabbing parent and children text from dictionary
INSERT INTO hackernews SELECT
id,
id AS doc_id,
comment,
if(parent != 0, dictGet(hacker_hierarchy, 'by', parent) || ':' || dictGet(hacker_hierarchy, 'text', parent) || '\n', '') || by || ': ' || comment || '\n' || arrayStringConcat(arrayMap(k -> dictGet(hacker_hierarchy, 'by', k) || ': ' || dictGet(hacker_hierarchy, 'text', k), kids), '\n') as text,
vector,
(0, 0) AS node_info,
toJSONString(CAST((deleted, type, by, time, dead, parent, poll, kids, url, score, title, parts, descendants), 'Tuple(deleted UInt8, type Enum8(\'story\' = 1, \'comment\' = 2, \'poll\' = 3, \'pollopt\' = 4, \'job\' = 5), by String, time DateTime, dead UInt8, parent UInt32, poll UInt32, kids Array(UInt32), url String, post_score Int32, title String, parts Array(UInt32), descendants Int32)')) AS metadata, type, by, time, title, score as post_score, dead, deleted, length(tokens(text)) as length, parent, kids
FROM hackernews_copy
Created
February 21, 2024 15:58
-
-
Save gingerwizard/fc18cf5a3b9945b626cecb88a2deaa49 to your computer and use it in GitHub Desktop.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment