Feat/benchmark #273

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 5 commits, Jun 5, 2024
3 changes: 2 additions & 1 deletion .gitignore
@@ -8,4 +8,5 @@ src/**/.antlr
coverage
.idea
gen/
src/**/*.iml
src/**/*.iml
benchmark/reports/*
113 changes: 113 additions & 0 deletions benchmark/data/flink/create.sql
@@ -0,0 +1,113 @@
CREATE TABLE MyTable (`user_id` BIGINT, `name` STRING) WITH ('connector' = 'oracle-x');

CREATE TABLE MyTable WITH ('connector' = 'oracle-x');

CREATE TEMPORARY TABLE client_errors (
log_time TIMESTAMP(3),
request_line STRING,
status_code STRING,
size INT
) WITH (
'connector' = 'stream-x'
);

-- Although the official documentation's BNF does not cover creating temporary tables, it is in fact supported
CREATE TEMPORARY TABLE MyTable (`user_id` BIGINT, `name` STRING) WITH ('connector' = 'oracle-x');

CREATE TABLE MyTable (
`user_id` BIGINT,
`name` STRING,
`timestamp` BIGINT METADATA, -- part of the query-to-sink schema
`offset` BIGINT METADATA VIRTUAL, -- not part of the query-to-sink schema
`record_time` TIMESTAMP(3) WITH LOCAL TIME ZONE METADATA FROM 'timestamp' -- reads and writes a Kafka record's timestamp
) WITH ('connector' = 'kafka');

CREATE TABLE MyTable (
`user_id` BIGINT,
`price` DOUBLE,
`quantity` DOUBLE,
`cost` AS price * quantity -- evaluate expression and supply the result to queries
) WITH ('connector' = 'kafka');

CREATE TABLE MyTable (
`user` BIGINT,
product STRING,
order_time TIMESTAMP(3),
WATERMARK FOR order_time AS order_time - INTERVAL '5' SECOND
) WITH ('connector' = 'kafka');

CREATE TABLE MyTable (id INT, PRIMARY KEY (id) NOT ENFORCED) WITH ('connector' = 'kafka');

CREATE TABLE tbl1 (
a BIGINT,
h VARCHAR,
g AS 2 * (a + 1),
ts AS toTimestamp(b, 'yyyy-MM-dd HH:mm:ss'),
b VARCHAR,
proc AS PROCTIME(),
meta STRING METADATA,
my_meta STRING METADATA FROM 'meta',
my_meta STRING METADATA FROM 'meta' VIRTUAL,
meta STRING METADATA VIRTUAL,
PRIMARY KEY (a, b) NOT ENFORCED
) PARTITIONED BY (a, h) WITH (
'connector' = 'kafka',
'kafka.topic' = 'log.test'
);

CREATE TABLE Orders_in_file (
`user` BIGINT,
product STRING,
order_time_string STRING,
order_time AS to_timestamp(order_time_string)
) PARTITIONED BY (`user`) WITH (
'connector' = 'filesystem',
'path' = '...'
);

CREATE TABLE Orders_with_watermark (
id INT,
-- Add watermark definition
WATERMARK FOR order_time AS order_time - INTERVAL '5' SECOND
) WITH (
-- Overwrite the startup-mode
'scan.startup.mode' = 'latest-offset'
) LIKE Orders_in_file (
-- Exclude everything besides the computed columns which we need to generate the watermark for.
-- We do not want to have the partitions or filesystem options as those do not apply to kafka.
EXCLUDING ALL
INCLUDING GENERATED
);

CREATE TABLE my_ctas_table WITH ('connector' = 'kafka')
AS SELECT
id,
name,
age
FROM
source_table
WHERE
mod(id, 10) = 0;

CREATE TABLE catalog1.db1.table1 (id INT) WITH ('connector' = 'kafka');

CREATE TABLE catalog1.db1.table1 (
attr0 STRING,
attr1 BOOLEAN,
attr3 DECIMAL(38, 18),
attr4 TINYINT,
attr5 SMALLINT,
attr6 INT,
attr7 BIGINT,
attr8 FLOAT,
attr9 DOUBLE,
attr10 DATE,
attr11 TIME,
attr12 TIMESTAMP(3),
attr13 ARRAY<STRING>,
attr14 ROW<attr15 FLOAT, attr16 TIMESTAMP(3)>,
attr17 MAP<INT, BIGINT>,
name1 VARCHAR(64),
message ROW<data ROW<UPO_TIMESTAMP VARCHAR(20)>>,
`raw` RAW('class', 'snapshot')
) WITH ('connector' = 'kafka');
122 changes: 122 additions & 0 deletions benchmark/data/flink/select.sql
@@ -0,0 +1,122 @@
-- Window TVF Aggregation
SELECT
window_start,
window_end,
supplier_id,
SUM(price) as price
FROM TABLE(
TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES))
GROUP BY window_start, window_end, GROUPING SETS ((supplier_id), ());

SELECT
window_start,
window_end,
supplier_id,
SUM(price) as price
FROM TABLE(
TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES))
GROUP BY window_start, window_end, ROLLUP (supplier_id);

SELECT
window_start,
window_end,
item, supplier_id,
SUM(price) as price
FROM TABLE(
TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES))
GROUP BY window_start, window_end, CUBE (supplier_id, item);

-- GROUPING SETS
SELECT
window_start,
window_end,
supplier_id,
SUM(price) as price
FROM TABLE(
TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES))
GROUP BY
window_start,
window_end,
GROUPING SETS ((supplier_id), ());

SELECT
window_start,
window_end,
supplier_id,
SUM(price) as price
FROM TABLE(
TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES))
GROUP BY
window_start,
window_end,
ROLLUP (supplier_id);

SELECT
window_start,
window_end,
item,
supplier_id,
SUM(price) as price
FROM TABLE(
TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES))
GROUP BY
window_start,
window_end,
CUBE (supplier_id, item);

-- Group Window Aggregation
SELECT
`user`,
TUMBLE_START(order_time, INTERVAL '1' DAY) AS wStart,
SUM(amount) FROM Orders
GROUP BY
TUMBLE(order_time, INTERVAL '1' DAY),
`user`;

SELECT
`user`,
TUMBLE_START(order_time, INTERVAL '1' DAY) AS wStart,
SUM(amount) FROM Orders
GROUP BY
HOP(order_time, INTERVAL '1' DAY),
`user`;

SELECT
`user`,
TUMBLE_START(order_time, INTERVAL '1' DAY) AS wStart,
SUM(amount) FROM Orders
GROUP BY
SESSION(order_time, INTERVAL '1' DAY),
`user`;

-- Having
SELECT SUM(amount)
FROM Orders
GROUP BY `users`
HAVING SUM(amount) > 50;

-- Over Aggregation
SELECT order_id, order_time, amount,
SUM(amount) OVER (
PARTITION BY product
ORDER BY order_time
RANGE BETWEEN INTERVAL '1' HOUR PRECEDING AND CURRENT ROW
) AS one_hour_prod_amount_sum
FROM Orders;

SELECT product, order_time, amount,
SUM(amount) OVER (
PARTITION BY product
ORDER BY order_time
ROWS BETWEEN 5 PRECEDING AND CURRENT ROW
) AS one_hour_prod_amount_sum
FROM source_table;

SELECT order_id, order_time, amount,
SUM(amount) OVER w AS sum_amount,
AVG(amount) OVER w AS avg_amount
FROM Orders
WINDOW w AS (
PARTITION BY product
ORDER BY order_time
RANGE BETWEEN INTERVAL '1' HOUR PRECEDING AND CURRENT ROW);
100 changes: 100 additions & 0 deletions benchmark/data/hive/create.sql
@@ -0,0 +1,100 @@
CREATE TEMPORARY TABLE list_bucket_multiple (col1 STRING, col2 INT, col3 STRING);

CREATE TEMPORARY EXTERNAL TABLE list_bucket_multiple (col1 STRING, col2 INT, col3 STRING);

CREATE TEMPORARY EXTERNAL TABLE IF NOT EXISTS list_bucket_multiple (col1 STRING, col2 INT, col3 STRING);

CREATE TEMPORARY EXTERNAL TABLE IF NOT EXISTS list_bucket_multiple (col1 STRING, col2 INT, col3 STRING) COMMENT 'this is a comment';

CREATE TEMPORARY EXTERNAL TABLE IF NOT EXISTS list_bucket_multiple (col1 STRING, col2 INT, col3 STRING) COMMENT 'this is a comment1' PARTITIONED BY (`date` STRING COMMENT 'column_comment');

CREATE TEMPORARY EXTERNAL TABLE IF NOT EXISTS list_bucket_multiple (col1 STRING, col2 INT, col3 STRING) COMMENT 'this is a comment2' PARTITIONED BY (`date` STRING COMMENT 'column_comment') CLUSTERED BY (col1, col2) INTO 32 BUCKETS;

CREATE TEMPORARY EXTERNAL TABLE IF NOT EXISTS list_bucket_multiple (col1 STRING, col2 INT, col3 STRING) COMMENT 'this is a comment3' PARTITIONED BY (`date` STRING COMMENT 'column_comment') CLUSTERED BY (col1, col2) SORTED BY (col1 ASC) INTO 22 BUCKETS;

CREATE TEMPORARY EXTERNAL TABLE IF NOT EXISTS list_bucket_multiple (col1 STRING, col2 INT, col3 STRING) COMMENT 'this is a comment4' PARTITIONED BY (`date` STRING COMMENT 'column_comment') CLUSTERED BY (col1, col2) SORTED BY (col1 ASC) INTO 34 BUCKETS SKEWED BY (col1, col2) ON (('s1', 1), ('s3', 3), ('s13', 13), ('s78', 78)) STORED AS DIRECTORIES;

CREATE TABLE page_view(
viewTime INT,
userid BIGINT,
page_url STRING,
referrer_url STRING,
ip STRING COMMENT 'IP Address of the User'
) COMMENT 'This is the page view table' PARTITIONED BY(dt STRING, country STRING) CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' COLLECTION ITEMS TERMINATED BY '\002' MAP KEYS TERMINATED BY '\003' STORED AS SEQUENCEFILE;

CREATE TEMPORARY EXTERNAL TABLE page_view(
viewTime INT,
userid BIGINT,
page_url STRING,
referrer_url STRING,
ip STRING COMMENT 'IP Address of the User'
) COMMENT 'This is the page view table' PARTITIONED BY(dt STRING, country STRING) CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' COLLECTION ITEMS TERMINATED BY '\002' MAP KEYS TERMINATED BY '\003' STORED AS TEXTFILE;

CREATE TEMPORARY EXTERNAL TABLE IF NOT EXISTS page_view(
viewTime INT,
userid BIGINT,
page_url STRING,
referrer_url STRING,
ip STRING COMMENT 'IP Address of the User'
) COMMENT 'This is the page view table' PARTITIONED BY(dt STRING, country STRING) CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' COLLECTION ITEMS TERMINATED BY '\002' MAP KEYS TERMINATED BY '\003' STORED AS RCFILE;

CREATE TEMPORARY EXTERNAL TABLE IF NOT EXISTS page_view(
viewTime INT,
userid BIGINT,
page_url STRING,
referrer_url STRING,
ip STRING COMMENT 'IP Address of the User'
) COMMENT 'This is the page view table' PARTITIONED BY(dt STRING, country STRING) CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' COLLECTION ITEMS TERMINATED BY '\002' MAP KEYS TERMINATED BY '\003' STORED AS ORC LOCATION '/hsd_path';

CREATE TEMPORARY EXTERNAL TABLE IF NOT EXISTS page_view(
viewTime INT,
userid BIGINT,
page_url STRING,
referrer_url STRING,
ip STRING COMMENT 'IP Address of the User'
) COMMENT 'This is the page view table' PARTITIONED BY(dt STRING, country STRING) CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' COLLECTION ITEMS TERMINATED BY '\002' MAP KEYS TERMINATED BY '\003' STORED AS PARQUET LOCATION '/hsd_path' AS
SELECT
(key % 1024) new_key,
concat(key, value) key_value_pair
FROM
key_value_store SORT BY new_key,
key_value_pair;


CREATE TABLE list_bucket_single (key STRING, value STRING)
SKEWED BY (key) ON (1,5,6) STORED AS AVRO;

CREATE TRANSACTIONAL TABLE transactional_table_test(key STRING, value STRING) PARTITIONED BY(ds STRING) STORED AS INPUTFORMAT 'inputfilename' OUTPUTFORMAT 'outputfilename';

CREATE TABLE IF NOT EXISTS copy_table LIKE origin_table;

CREATE TEMPORARY TABLE IF NOT EXISTS copy_table LIKE origin_table;

CREATE TEMPORARY EXTERNAL TABLE IF NOT EXISTS copy_table LIKE origin_table;

CREATE TEMPORARY EXTERNAL TABLE IF NOT EXISTS copy_table LIKE origin_table LOCATION '/hdfs_path';

CREATE TABLE IF NOT EXISTS derived_table AS
SELECT
*
FROM
origin_table;

CREATE TABLE `mydb.t1`(
`id` INT,
`dept_no` INT,
`addr` STRING,
`tel` STRING,
`hobby` ARRAY<STRING>,
`add` MAP<STRING, STRING>
) PARTITIONED BY(`date` STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' COLLECTION ITEMS TERMINATED BY '-' MAP KEYS TERMINATED BY ':';

CREATE EXTERNAL TABLE mydb.ext_table(
id INT,
name STRING,
hobby ARRAY<STRING>,
add MAP<STRING, STRING>
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' COLLECTION ITEMS TERMINATED BY '-' MAP KEYS TERMINATED BY ':' LOCATION '/user/mydb/ext_table' TBLPROPERTIES('author' = 'hayden', 'desc' = '一个外部测试表');
12 changes: 12 additions & 0 deletions benchmark/data/hive/params.json
@@ -0,0 +1,12 @@
{
  "create": {
    "validate": ["$sql"],
    "getAllTokens": ["$sql"],
    "getAllEntities": ["$sql", { "lineNumber": 8, "column": 1 }]
  },
  "select": {
    "validate": ["$sql"],
    "getAllTokens": ["$sql"],
    "getAllEntities": ["$sql", { "lineNumber": 8, "column": 1 }]
  }
}
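
For context, params.json maps each benchmark fixture (create, select) to the parser methods it exercises: "$sql" stands in for the SQL file contents, and the object argument supplies a caret position for getAllEntities. Below is a minimal sketch of how a runner might consume this configuration; the runBenchmark helper, its file layout, and the parser shape are illustrative assumptions, not code taken from this PR.

import { readFileSync } from 'fs';
import { performance } from 'perf_hooks';

// Argument values as they appear in params.json: either the "$sql"
// placeholder / a literal string, or a caret position object.
type Arg = string | { lineNumber: number; column: number };
type Params = Record<string, Record<string, Arg[]>>;

// Hypothetical benchmark runner: for each fixture (create.sql, select.sql),
// substitute "$sql" with the file contents, invoke the named parser method
// (validate, getAllTokens, getAllEntities), and report the elapsed time.
// `parser` stands for a dt-sql-parser language instance; its exact class
// name is assumed here, not taken from this PR.
function runBenchmark(
  parser: Record<string, (...args: unknown[]) => unknown>,
  sqlDir: string,
  params: Params,
): void {
  for (const [fixture, methods] of Object.entries(params)) {
    const sql = readFileSync(`${sqlDir}/${fixture}.sql`, 'utf-8');
    for (const [method, args] of Object.entries(methods)) {
      const resolved = args.map((a) => (a === '$sql' ? sql : a));
      const start = performance.now();
      parser[method](...resolved);
      console.log(`${fixture}.${method}: ${(performance.now() - start).toFixed(2)} ms`);
    }
  }
}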