使用 https://github.com/taowen/es-monitor 可以用 SQL 进行 elasticsearch 的查询。传统的 SQL 的语义是每一层查询是对下一层查询的综合,也就是每SELECT一层行数就会变少。比如
SELECT sum(per_sector) AS total FROM (
SELECT sector, count(*) AS per_sector FROM (
SELECT sector, ipo_year FROM symbol
)
)
Elasticsearch是不支持这样的嵌套SELECT的。它支持一个更实用的功能,嵌套下钻(Drill Down)。与传统SQL的语义正好相反,Elasticsearch的嵌套不是SELECT FROM 而是 SELECT INSIDE,每经过一层SELECT其行数不减少反而增加。所以我把Elasticsearch的这种嵌套查询不称之为SELECT FROM而是SELECT INSIDE以示区别,当然为了习惯,用SELECT FROM也是支持的。
SELECT INSIDE all_symbols GROUP BY ipo_year
SQL
$ cat << EOF | ./es_query.py http://127.0.0.1:9200
WITH all_symbols AS (SELECT MAX(market_cap) AS max_all_times FROM symbol);
WITH per_ipo_year AS (SELECT ipo_year, MAX(market_cap) AS max_this_year INSIDE all_symbols
GROUP BY ipo_year LIMIT 2)
EOF
第一行SELECT定义了all_symbols,第二行SELECT是在第一次查询的基础之上进行自己的二次聚合。这样多层SELECT的好处是每一层可以定义自己的SELECT字段。比如GROUP BY a,b,c
其实做了三次下钻,但是限于SQL的语法这样写是没法给没一层下钻的分桶计算指标,但是用上面这种WITH的语法,就可以每钻一层计算一层的指标。
{"_bucket_path": ["per_ipo_year"], "max_this_year": 54171930444.0, "max_all_times": 522690000000.0, "ipo_year": 2014}
{"_bucket_path": ["per_ipo_year"], "max_this_year": 5416144671.0, "max_all_times": 522690000000.0, "ipo_year": 2015}
Elasticsearch
{
"aggs": {
"max_all_times": {
"max": {
"field": "market_cap"
}
},
"ipo_year": {
"terms": {
"field": "ipo_year",
"size": 2
},
"aggs": {
"max_this_year": {
"max": {
"field": "market_cap"
}
}
}
}
},
"size": 0
}
下钻的概念直接看Elasticsearch的查询其实更清楚。每下钻一层,括号就往右深了一层。从某种意义上来说,Elasticsearch的DSL其实更接近聚合的实质。
{
"hits": {
"hits": [],
"total": 6714,
"max_score": 0.0
},
"_shards": {
"successful": 1,
"failed": 0,
"total": 1
},
"took": 2,
"aggregations": {
"max_all_times": {
"value": 522690000000.0
},
"ipo_year": {
"buckets": [
{
"max_this_year": {
"value": 54171930444.0
},
"key": 2014,
"doc_count": 390
},
{
"max_this_year": {
"value": 5416144671.0
},
"key": 2015,
"doc_count": 334
}
],
"sum_other_doc_count": 2174,
"doc_count_error_upper_bound": 0
}
},
"timed_out": false
}
Profile
[
{
"query": [
{
"query_type": "MatchAllDocsQuery",
"lucene": "*:*",
"time": "0.2003040000ms",
"breakdown": {
"score": 0,
"create_weight": 9025,
"next_doc": 162380,
"match": 0,
"build_scorer": 28899,
"advance": 0
}
}
],
"rewrite_time": 2523,
"collector": [
{
"name": "MultiCollector",
"reason": "search_multi",
"time": "2.325354000ms",
"children": [
{
"name": "TotalHitCountCollector",
"reason": "search_count",
"time": "0.2740410000ms"
},
{
"name": "BucketCollector: [[max_all_times, ipo_year]]",
"reason": "aggregation",
"time": "1.295439000ms"
}
]
}
]
}
]
SELECT INSIDE 多次
SQL
$ cat << EOF | ./es_query.py http://127.0.0.1:9200
WITH all_symbols AS (SELECT MAX(market_cap) AS max_all_times FROM symbol);
WITH per_ipo_year AS (SELECT ipo_year, MAX(market_cap) AS max_this_year INSIDE all_symbols
GROUP BY ipo_year LIMIT 2);
WITH per_sector AS (SELECT sector, MAX(market_cap) AS max_this_sector INSIDE per_ipo_year
GROUP BY sector LIMIT 2)
EOF
这个和GROUP BY ipo_year, sector
其实是差不多的,区别在于对每一层下钻都可以选取这一层的指标出来。
{"sector": "Health Care", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 54171930444.0, "ipo_year": 2014, "max_all_times": 522690000000.0, "max_this_sector": 2660000000.0}
{"sector": "Finance", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 54171930444.0, "ipo_year": 2014, "max_all_times": 522690000000.0, "max_this_sector": 5530000000.0}
{"sector": "Finance", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 5416144671.0, "ipo_year": 2015, "max_all_times": 522690000000.0, "max_this_sector": 2740000000.0}
{"sector": "Health Care", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 5416144671.0, "ipo_year": 2015, "max_all_times": 522690000000.0, "max_this_sector": 5416144671.0}
Elasticsearch
{
"aggs": {
"max_all_times": {
"max": {
"field": "market_cap"
}
},
"ipo_year": {
"terms": {
"field": "ipo_year",
"size": 2
},
"aggs": {
"sector": {
"terms": {
"field": "sector",
"size": 2
},
"aggs": {
"max_this_sector": {
"max": {
"field": "market_cap"
}
}
}
},
"max_this_year": {
"max": {
"field": "market_cap"
}
}
}
}
},
"size": 0
}
{
"hits": {
"hits": [],
"total": 6714,
"max_score": 0.0
},
"_shards": {
"successful": 1,
"failed": 0,
"total": 1
},
"took": 8,
"aggregations": {
"max_all_times": {
"value": 522690000000.0
},
"ipo_year": {
"buckets": [
{
"sector": {
"buckets": [
{
"max_this_sector": {
"value": 2660000000.0
},
"key": "Health Care",
"doc_count": 104
},
{
"max_this_sector": {
"value": 5530000000.0
},
"key": "Finance",
"doc_count": 70
}
],
"sum_other_doc_count": 216,
"doc_count_error_upper_bound": 0
},
"max_this_year": {
"value": 54171930444.0
},
"key": 2014,
"doc_count": 390
},
{
"sector": {
"buckets": [
{
"max_this_sector": {
"value": 2740000000.0
},
"key": "Finance",
"doc_count": 92
},
{
"max_this_sector": {
"value": 5416144671.0
},
"key": "Health Care",
"doc_count": 92
}
],
"sum_other_doc_count": 150,
"doc_count_error_upper_bound": 0
},
"max_this_year": {
"value": 5416144671.0
},
"key": 2015,
"doc_count": 334
}
],
"sum_other_doc_count": 2174,
"doc_count_error_upper_bound": 0
}
},
"timed_out": false
}
Profile
[
{
"query": [
{
"query_type": "MatchAllDocsQuery",
"lucene": "*:*",
"time": "0.2576120000ms",
"breakdown": {
"score": 0,
"create_weight": 63193,
"next_doc": 165400,
"match": 0,
"build_scorer": 29019,
"advance": 0
}
}
],
"rewrite_time": 3205,
"collector": [
{
"name": "MultiCollector",
"reason": "search_multi",
"time": "6.292688000ms",
"children": [
{
"name": "TotalHitCountCollector",
"reason": "search_count",
"time": "0.2599140000ms"
},
{
"name": "BucketCollector: [[max_all_times, ipo_year]]",
"reason": "aggregation",
"time": "5.172211000ms"
}
]
}
]
}
]
相信看过前面例子,你应该理解了什么叫下钻了。每下钻一层,前面的一行就会被再次分裂到多个桶里。每一层都可以搞自己的指标计算。