Because our project has to interact with the big-data team, I needed to parse out the tables used in a Hive SQL statement and the columns of the outermost SELECT; later on we may also need the lineage between tables and columns. Searching online turned up what looks like a single answer copied around from the same template, and it didn't really fit what I needed, so I decided to parse out what I wanted myself. The idea is simple: hand the SQL to Hive's own ParseDriver, walk the returned ASTNode tree, and collect the TOK_TABNAME, TOK_SELECT/TOK_SELECTDI, TOK_INSERT_INTO/TOK_DESTINATION and TOK_PARTVAL nodes. It's barely good enough, but I'm writing it down here for the record.
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.ParseDriver;

import java.util.*;
import java.util.stream.Collectors;

import static org.apache.hadoop.hive.ql.parse.HiveParser.*;

/**
 * @author chentiefeng
 * @date 2019/10/21 13:51
 */
@Slf4j
public class HiveSqlParse {
    private ParseDriver pd = new ParseDriver();
    /**
     * Source tables (table name, alias)
     */
    private List<String[]> sourceTable = Lists.newArrayList();
    /**
     * Insert target tables
     */
    private List<String> insertTables = Lists.newArrayList();
    /**
     * Outermost columns
     */
    private List<String> outermostColumns = Lists.newArrayList();
    /**
     * Insert partition info (partition column, partition value)
     */
    private Map<String, String> partitionMap = Maps.newHashMap();
    /**
     * Outermost SELECT node
     */
    private ASTNode outermostSelNode = null;
    /**
     * Outermost INSERT node
     */
    private ASTNode outermostInsertNode = null;
    /**
     * Stacks of tables currently being parsed
     */
    private Stack<HiveTableParseInfo> tableParseInfoSelStack = new Stack<>();
    private Stack<HiveTableParseInfo> tableParseInfoFromStack = new Stack<>();
    /**
     * Table-relationship (lineage) parse info, not including the source tables
     */
    private HiveTableParseInfo tableParseInfo = null;

    public HiveSqlParse() {
    }

    public HiveSqlParse(String sql) {
        parse(sql);
    }

    /**
     * Parse a Hive SQL statement.
     *
     * @param sql the Hive SQL to parse
     */
    public void parse(String sql) {
        try {
            ASTNode ast = pd.parse(sql);
            log.info("hiveSql={},astTree={}", sql, ast.toStringTree());
            parseNode(ast);
            if (outermostInsertNode != null) {
                insert(outermostInsertNode);
            }
            if (outermostSelNode != null) {
                outermostColumns(outermostSelNode);
            }
            // The insert target also shows up as a TOK_TABNAME while walking the tree; drop it from the source list.
            if (CollectionUtils.isNotEmpty(insertTables)) {
                sourceTable.removeIf(arr -> arr[0].equals(insertTables.get(0)));
            }
        } catch (Exception e) {
            log.error(e.getMessage(), e);
            throw new RuntimeException(e);
        }
    }

    private void parseNode(ASTNode ast) {
        if (CollectionUtils.isNotEmpty(ast.getChildren())) {
            for (Node child : ast.getChildren()) {
                ASTNode cc = (ASTNode) child;
                switch (cc.getToken().getType()) {
                    case TOK_INSERT:
                        outermostInsertNode = cc;
                        break;
                    case TOK_TABNAME:
                        String tableName = Joiner.on(".").join(cc.getChildren().stream().map(n -> ((ASTNode) n).getText()).collect(Collectors.toList()));
                        ASTNode ccChild = (ASTNode) cc.getParent().getChild(cc.getParent().getChildCount() - 1);
                        HiveTableParseInfo sourceTableParseInfo = new HiveTableParseInfo();
                        if (ccChild.getToken().getType() == TOK_TABNAME) {
                            // No alias: the table name node is the last child of its parent.
                            sourceTable.add(new String[]{tableName, ""});
                            sourceTableParseInfo.setAlias("");
                        } else {
                            sourceTable.add(new String[]{tableName, ccChild.getText()});
                            sourceTableParseInfo.setAlias(ccChild.getText());
                        }
                        sourceTableParseInfo.setName(tableName);
                        if (!tableParseInfoFromStack.empty()) {
                            tableParseInfoFromStack.pop().getTables().add(sourceTableParseInfo);
                        }
                        break;
                    case TOK_QUERY:
                        ASTNode ccc = (ASTNode) cc.getParent().getChild(cc.getParent().getChildCount() - 1);
                        if (ccc.getToken().getType() != TOK_QUERY) {
                            // Aliased subquery: start collecting its tables and columns.
                            HiveTableParseInfo table = new HiveTableParseInfo();
                            table.setAlias(ccc.getText());
                            tableParseInfoSelStack.push(table);
                            tableParseInfoFromStack.push(table);
                        }
                        break;
                    case TOK_SELECT:
                    case TOK_SELECTDI:
                        HiveTableParseInfo pop = tableParseInfoSelStack.pop();
                        if (!tableParseInfoSelStack.empty()) {
                            HiveTableParseInfo father = tableParseInfoSelStack.peek();
                            if (Objects.nonNull(father)) {
                                father.getTables().add(pop);
                            }
                        } else {
                            tableParseInfo = pop;
                        }
                        parseColumns(cc, pop);
                        continue;
                    default:
                }
                parseNode(cc);
            }
        }
    }

    private void insert(ASTNode cn) {
        if (CollectionUtils.isEmpty(cn.getChildren())) {
            return;
        }
        for (Node child : cn.getChildren()) {
            ASTNode cc = (ASTNode) child;
            switch (cc.getToken().getType()) {
                case TOK_INSERT_INTO:
                case TOK_DESTINATION:
                    insertTable(cn);
                    continue;
                case TOK_SELECT:
                    outermostSelNode = cn;
                    continue;
                default:
            }
            insert(cc);
        }
    }

    private void parseColumns(ASTNode cc, HiveTableParseInfo table) {
        for (Node node : cc.getChildren()) {
            ASTNode tokSelExpr = (ASTNode) node;
            HiveTableParseInfo.HiveTableColumnParseInfo column = new HiveTableParseInfo.HiveTableColumnParseInfo();
            String alias = getSelExprAlias(tokSelExpr);
            column.setName(alias);
            parseColumn(tokSelExpr, column);
            table.getColumns().add(column);
        }
    }

    private void parseColumn(ASTNode tokSelExpr, HiveTableParseInfo.HiveTableColumnParseInfo column) {
        if (CollectionUtils.isEmpty(tokSelExpr.getChildren())) {
            return;
        }
        for (Node child : tokSelExpr.getChildren()) {
            ASTNode cc = (ASTNode) child;
            if (cc.getToken().getType() == TOK_TABLE_OR_COL) {
                ASTNode ccc = (ASTNode) cc.getParent().getChild(cc.getParent().getChildCount() - 1);
                String[] item;
                if (ccc.getToken().getType() == TOK_TABLE_OR_COL) {
                    // Unqualified column reference.
                    item = new String[]{cc.getChild(0).getText(), ""};
                } else {
                    // "alias.column": the column name is the last child of the DOT node.
                    item = new String[]{ccc.getText(), cc.getChild(0).getText()};
                }
                Optional<String[]> any = column.getSourceList().stream().filter(s -> Arrays.equals(item, s)).findAny();
                if (!any.isPresent()) {
                    column.getSourceList().add(item);
                }
                continue;
            }
            parseColumn(cc, column);
        }
    }

    /**
     * Insert target info (table name and partition spec).
     *
     * @param cn
     */
    private void insertTable(ASTNode cn) {
        if (CollectionUtils.isEmpty(cn.getChildren())) {
            return;
        }
        for (Node child : cn.getChildren()) {
            ASTNode cc = (ASTNode) child;
            switch (cc.getToken().getType()) {
                case TOK_TABNAME:
                    String tableName = Joiner.on(".").join(cc.getChildren().stream().map(n -> ((ASTNode) n).getText()).collect(Collectors.toList()));
                    insertTables.add(tableName);
                    break;
                case TOK_PARTVAL:
                    if (cc.getChildCount() == 2) {
                        // Static partition: column and value.
                        partitionMap.put(cc.getChild(0).getText(), cc.getChild(1).getText());
                    } else {
                        // Dynamic partition: column only.
                        partitionMap.put(cc.getChild(0).getText(), null);
                    }
                    break;
                default:
            }
            insertTable(cc);
        }
    }

    /**
     * Outermost columns.
     *
     * @param cn
     */
    private void outermostColumns(ASTNode cn) {
        if (CollectionUtils.isEmpty(cn.getChildren())) {
            return;
        }
        for (Node cnChild : cn.getChildren()) {
            ASTNode cc = (ASTNode) cnChild;
            if (cc.getToken().getType() == TOK_SELEXPR) {
                String alias = getSelExprAlias(cc);
                outermostColumns.add(alias);
                continue;
            }
            outermostColumns(cc);
        }
    }

    /**
     * Resolve the output name (alias) of a select expression.
     *
     * @param cc the TOK_SELEXPR node
     * @return the alias if present, otherwise the column name
     */
    private String getSelExprAlias(ASTNode cc) {
        ASTNode child = (ASTNode) cc.getChild(cc.getChildCount() - 1);
        if (child.getToken().getType() == TOK_TABLE_OR_COL || child.getToken().getType() == DOT) {
            // No explicit alias: fall back to the column name itself.
            return child.getChild(child.getChildCount() - 1).getText();
        } else {
            return child.getText();
        }
    }

    public List<String> getOutermostColumns() {
        return outermostColumns;
    }

    public List<String> getSourceTables() {
        return sourceTable.stream().map(t -> t[0]).distinct().collect(Collectors.toList());
    }

    public String getInsertTable() {
        return CollectionUtils.isNotEmpty(insertTables) ? insertTables.get(0) : null;
    }

    public Map<String, String> getPartition() {
        return partitionMap;
    }

    public HiveTableParseInfo getTableParseInfo() {
        return tableParseInfo;
    }

    public static void main(String[] args) {
        String sql23 = "insert overwrite table risk_event partition(year='2019',dt) select t.ops as order_no,t.id_no ,concat(t.consumer_no,'aa') dd,aadx from (select concat(a.opt_id,b.opt_id) as ops,b.id_no from ods.arc_event a left outer join ods.arc_user b on a.consumer_no = b.consumer_no) t left outer join (select order_no from arc_verify where dt = '20191023') t1 on t.consumer_no = t1.consumer_no";
//        String sql23 = "insert overwrite table riskt_eventpartition select opt_id from arc_event a inner join arc_user b";
//        String sql23 = "insert overwrite table riskt_eventpartition select opt_id from arc_event";
//        String sql23 = "SELECT SUM(CASE when rcw.eventid=2 and rcw.method = 'sendevent' then 1 else 0 END) as successCnt," +
//                " SUM(CASE when rcw.eventid=4 and rcw.method = 'risklevel' then 1 else 0 END) as payCnt," +
//                " SUM(CASE when rcw.eventid=2 and rcw.method = 'sendevent' then 1 else 0 END)/SUM(CASE when rcw.eventid=4 and rcw.method = 'risklevel' then 1 else 0 END) as rate" +
//                " FROM (\n" +
//                "    SELECT DISTINCT payorderid," +
//                "    eventid," +
//                "    method" +
//                "    FROM log.pay_rc_warden_event_basic" +
//                "    WHERE dt = '20180715'" +
//                "    ) rcw";
        HiveSqlParse hiveSqlParse = new HiveSqlParse(sql23);
        System.out.println(hiveSqlParse.getSourceTables());
        System.out.println(hiveSqlParse.getOutermostColumns());
        System.out.println(hiveSqlParse.getInsertTable());
        System.out.println(hiveSqlParse.getPartition());
        System.out.println(hiveSqlParse.getTableParseInfo());
    }
}
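The listing above refers to a HiveTableParseInfo bean (and its nested HiveTableColumnParseInfo) that is not shown here. Below is a minimal sketch of what such a class could look like, inferred only from the calls the parser makes on it (setName/setAlias, getTables(), getColumns(), getSourceList()); the field names and the Lombok @Data annotation are my assumptions, not the original class.

import com.google.common.collect.Lists;
import lombok.Data;

import java.util.List;

/**
 * Hypothetical sketch of the lineage bean used by HiveSqlParse,
 * reconstructed from the accessors the parser calls; not the original class.
 */
@Data
public class HiveTableParseInfo {
    /** Table name (empty for a subquery). */
    private String name;
    /** Table or subquery alias. */
    private String alias;
    /** Columns produced by this (sub)query. */
    private List<HiveTableColumnParseInfo> columns = Lists.newArrayList();
    /** Tables or subqueries this query reads from. */
    private List<HiveTableParseInfo> tables = Lists.newArrayList();

    /**
     * One output column and the references it is derived from.
     */
    @Data
    public static class HiveTableColumnParseInfo {
        /** Output column name or alias. */
        private String name;
        /** Source references as [table alias, column name] pairs. */
        private List<String[]> sourceList = Lists.newArrayList();
    }
}

With a bean shaped like this, getTableParseInfo() ends up returning a small tree: the outermost query with its columns, each column carrying the (table alias, column) pairs it was computed from, plus a nested entry for every aliased subquery, which is the raw material for building column-level lineage later.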