trie树(字典树) DFA算法

@Service
public class SensitiveWordUtil extends TimerTask {
    // Logger for this class.
    private static final Log log = LogProxy.getLog(SensitiveWordUtil.class);
    // Minimum-match rule: stop at the first (shortest) complete word found from a start index.
    public static int minMatchTYpe = 1;
    // Maximum-match rule: keep walking the trie after a complete word to look for a longer one.
    public static int maxMatchType = 2;
    // Key stored in every trie node map that holds the end-of-word marker.
    private static String isEnd = "isEnd";
    // Marker value: the path to this node does not complete a word.
    private static String isEnd_0 = "0";
    // Marker value: the path to this node completes a word.
    private static String isEnd_1 = "1";
    // The word-list string most recently loaded by run(); compared against the
    // freshly fetched value so the trie is only rebuilt when the config changes.
    private String configKey;
    @Resource
    private ConfigService configService;
    // Passed as the third argument of configService.getConfig in run();
    // presumably the fallback word list when nothing is configured - TODO confirm.
    private String sensitiveWorld = "饿了";
    // Root of the trie used for local lookups; replaced wholesale by run().
    // NOTE(review): this static field is not volatile; if run() executes on a
    // different thread than the readers, visibility of a new map should be confirmed.
    private static ConcurrentHashMap sensitiveWordMap = new ConcurrentHashMap();

    /**
     * Timer callback that initializes or refreshes the sensitive-word trie.
     * <p>
     * Fetches the comma-separated word list from the config service. If it equals
     * the list loaded last time, nothing is rebuilt; otherwise the list is split
     * on commas, a new trie is built, and the shared lookup map is replaced.
     */
    @Override
    public void run() {
        String latestWords = configService.getConfig(ConfigTypeEnum.PROPERTY.getType(), StoreConstants.ConfigKey.SENSITIVE_WORD, sensitiveWorld);
        // Guard clause: unchanged configuration means the current trie is still valid.
        if (latestWords.equals(configKey)) {
            if (log.isDebugEnabled()) {
                log.debug("=sensitiveWord=" + configKey);
            }
            return;
        }
        configKey = latestWords;

        // De-duplicate the configured words before building the trie.
        String[] parts = latestWords.split(",");
        Set<String> words = new HashSet<String>();
        for (int idx = 0; idx < parts.length; idx++) {
            words.add(parts[idx]);
        }

        // Build the complete trie first, then publish it with a single assignment.
        sensitiveWordMap = addSensitiveWordToHashMap(words);
        log.info("=sensitiveWord=" + JSON.toJSONString(sensitiveWordMap));
    }

    /**
     * Builds a trie (DFA model) from the given sensitive words.
     * <p>
     * Every node is a map from a character to its child node map, plus an
     * {@code isEnd} entry telling whether the path to that node completes a word,
     * e.g. for "ab" and "acd":
     * {@code {a={isEnd=0, b={isEnd=1}, c={isEnd=0, d={isEnd=1}}}}}.
     * The whole map is built locally and returned so the caller can publish it
     * with one assignment.
     *
     * @param keyWordSet the sensitive words to index
     * @return the root map of the newly built trie
     */
    private ConcurrentHashMap addSensitiveWordToHashMap(Set<String> keyWordSet) {
        // Size the root by the number of words to reduce rehashing.
        ConcurrentHashMap root = new ConcurrentHashMap(keyWordSet.size());
        for (String word : keyWordSet) {
            Map node = root;
            int lastIndex = word.length() - 1;
            for (int pos = 0; pos <= lastIndex; pos++) {
                char ch = word.charAt(pos);
                Map child = (Map) node.get(ch);
                if (child == null) {
                    // First time this character appears at this depth: create a
                    // node that is, for now, not the end of any word.
                    child = new HashMap();
                    child.put(isEnd, isEnd_0);
                    node.put(ch, child);
                }
                node = child;
                if (pos == lastIndex) {
                    // Last character of the word: mark this node as a word end.
                    node.put(isEnd, isEnd_1);
                }
            }
        }
        return root;
    }

    /**
     * Tells whether the text contains at least one sensitive word.
     * <p>
     * Every index of the text is tried as a possible word start; the scan stops
     * at the first hit because one match is enough to answer the question.
     *
     * @param txt       the text to inspect; blank or {@code null} text yields {@code false}
     * @param matchType matching rule: 1 = minimum match, 2 = maximum match
     * @return {@code true} if a sensitive word is found, otherwise {@code false}
     */
    public static boolean isContaintSensitiveWord(String txt, int matchType) {
        if (StringUtils.isBlank(txt)) {
            return false;
        }
        for (int i = 0; i < txt.length(); i++) {
            // A positive length means a sensitive word starts at index i.
            if (checkSensitiveWord(txt, i, matchType) > 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Collects the distinct sensitive words that occur in the text.
     *
     * @param txt       the text to scan; blank or {@code null} text yields an empty set
     * @param matchType matching rule: 1 = minimum match, 2 = maximum match
     * @return the set of matched substrings (empty when nothing matches)
     */
    public static Set<String> getSensitiveWord(String txt, int matchType) {
        Set<String> found = new HashSet<String>();
        if (StringUtils.isBlank(txt)) {
            return found;
        }
        int pos = 0;
        while (pos < txt.length()) {
            int hitLength = checkSensitiveWord(txt, pos, matchType);
            if (hitLength > 0) {
                found.add(txt.substring(pos, pos + hitLength));
                // Resume scanning right after the matched word.
                pos += hitLength;
            } else {
                pos++;
            }
        }
        return found;
    }

    /**
     * Masks the sensitive words in the text using the maximum-match rule and the
     * default replacement character.
     *
     * @param txt the text to sanitize
     * @return the masked text; blank or {@code null} input is returned unchanged
     */
    public static String replaceSensitiveWord(String txt) {
        return StringUtils.isBlank(txt) ? txt : replaceSensitiveWord(txt, maxMatchType, null);
    }

    /**
     * Masks every sensitive word found in the text.
     * <p>
     * Each matched word is replaced by {@code replaceChar} repeated once per
     * character of the word. The replacement is literal: neither the word nor
     * the mask is interpreted as a regular expression, so words containing
     * characters such as {@code . * + ( [} and masks containing {@code $} or
     * {@code \} are handled as plain text.
     *
     * @param txt         the text to sanitize
     * @param matchType   matching rule: 1 = minimum match, 2 = maximum match
     * @param replaceChar the mask to repeat for each character; defaults to
     *                    {@code "*"} when blank or {@code null}
     * @return the text with all matched words masked
     */
    public static String replaceSensitiveWord(String txt, int matchType, String replaceChar) {
        if (StringUtils.isBlank(replaceChar)) {
            replaceChar = "*";
        }
        String resultTxt = txt;
        // Collect all sensitive words first, then mask each one everywhere it occurs.
        for (String word : getSensitiveWord(txt, matchType)) {
            String mask = getReplaceChars(replaceChar, word.length());
            // String.replace is literal, unlike replaceAll which compiles a regex.
            resultTxt = resultTxt.replace(word, mask);
        }
        return resultTxt;
    }

    /**
     * Builds the mask string for a word by repeating the replacement text.
     *
     * @param replaceChar the text to repeat
     * @param length      how many times to repeat it; values of 1 or less return
     *                    {@code replaceChar} itself
     * @return {@code replaceChar} repeated {@code length} times
     */
    private static String getReplaceChars(String replaceChar, int length) {
        if (length <= 1) {
            return replaceChar;
        }
        // StringBuilder avoids creating a new String on every iteration.
        StringBuilder repeated = new StringBuilder();
        for (int i = 0; i < length; i++) {
            repeated.append(replaceChar);
        }
        return repeated.toString();
    }

    /**
     * Checks whether a sensitive word starts at the given index of the text.
     * <p>
     * The trie is walked character by character from {@code beginIndex}. Under
     * the minimum-match rule the walk stops at the first complete word; under
     * any other rule it continues in search of a longer word. The returned
     * length is always the length of the last <em>complete</em> word reached,
     * so characters that only form a prefix of a longer dictionary word are not
     * counted (e.g. with words "ab" and "abcd", the text "abcx" yields 2).
     *
     * @param txt        the text to inspect
     * @param beginIndex index in {@code txt} at which the candidate word starts
     * @param matchType  matching rule: 1 = minimum match, 2 = maximum match
     * @return the length of the sensitive word starting at {@code beginIndex},
     *         or 0 if there is none
     */
    @SuppressWarnings({"rawtypes"})
    public static int checkSensitiveWord(String txt, int beginIndex, int matchType) {
        int walked = 0;         // characters consumed along the current trie path
        int matchedLength = 0;  // value of 'walked' at the last end-of-word node
        Map node = sensitiveWordMap;
        if (node == null) {
            node = new HashMap();
        }
        for (int i = beginIndex; i < txt.length(); i++) {
            char word = txt.charAt(i);
            node = (Map) node.get(word);
            if (node == null) {
                // No trie edge for this character: the path ends here.
                break;
            }
            walked++;
            if (isEnd_1.equals(node.get(isEnd))) {
                // A complete word ends here; remember its length.
                matchedLength = walked;
                if (minMatchTYpe == matchType) {
                    // Minimum rule: the shortest word is enough.
                    break;
                }
            }
        }
        return matchedLength;
    }
    原文作者:Trie树
    原文地址: https://blog.csdn.net/wangjianhua_love/article/details/80529991
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞

发表评论

电子邮件地址不会被公开。 必填项已用*标注