@Service
public class SensitiveWordUtil extends TimerTask {
// Logger for this class.
private static final Log log = LogProxy.getLog(SensitiveWordUtil.class);
// Match-type code for the minimum-match rule: checkSensitiveWord stops at the first end-of-word node it reaches.
public static int minMatchTYpe = 1;// minimum-match rule: match the shortest of overlapping words
// Match-type code for the maximum-match rule: scanning continues past an end-of-word node looking for a longer word.
public static int maxMatchType = 2;// maximum-match rule: match the longest of overlapping words
// Key under which every trie node stores its end-of-word marker.
private static String isEnd = "isEnd";
private static String isEnd_0 = "0";// marker value: no word ends at this node
private static String isEnd_1 = "1";// marker value: a word ends at this node
private String configKey;// last configuration string processed by run(); compared to detect changes
@Resource
private ConfigService configService;
// Passed as the third argument to configService.getConfig in run() — presumably the fallback word list; confirm against ConfigService.
private String sensitiveWorld = "饿了";
// Root of the current trie: replaced as a whole by run() and read by checkSensitiveWord.
private static ConcurrentHashMap sensitiveWordMap = new ConcurrentHashMap();// used for local lookups
/**
 * Timer callback that (re)loads the sensitive-word dictionary.
 *
 * <p>Fetches the comma-separated word list from {@code configService}. If it is
 * identical to the list processed last time, nothing is rebuilt. Otherwise a new
 * trie is built from the words and the static {@code sensitiveWordMap} reference
 * is replaced with it in a single assignment.
 *
 * <p>A {@code null} configuration value leaves the current dictionary in place
 * instead of throwing. NOTE(review): this task is presumably scheduled on a
 * {@code java.util.Timer}, whose thread terminates on an uncaught exception from
 * a task — confirm how it is scheduled.
 */
@Override
public void run() {
    String keyWord = configService.getConfig(ConfigTypeEnum.PROPERTY.getType(), StoreConstants.ConfigKey.SENSITIVE_WORD, sensitiveWorld);
    if (keyWord == null) {
        // Nothing to load; keep whatever dictionary is currently published.
        if (log.isDebugEnabled()) {
            log.debug("=sensitiveWord= config value is null, keeping current dictionary");
        }
        return;
    }
    if (keyWord.equals(configKey)) {
        // Configuration unchanged since the last run: skip the rebuild.
        if (log.isDebugEnabled()) {
            log.debug("=sensitiveWord=" + configKey);
        }
        return;
    }
    String[] strs = keyWord.split(",");
    Set<String> keyWordSet = new HashSet<String>();
    for (String str : strs) {
        keyWordSet.add(str);
    }
    // Build the complete trie first, then publish it with one assignment.
    sensitiveWordMap = addSensitiveWordToHashMap(keyWordSet);
    // Remember the processed configuration only after the build succeeded, so a
    // failed build is retried on the next run instead of being skipped as "unchanged".
    configKey = keyWord;
    log.info("=sensitiveWord=" + JSON.toJSONString(sensitiveWordMap));
}
/**
 * Builds the trie (DFA model) for the given sensitive words.
 *
 * <p>Each node is a map from a {@code Character} to its child node, plus an
 * {@code "isEnd"} entry telling whether a word ends at that node, e.g.
 * {@code {a={isEnd=0, b={isEnd=1}}}} for the single word "ab". The whole
 * structure is built locally and returned so the caller can assign it in one step.
 *
 * @param keyWordSet the sensitive words
 * @return the root of the newly built trie
 */
private ConcurrentHashMap addSensitiveWordToHashMap(Set<String> keyWordSet) {
    // Size the root from the number of words to limit resizing.
    ConcurrentHashMap root = new ConcurrentHashMap(keyWordSet.size());
    for (String keyword : keyWordSet) {
        Map current = root;
        int lastIndex = keyword.length() - 1;
        for (int pos = 0; pos <= lastIndex; pos++) {
            char ch = keyword.charAt(pos);
            Map child = (Map) current.get(ch);
            if (child == null) {
                // First time this character is seen on this path: create its node,
                // initially marked as "not the end of a word".
                child = new HashMap<String, String>();
                child.put(isEnd, isEnd_0);
                current.put(ch, child);
            }
            current = child;
            if (pos == lastIndex) {
                // Last character of the keyword: a word ends at this node.
                current.put(isEnd, isEnd_1);
            }
        }
    }
    return root;
}
/**
 * Tells whether the text contains at least one sensitive word.
 *
 * <p>Tries a match starting at every position of the text and stops at the first hit.
 *
 * @param txt       the text to check; blank or {@code null} text yields {@code false}
 * @param matchType matching rule: 1 = minimum match, 2 = maximum match
 * @return {@code true} if a sensitive word is found, {@code false} otherwise
 */
public static boolean isContaintSensitiveWord(String txt, int matchType) {
    if (StringUtils.isBlank(txt)) {
        return false;
    }
    for (int i = 0; i < txt.length(); i++) {
        // A positive length means a word starts at position i; no need to scan further.
        if (checkSensitiveWord(txt, i, matchType) > 0) {
            return true;
        }
    }
    return false;
}
/**
 * Collects the sensitive words that occur in the text.
 *
 * @param txt       the text to scan; blank or {@code null} text yields an empty set
 * @param matchType matching rule: 1 = minimum match, 2 = maximum match
 * @return the distinct matched substrings, empty when nothing matches
 */
public static Set<String> getSensitiveWord(String txt, int matchType) {
    Set<String> found = new HashSet<String>();
    if (StringUtils.isBlank(txt)) {
        return found;
    }
    int pos = 0;
    while (pos < txt.length()) {
        int hitLength = checkSensitiveWord(txt, pos, matchType);
        if (hitLength > 0) {
            found.add(txt.substring(pos, pos + hitLength));
            // Resume scanning right after the matched span.
            pos += hitLength;
        } else {
            pos++;
        }
    }
    return found;
}
/**
 * Masks every sensitive word in the text using the maximum-match rule and the
 * default mask character {@code *}.
 *
 * @param txt the text to mask; blank or {@code null} text is returned unchanged
 * @return the text with sensitive words replaced
 */
public static String replaceSensitiveWord(String txt) {
    return StringUtils.isBlank(txt) ? txt : replaceSensitiveWord(txt, maxMatchType, null);
}
/**
 * Masks every sensitive word in the text.
 *
 * <p>Each matched word is replaced by the mask string repeated once per character
 * of the word. Replacement is literal: neither the sensitive word nor the mask is
 * interpreted as a regular expression, so words such as {@code c++} or masks
 * containing {@code $} are handled as plain text.
 *
 * @param txt         the text to mask
 * @param matchType   matching rule: 1 = minimum match, 2 = maximum match
 * @param replaceChar the mask string; {@code *} is used when blank or {@code null}
 * @return the text with sensitive words replaced
 */
public static String replaceSensitiveWord(String txt, int matchType, String replaceChar) {
    if (StringUtils.isBlank(replaceChar)) {
        replaceChar = "*";
    }
    String resultTxt = txt;
    for (String word : getSensitiveWord(txt, matchType)) {
        String mask = getReplaceChars(replaceChar, word.length());
        // String.replace works on literal character sequences, unlike replaceAll.
        resultTxt = resultTxt.replace(word, mask);
    }
    return resultTxt;
}
/**
 * Builds the mask for a word by repeating the mask string.
 *
 * @param replaceChar the mask string to repeat
 * @param length      number of repetitions; values below 1 are treated as 1
 * @return {@code replaceChar} repeated {@code length} times (at least once)
 */
private static String getReplaceChars(String replaceChar, int length) {
    if (length <= 1) {
        // A single copy is always returned, even for a non-positive length.
        return replaceChar;
    }
    StringBuilder mask = new StringBuilder();
    for (int i = 0; i < length; i++) {
        mask.append(replaceChar);
    }
    return mask.toString();
}
/**
 * Checks whether a sensitive word starts at the given position of the text.
 *
 * <p>Walks the trie character by character from {@code beginIndex}. Under the
 * minimum-match rule the walk stops at the first node where a word ends; under
 * any other rule it keeps going and remembers the longest word seen. The returned
 * length always ends on an end-of-word node, so characters that only matched a
 * prefix of some longer word are not counted.
 *
 * @param txt        the text to check; {@code null} yields 0
 * @param beginIndex index in {@code txt} at which the match must start
 * @param matchType  matching rule: 1 = minimum match, 2 = maximum match
 * @return the length of the sensitive word starting at {@code beginIndex}, or 0 if there is none
 */
@SuppressWarnings({"rawtypes"})
public static int checkSensitiveWord(String txt, int beginIndex, int matchType) {
    if (txt == null) {
        return 0;
    }
    // Read the static reference once so the whole walk uses the same trie.
    Map nowMap = sensitiveWordMap;
    if (nowMap == null) {
        return 0;
    }
    int scanned = 0;       // characters matched along the current trie path
    int matchedLength = 0; // value of 'scanned' at the last end-of-word node
    for (int i = beginIndex; i < txt.length(); i++) {
        nowMap = (Map) nowMap.get(txt.charAt(i));
        if (nowMap == null) {
            // No continuation in the trie for this character.
            break;
        }
        scanned++;
        if (isEnd_1.equals(nowMap.get(isEnd))) {
            matchedLength = scanned;
            if (minMatchTYpe == matchType) {
                // Minimum rule: the shortest word is enough.
                break;
            }
        }
    }
    return matchedLength;
}
trie树(字典树) DFA算法
原文作者:Trie树
原文地址: https://blog.csdn.net/wangjianhua_love/article/details/80529991
本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
原文地址: https://blog.csdn.net/wangjianhua_love/article/details/80529991
本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。