Trie树学习
标签(空格分隔): Trie树
文章目录
定义
Trie树又称字典树、单词搜索树或键树,是一种树形结构,常用于统计、排序和检索大量的字符串。
三个基本性质
- 根节点不包含字符,每条边代表一个字符
- 从根节点到某一节点,路径上经过的字符依次连接起来,就是该节点对应的字符串
- 每个节点的所有子节点所包含的字符互不相同
Java实现
import java.util.HashMap;
import java.util.Map;

/**
 * Minimal prefix-counting trie keyed by {@code char}.
 *
 * <p>Every node keeps a counter of how many {@link #insert} calls have walked
 * through it, so {@link #search} answers "how many inserted strings start with
 * this prefix".
 *
 * author:lanpeng
 * Date:2018/10/6
 * Time:16:15
 */
public class TrieTree {

    /** Demo: builds a trie from a few strings and prints the prefix count of each query. */
    public static void main(String[] args) {
        String[] strs = {"我啊", "我打", "b", "我打你", "lisfa", "asfkjlsf"};
        String[] words = {"我", "我打", "asf", "ac"};
        TrieTree trieTree = new TrieTree();
        TrieNode root = new TrieNode();
        for (String a : strs) {
            trieTree.insert(root, a);
        }
        for (String s : words) {
            System.out.println(s + ":" + trieTree.search(root, s));
        }
    }

    /**
     * Inserts a string below {@code node}, one trie level per {@code char}.
     *
     * <p>A missing child is created (its counter starts at 1); an existing
     * child has its counter incremented.
     *
     * @param node node to start from, normally the trie root
     * @param str  string to insert; an empty string leaves the trie unchanged
     */
    public void insert(TrieNode node, String str) {
        for (int i = 0; i < str.length(); ++i) {
            char c = str.charAt(i);
            // One lookup per character; the char key is autoboxed to Character.
            TrieNode child = node.children.get(c);
            if (child == null) {
                child = new TrieNode();
                node.children.put(c, child);
            } else {
                child.nCount++;
            }
            node = child;
        }
    }

    /**
     * Looks up a string below {@code node}.
     *
     * @param node node to start from, normally the trie root
     * @param str  prefix to look up
     * @return the counter of the node reached by following {@code str}, i.e.
     *         the number of insertions that passed through it, or 0 when the
     *         path does not exist; for an empty {@code str} this is the
     *         counter of {@code node} itself
     */
    public int search(TrieNode node, String str) {
        for (int i = 0; i < str.length(); ++i) {
            node = node.children.get(str.charAt(i));
            if (node == null) {
                return 0;
            }
        }
        return node.nCount;
    }

    /**
     * One trie node. Public so that code outside {@code TrieTree} can create
     * the root that the public {@code insert}/{@code search} methods require.
     */
    public static class TrieNode {
        /** Number of insertions that have passed through this node. */
        int nCount;
        /** Child nodes, keyed by the character on the connecting edge. */
        final Map<Character, TrieNode> children = new HashMap<>();

        public TrieNode() {
            nCount = 1;
        }
    }
}
package com.csair.etm.quartz.utils;
import org.apache.commons.lang.CharUtils;
import org.apache.commons.lang.StringUtils;
import java.util.HashMap;
import java.util.Map;
/**
 * Character trie used for prefix counting ({@link #insert}/{@link #search})
 * and for masking sensitive words in text ({@link #addWord}/{@link #filter}).
 *
 * author:lanpeng
 * Date:2018/10/6
 * Time:16:15
 */
public class TrieTree {

    /** Text substituted for every sensitive word that {@link #filter} finds. */
    private static final String REPLACEMENT = "***";

    /** Demo: registers a few sensitive words, then prints a filtered sample and one symbol check. */
    public static void main(String[] args) {
        String[] strs = {"我啊", "我打", "b", "我打你", "lisfa", "asfkjlsf"};
        String s = "aaaab啊asfkjlsf噩噩噩噩";
        TrieTree trieTree = new TrieTree();
        TrieNode root = new TrieNode();
        for (String a : strs) {
            trieTree.addWord(root, a);
        }
        System.out.println(trieTree.filter(root, s));
        System.out.println(trieTree.isSymbol(s.charAt(4)));
    }

    /**
     * Inserts a string below {@code node} for prefix counting. Does not mark
     * the last node as a word end, so {@link #filter} will not match it.
     *
     * @param node node to start from, normally the trie root
     * @param str  string to insert; an empty string leaves the trie unchanged
     */
    public void insert(TrieNode node, String str) {
        descend(node, str);
    }

    /**
     * Looks up a string below {@code node}.
     *
     * @param node node to start from, normally the trie root
     * @param str  prefix to look up
     * @return the counter of the node reached by following {@code str}, or 0
     *         when the path does not exist; for an empty {@code str} this is
     *         the counter of {@code node} itself
     */
    public int search(TrieNode node, String str) {
        for (int i = 0; i < str.length(); ++i) {
            node = node.children.get(str.charAt(i));
            if (node == null) {
                return 0;
            }
        }
        return node.nCount;
    }

    /**
     * Registers a sensitive word: inserts it like {@link #insert} and flags
     * the node of its last character as a word end.
     *
     * @param node node to start from, normally the trie root
     * @param str  word to register; an empty string leaves the trie unchanged
     */
    public void addWord(TrieNode node, String str) {
        TrieNode last = descend(node, str);
        // Never flag the start node itself: an empty word must stay a no-op.
        if (!str.isEmpty()) {
            last.end = true;
        }
    }

    /**
     * Walks {@code str} down from {@code node}, creating missing children
     * (counter starts at 1) and incrementing the counter of existing ones.
     *
     * @return the node reached after the last character, or {@code node}
     *         itself when {@code str} is empty
     */
    private TrieNode descend(TrieNode node, String str) {
        for (int i = 0; i < str.length(); ++i) {
            char c = str.charAt(i);
            // One lookup per character; the char key is autoboxed to Character.
            TrieNode child = node.children.get(c);
            if (child == null) {
                child = new TrieNode();
                node.children.put(c, child);
            } else {
                child.nCount++;
            }
            node = child;
        }
        return node;
    }

    /**
     * Returns {@code true} when {@code c} is neither an ASCII letter or digit
     * nor inside U+2E80..U+9FFF, the range this code treats as East Asian script.
     */
    private boolean isSymbol(char c) {
        return !CharUtils.isAsciiAlphanumeric(c) && (c < 0x2E80 || c > 0x9FFF);
    }

    /**
     * Replaces every sensitive word found in {@code text} with {@code "***"}.
     *
     * <p>The scan is left to right and stops at the first node flagged as a
     * word end, so when one registered word is a prefix of another, the
     * shorter one is what gets masked.
     *
     * @param rootNode root of a trie populated through {@link #addWord}
     * @param text     text to filter
     * @return the filtered text; {@code text} itself when it is null, empty
     *         or whitespace-only
     */
    public String filter(TrieNode rootNode, String text) {
        if (StringUtils.isBlank(text)) {
            return text;
        }
        StringBuilder result = new StringBuilder();
        TrieNode current = rootNode;
        // begin: index where the candidate word currently being tested starts.
        int begin = 0;
        // position: index of the character being compared; runs ahead of begin
        // during a match and falls back to begin + 1 on a mismatch.
        int position = 0;
        while (begin < text.length()) {
            // Running off the end of the text mid-match counts as a mismatch, so
            // the tail is re-scanned from begin + 1: a shorter word may still
            // start inside it.
            TrieNode next = position < text.length()
                    ? current.children.get(text.charAt(position))
                    : null;
            if (next == null) {
                // No registered word starts at begin: keep that character.
                result.append(text.charAt(begin));
                begin++;
                position = begin;
                current = rootNode;
            } else if (next.end) {
                // text[begin..position] is a registered word: mask it.
                result.append(REPLACEMENT);
                begin = position + 1;
                position = begin;
                current = rootNode;
            } else {
                current = next;
                position++;
            }
        }
        return result.toString();
    }

    /**
     * One trie node. Public so that code outside {@code TrieTree} can create
     * the root that the public methods require.
     */
    public static class TrieNode {
        /** Number of insertions that have passed through this node. */
        int nCount;
        /** True when a word registered through {@code addWord} ends here. */
        boolean end = false;
        /** Child nodes, keyed by the character on the connecting edge. */
        final Map<Character, TrieNode> children = new HashMap<>();

        public TrieNode() {
            nCount = 1;
        }
    }
}