// 針對微博的短篇博文,編寫的簡單分詞和匹配算法。相對於一篇文檔的複雜分詞算法,能夠在效率和可用性上得到較好的平衡。
package com.sina.tblog.sentiment;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import com.sina.tblog.sentiment.constant.Constant;
/**
 * Dictionary-based keyword matcher for short texts.
 *
 * <p>Keywords are loaded once, when the class is initialised, from the files
 * listed in {@code Constant.KeyWordsFiles}. Two segmentation strategies are
 * offered: {@link #segmentStrQuickMatch} (longest match, non-overlapping) and
 * {@link #segmentStrFullMatch} (every dictionary hit at every position).
 *
 * <p>The keyword sets are plain {@link HashSet}s held in public static fields
 * and no synchronisation is performed here; callers that add or delete words
 * while other threads are segmenting must coordinate access themselves.
 */
public class KeyWordFilter {
    /** Longest word accepted by {@link #addWord} and {@link #deleteNewWord}. */
    private static final int MAX_WORD_LENGTH = 10;

    /** Shortest word accepted by {@link #deleteNewWord}. */
    private static final int MIN_DELETABLE_LENGTH = 2;

    /**
     * Matches any ASCII letter. Declared before the static initialiser on
     * purpose: static initialisers run in textual order and the keyword
     * loader below already needs this pattern.
     */
    private static final Pattern ASCII_LETTER = Pattern.compile("[a-zA-Z]");

    /** All keywords exactly as they appear in the dictionary files. */
    public static HashSet<String> KeyWordsList = null;

    /** Upper-cased form of every keyword that contains an ASCII letter. */
    public static HashSet<String> letterKeyWordsList = null;

    // Load the dictionaries when the class is first used. A failure is only
    // reported, not rethrown, so the class stays usable with whatever was
    // read before the error occurred.
    static {
        try {
            initKeyWords(Constant.KeyWordsFiles);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Removes a word from the in-memory dictionary and from the new-words file.
     *
     * <p>Only {@code Constant.newWordsFile} is rewritten; a word that is also
     * listed in another dictionary file will be loaded again on the next
     * start. The in-memory removal is not rolled back when the file update
     * fails.
     *
     * @param word the word to delete
     * @return {@code 1} if the word was removed, {@code 0} if it was not in
     *         the dictionary, {@code -1} if the word is {@code null}, shorter
     *         than 2 or longer than 10 characters, or the file could not be
     *         updated
     */
    public static int deleteNewWord(String word) {
        if (word == null
                || word.length() > MAX_WORD_LENGTH
                || word.length() < MIN_DELETABLE_LENGTH) {
            return -1;
        }
        if (!KeyWordsList.remove(word)) {
            return 0;
        }
        if (containsLetter(word)) {
            String upperForm = toUpper(word);
            // Another keyword differing only in case may still rely on this
            // entry for case-insensitive lookups.
            if (!isUpperFormStillNeeded(upperForm)) {
                letterKeyWordsList.remove(upperForm);
            }
        }
        try {
            removeFromNewWordsFile(word);
        } catch (IOException e) {
            e.printStackTrace();
            return -1;
        }
        return 1;
    }

    /**
     * Adds a word to the in-memory dictionary and appends it to the new-words
     * file.
     *
     * <p>The in-memory addition is not rolled back when the file update fails.
     *
     * @param word the word to add
     * @return {@code 1} if the word was added, {@code 0} if it was already
     *         known, {@code -1} if the word is {@code null}, empty, longer
     *         than 10 characters, contains a line break, or the file could not
     *         be written
     */
    public static int addWord(String word) {
        if (word == null || word.length() == 0 || word.length() > MAX_WORD_LENGTH) {
            return -1;
        }
        // The file stores one word per line, so a line break inside the word
        // would be read back as two different words.
        if (word.indexOf('\n') >= 0 || word.indexOf('\r') >= 0) {
            return -1;
        }
        if (KeyWordsList.contains(word)) {
            return 0;
        }
        KeyWordsList.add(word);
        if (containsLetter(word)) {
            letterKeyWordsList.add(toUpper(word));
        }
        try {
            appendToNewWordsFile(word);
        } catch (IOException e) {
            e.printStackTrace();
            return -1;
        }
        return 1;
    }

    /**
     * Initialises or reloads both keyword sets from the given files.
     *
     * <p>Each non-empty line is one keyword. Files are read with the platform
     * default charset.
     *
     * @param Files paths of the dictionary files
     * @throws IOException if a file cannot be opened or read; keywords read
     *         before the failure stay loaded
     */
    private static void initKeyWords(String Files[]) throws IOException {
        if (KeyWordsList != null) {
            KeyWordsList.clear();
        } else {
            KeyWordsList = new HashSet<String>();
        }
        if (letterKeyWordsList != null) {
            letterKeyWordsList.clear();
        } else {
            letterKeyWordsList = new HashSet<String>();
        }
        for (int i = 0; i < Files.length; i++) {
            BufferedReader reader = new BufferedReader(new FileReader(Files[i]));
            try {
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    if (line.length() == 0) {
                        // addWord() starts every entry with "\n", so blank
                        // lines are expected; they are not keywords.
                        continue;
                    }
                    KeyWordsList.add(line);
                    if (containsLetter(line)) {
                        letterKeyWordsList.add(toUpper(line));
                    }
                }
            } finally {
                reader.close();
            }
        }
    }

    /**
     * Looks a candidate term up in the dictionary.
     *
     * @param str the candidate term
     * @param ignoreCase whether a match that differs only in letter case counts
     * @return {@code true} if the term is a keyword
     */
    private static boolean findWord(String str, boolean ignoreCase) {
        if (KeyWordsList.contains(str)) {
            return true;
        }
        return ignoreCase && letterKeyWordsList.contains(toUpper(str));
    }

    /**
     * Segments a line with forward maximum matching: at each position the
     * longest dictionary term is taken and scanning resumes right after it;
     * when nothing matches, scanning moves on by one character.
     *
     * @param str_line the text to segment; {@code null} yields an empty list
     * @param ignoreCase whether letter case is ignored when matching
     * @return the matched terms in order of appearance
     */
    public static List<String> segmentStrQuickMatch(String str_line, boolean ignoreCase) {
        return segment(str_line, ignoreCase, true);
    }

    /**
     * Collects every dictionary term found in a line: at each position all
     * candidate lengths are tried, longest first, and every hit is reported,
     * so results may overlap.
     *
     * @param str_line the text to segment; {@code null} yields an empty list
     * @param ignoreCase whether letter case is ignored when matching
     * @return the matched terms ordered by start position, longest first
     */
    public static List<String> segmentStrFullMatch(String str_line, boolean ignoreCase) {
        return segment(str_line, ignoreCase, false);
    }

    /** Prints the full-match segmentation of a sample sentence. */
    public static void main(String[] args) throws IOException {
        System.out.println(segmentStrFullMatch("中華人民共和國", true));
    }

    /**
     * Shared scanning loop of both segmentation strategies. Candidate lengths
     * run from {@code Constant.max_len} down to {@code Constant.min_len}.
     *
     * @param longestOnly {@code true} to stop at the first (longest) hit of a
     *        position and skip past it, {@code false} to report every hit
     */
    private static List<String> segment(String text, boolean ignoreCase, boolean longestOnly) {
        List<String> terms = new ArrayList<String>();
        if (text == null) {
            return terms;
        }
        final int size = text.length();
        // An empty term can never be a segment, and accepting one would stop
        // the scan from advancing.
        final int minLen = Math.max(1, Constant.min_len);
        int left = 0;
        while (left < size) {
            int step = 1;
            // Clamp the window to the remaining text before checking the
            // minimum length, so that no term shorter than minLen is looked
            // up at the tail of the string.
            for (int len = Math.min(Constant.max_len, size - left); len >= minLen; len--) {
                String term = text.substring(left, left + len);
                if (!findWord(term, ignoreCase)) {
                    continue;
                }
                terms.add(term);
                if (longestOnly) {
                    step = len;
                    break;
                }
            }
            left += step;
        }
        return terms;
    }

    /** Tells whether the word contains at least one ASCII letter. */
    private static boolean containsLetter(String word) {
        return ASCII_LETTER.matcher(word).find();
    }

    /** Upper-cases independently of the default locale so lookups are stable. */
    private static String toUpper(String word) {
        return word.toUpperCase(Locale.ROOT);
    }

    /** Tells whether any remaining keyword upper-cases to the given form. */
    private static boolean isUpperFormStillNeeded(String upperForm) {
        for (String keyword : KeyWordsList) {
            if (upperForm.equals(toUpper(keyword))) {
                return true;
            }
        }
        return false;
    }

    /** Appends one word, preceded by a line break, to the new-words file. */
    private static void appendToNewWordsFile(String word) throws IOException {
        OutputStreamWriter writer =
                new OutputStreamWriter(new FileOutputStream(Constant.newWordsFile, true));
        try {
            writer.write("\n" + word);
        } finally {
            writer.close();
        }
    }

    /**
     * Rewrites the new-words file without the lines equal to the given word.
     * The file is left untouched when it does not exist or does not contain
     * the word. The rewrite happens in place and is not atomic.
     */
    private static void removeFromNewWordsFile(String word) throws IOException {
        BufferedReader reader;
        try {
            reader = new BufferedReader(new FileReader(Constant.newWordsFile));
        } catch (FileNotFoundException e) {
            // Nothing has been persisted yet, so there is nothing to remove.
            return;
        }
        List<String> kept = new ArrayList<String>();
        boolean found = false;
        try {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (line.equals(word)) {
                    found = true;
                } else {
                    kept.add(line);
                }
            }
        } finally {
            reader.close();
        }
        if (!found) {
            return;
        }
        OutputStreamWriter writer =
                new OutputStreamWriter(new FileOutputStream(Constant.newWordsFile, false));
        try {
            // No trailing line break: addWord() supplies one before each entry.
            for (int i = 0; i < kept.size(); i++) {
                if (i > 0) {
                    writer.write("\n");
                }
                writer.write(kept.get(i));
            }
        } finally {
            writer.close();
        }
    }
}