簡單關鍵詞匹配算法

2024年2月17日 120次阅读
本文針對微博的短篇博文，編寫了一套簡單的分詞和關鍵詞匹配算法。相對於面向整篇文檔的複雜分詞算法，它能夠在效率和可用性上得到較好的平衡。
package com.sina.tblog.sentiment;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import com.sina.tblog.sentiment.constant.Constant;

/**
 * Dictionary-based keyword matcher for short texts.
 *
 * <p>The dictionary is loaded once, in the static initializer, from the files
 * listed in {@code Constant.KeyWordsFiles}. Words can be added or removed at
 * runtime; those changes are also persisted to {@code Constant.newWordsFile}.
 *
 * <p>Dictionary files are read and written with the platform default charset
 * (no charset is passed to {@code FileReader} / {@code OutputStreamWriter}).
 * NOTE(review): confirm the dictionary files use that encoding on every host.
 *
 * <p>The class keeps its state in public static mutable sets and performs no
 * synchronization.
 */
public class KeyWordFilter {

	/** Every keyword exactly as written in the dictionary files (case-sensitive lookup). */
	public static HashSet<String> KeyWordsList = null;

	/**
	 * Upper-cased form of every keyword that contains at least one ASCII
	 * letter; consulted only for case-insensitive lookups.
	 */
	public static HashSet<String> letterKeyWordsList = null;

	/** Longest word accepted by {@link #addWord(String)} and {@link #deleteNewWord(String)}. */
	private static final int MAX_WORD_LENGTH = 10;

	/**
	 * Matches any ASCII letter. Must be declared before the static initializer
	 * below, because that initializer already uses it while loading the
	 * dictionary.
	 */
	private static final Pattern ASCII_LETTER = Pattern.compile("[a-zA-Z]");

	// Load the keyword lists once when the class is initialized. A failure is
	// reported on stderr; the sets then hold whatever was read before the error.
	static {
		try {
			initKeyWords(Constant.KeyWordsFiles);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Removes a word from the in-memory dictionary and from the new-words file.
	 *
	 * @param word the word to remove
	 * @return {@code -1} if {@code word} is {@code null}, shorter than 2 or
	 *         longer than 10 characters, or if updating the new-words file
	 *         failed (the in-memory removal has already happened in that
	 *         case); {@code 0} if the word is not in the dictionary;
	 *         {@code 1} on success
	 */
	public static int deleteNewWord(String word) {
		if (word == null || word.length() > MAX_WORD_LENGTH || word.length() < 2) {
			return -1;
		}
		if (!KeyWordsList.contains(word)) {
			return 0;
		}
		unregister(word);
		try {
			removeFromNewWordsFile(word);
		} catch (IOException e) {
			e.printStackTrace();
			return -1;
		}
		return 1;
	}

	/**
	 * Adds a word to the in-memory dictionary and appends it to the new-words file.
	 *
	 * @param word the word to add
	 * @return {@code -1} if {@code word} is {@code null}, empty or longer than
	 *         10 characters, or if writing the new-words file failed (the word
	 *         has already been added in memory in that case); {@code 0} if the
	 *         word is already in the dictionary; {@code 1} on success
	 */
	public static int addWord(String word) {
		if (word == null || word.length() == 0 || word.length() > MAX_WORD_LENGTH) {
			return -1;
		}
		if (KeyWordsList.contains(word)) {
			return 0;
		}
		register(word);
		try {
			appendToNewWordsFile(word);
		} catch (IOException e) {
			e.printStackTrace();
			return -1;
		}
		return 1;
	}

	/** Puts a word into both lookup sets (the upper-cased set only if it contains an ASCII letter). */
	private static void register(String word) {
		KeyWordsList.add(word);
		if (ASCII_LETTER.matcher(word).find()) {
			letterKeyWordsList.add(word.toUpperCase(Locale.ROOT));
		}
	}

	/**
	 * Removes a word from the exact set, and drops its upper-cased entry only
	 * when no remaining keyword still maps to that same upper-cased form.
	 */
	private static void unregister(String word) {
		KeyWordsList.remove(word);
		if (!ASCII_LETTER.matcher(word).find()) {
			return;
		}
		String upper = word.toUpperCase(Locale.ROOT);
		for (String other : KeyWordsList) {
			if (other.toUpperCase(Locale.ROOT).equals(upper)) {
				// Another spelling (e.g. "abc" vs "ABC") still needs the entry.
				return;
			}
		}
		letterKeyWordsList.remove(upper);
	}

	/** Appends {@code "\n" + word} to the new-words file, always closing the writer. */
	private static void appendToNewWordsFile(String word) throws IOException {
		OutputStreamWriter writer =
				new OutputStreamWriter(new FileOutputStream(Constant.newWordsFile, true));
		try {
			writer.write("\n" + word);
		} finally {
			writer.close();
		}
	}

	/**
	 * Rewrites the new-words file without the lines equal to {@code word}.
	 * Does nothing when the file cannot be opened for reading (e.g. it does not
	 * exist yet) or when the word does not occur in it.
	 */
	private static void removeFromNewWordsFile(String word) throws IOException {
		BufferedReader reader;
		try {
			reader = new BufferedReader(new FileReader(Constant.newWordsFile));
		} catch (FileNotFoundException missing) {
			// Nothing has been persisted, so there is nothing to remove.
			return;
		}

		List<String> kept = new ArrayList<String>();
		boolean found = false;
		try {
			for (String line = reader.readLine(); line != null; line = reader.readLine()) {
				if (line.equals(word)) {
					found = true;
				} else {
					kept.add(line);
				}
			}
		} finally {
			reader.close();
		}
		if (!found) {
			// The word came from another dictionary file; leave this file untouched.
			return;
		}

		OutputStreamWriter writer =
				new OutputStreamWriter(new FileOutputStream(Constant.newWordsFile, false));
		try {
			// No trailing newline: addWord appends "\n" + word, so the next
			// append starts on a fresh line without leaving a blank one.
			for (int i = 0; i < kept.size(); i++) {
				if (i > 0) {
					writer.write("\n");
				}
				writer.write(kept.get(i));
			}
		} finally {
			writer.close();
		}
	}

	/**
	 * Initializes or reloads the keyword sets from the given files, one keyword
	 * per line. Existing sets are cleared and reused; blank lines are skipped.
	 *
	 * @param Files paths of the dictionary files; {@code null} leaves the sets empty
	 * @throws IOException if a file cannot be opened or read
	 */
	private static void initKeyWords(String Files[]) throws IOException {
		if (KeyWordsList != null) {
			KeyWordsList.clear();
		} else {
			KeyWordsList = new HashSet<String>();
		}

		if (letterKeyWordsList != null) {
			letterKeyWordsList.clear();
		} else {
			letterKeyWordsList = new HashSet<String>();
		}

		if (Files == null) {
			return;
		}
		for (String path : Files) {
			BufferedReader reader = new BufferedReader(new FileReader(new File(path)));
			try {
				for (String line = reader.readLine(); line != null; line = reader.readLine()) {
					if (line.length() > 0) {
						register(line);
					}
				}
			} finally {
				reader.close();
			}
		}
	}

	/**
	 * Looks a candidate term up in the dictionary.
	 *
	 * @param str        the candidate term
	 * @param ignoreCase when {@code true}, a term that has no exact match is
	 *                   also compared, upper-cased, against the upper-cased set
	 * @return {@code true} if the term is a keyword
	 */
	private static boolean findWord(String str, boolean ignoreCase) {
		if (KeyWordsList.contains(str)) {
			return true;
		}
		return ignoreCase && letterKeyWordsList.contains(str.toUpperCase(Locale.ROOT));
	}

	/**
	 * Segments a string by forward longest match: at each position the longest
	 * keyword of {@code Constant.min_len}..{@code Constant.max_len} characters
	 * is taken and the scan continues after it.
	 *
	 * @param str_line   the text to scan; {@code null} yields an empty list
	 * @param ignoreCase whether ASCII letter case is ignored when matching
	 * @return the matched terms (as they appear in the text), in order of occurrence
	 */
	public static List<String> segmentStrQuickMatch(String str_line, boolean ignoreCase) {
		List<String> list = new ArrayList<String>();
		if (str_line == null) {
			return list;
		}

		int size = str_line.length();
		// A zero-length candidate could never advance the scan, so floor at 1.
		int minLen = Math.max(Constant.min_len, 1);

		int left = 0;
		while (left < size) {
			int matched = 0;
			// Try each candidate word, longest first, never reaching past the end.
			for (int len = Math.min(Constant.max_len, size - left); len >= minLen; len--) {
				String term = str_line.substring(left, left + len);
				if (findWord(term, ignoreCase)) {
					list.add(term);
					matched = len;
					break;
				}
			}
			// No keyword starts here: advance by one character, so that words
			// mixing Chinese and English can still be found. Invalid characters
			// are not filtered out (known limitation). Otherwise skip the
			// matched keyword.
			left += matched > 0 ? matched : 1;
		}
		return list;
	}

	/**
	 * Finds every keyword occurrence in a string, including overlapping and
	 * nested ones: each start position is tested against every length from
	 * {@code Constant.max_len} down to {@code Constant.min_len}.
	 *
	 * @param str_line   the text to scan; {@code null} yields an empty list
	 * @param ignoreCase whether ASCII letter case is ignored when matching
	 * @return all matched terms, ordered by start position and, within one
	 *         position, longest first
	 */
	public static List<String> segmentStrFullMatch(String str_line, boolean ignoreCase) {
		List<String> list = new ArrayList<String>();
		if (str_line == null) {
			return list;
		}

		int size = str_line.length();
		int minLen = Math.max(Constant.min_len, 1);

		for (int left = 0; left < size; left++) {
			// Try each candidate word, longest first, never reaching past the end.
			for (int len = Math.min(Constant.max_len, size - left); len >= minLen; len--) {
				String term = str_line.substring(left, left + len);
				if (findWord(term, ignoreCase)) {
					list.add(term);
				}
			}
		}
		return list;
	}

	/** Small manual check: prints every keyword found in a sample sentence. */
	public static void main(String[] args) throws IOException {
		System.out.println(segmentStrFullMatch("中華人民共和國",true));
	}

}