抽取参考文献中的标题

工作中需要对 Word 文档中参考文献的标题进行提取,做个笔记记录一下。

思想比较简单:通过正则表达式匹配

用到的正则表达式: public static Pattern titlePattern = Pattern.compile("(?<=“|\").+?(?=”|\")|(?<=“|\"|‘).+?(?=”|\"|’)");

/**
 * Extracts quoted titles from the paragraphs of a Word file.
 *
 * Each paragraph returned by {@code readWord} is matched against the shared
 * {@code titlePattern}; only the FIRST match in a paragraph is collected.
 *
 * @param file path of the Word file to read
 * @return the matched titles in paragraph order; an empty list when the file
 *         yields no paragraphs or no paragraph contains a match
 */
public static List<String> exactTitle(String file){
	List<String> result = new ArrayList<String>();
	String[] paraText = readWord(file);
	// readWord leaves its result null when the file name has no extension;
	// treat that as "nothing to extract" instead of failing with an NPE.
	if (paraText == null) {
		return result;
	}
	for (String text : paraText) {
		// Skip blank paragraphs; the pattern needs a quote character to match,
		// so this check cannot change which titles are found.
		if (text == null || text.trim().isEmpty()) {
			continue;
		}
		Matcher m = titlePattern.matcher(text);
		// Deliberately 'if', not 'while': one title per paragraph.
		if (m.find()) {
			result.add(m.group());
		}
	}
	return result;
}

/**
 * Opens a Word file and returns its text split into paragraphs.
 *
 * The reader is chosen from the file extension: {@code .doc} (case-insensitive)
 * goes to {@code readWordDoc}; any other extension goes to {@code readWordDocx}.
 *
 * @param file path of the file to read
 * @return the paragraph texts, or {@code null} when the name contains no '.'
 *         (a message is printed to standard output in that case)
 * @throws IllegalArgumentException if opening or reading the file raises an
 *         {@link IOException}; the original exception is attached as the cause
 */
private static String[] readWord(String file) {
		String[] content = null;
		InputStream is = null;
		try {
			is = new FileInputStream(file);
			// Use the LAST dot so names such as "a.b.doc" or "./x.doc" resolve
			// to their real extension rather than everything after the first dot.
			int index = file.lastIndexOf('.');
			if (index != -1) {
				String endWith = file.substring(index);
				if (endWith.equalsIgnoreCase(".doc")) {
					content = readWordDoc(is);
				} else {
					content = readWordDocx(is);
				}
			} else {
				System.out.println("文件格式不是word格式!");
			}
		} catch (IOException e) {
			// Keep the cause so callers can see what actually went wrong.
			throw new IllegalArgumentException(file + "文件不存在", e);
		} finally {
			// Always release the file handle, on success and on failure alike.
			if (is != null) {
				try {
					is.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if close fails; the content
					// (or the primary exception) is already determined.
				}
			}
		}
		return content;
	}

//下面是利用POI读取word

/**
	 * Reads a .docx stream with XWPF and returns its text split into lines.
	 *
	 * The extractor's full text is split on "\r\n" or "\n".
	 *
	 * @param is input stream supplying the .docx document
	 * @return the pieces of the extracted text between line breaks
	 * @throws IOException propagated from the POI calls that read the stream
	 */
	private static String[] readWordDocx(InputStream is) throws IOException {
		XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(is));
		return extractor.getText().split("\r\n|\n");
	}

	/**
	 * Reads a legacy .doc stream with HWPF and returns its text paragraph by
	 * paragraph.
	 *
	 * @param is input stream supplying the .doc document
	 * @return the array returned by the extractor's {@code getParagraphText()}
	 * @throws IOException propagated from the POI calls that read the stream
	 */
	public static String[] readWordDoc(InputStream is) throws IOException {
		WordExtractor extractor = new WordExtractor(new HWPFDocument(is));
		String[] paragraphs = extractor.getParagraphText();
		return paragraphs;
	}

    原文作者:默默前行的蜗牛
    原文地址: https://blog.csdn.net/ywf008/article/details/52798012
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞