POI按照Word文档指定标题进行拆分

2024年4月2日 · 57次阅读 · 来源: Taowiedong

Pom配置

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the example that splits a Word document by heading. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Coordinates of this example project. -->
    <groupId>org.taowd</groupId>
    <artifactId>Hello_Word</artifactId>
    <version>1.0-SNAPSHOT</version>
    <!-- Both Apache POI artifacts are pinned to the same version (3.8). -->
    <!-- NOTE(review): 3.8 is an old POI release; consider upgrading after checking API compatibility. -->
    <dependencies>
        <!-- POI module for OOXML formats; presumably supplies the org.apache.poi.xwpf.usermodel
             classes imported by the Java example - confirm against the POI component map. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.8</version>
        </dependency>
        <!-- NOTE(review): none of the imports in the accompanying Java example obviously
             need this artifact - verify before keeping or removing it. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>3.8</version>
        </dependency>
    </dependencies>

</project>

代码实现

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.poi.xwpf.usermodel.BodyElementType;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;

/**
 * Demo that extracts one section of a Word ({@code .docx}) document with Apache POI.
 * <p>
 * Every body element is visited to locate two top-level headings; all elements outside
 * the range delimited by them are then deleted from the loaded document (the approach is
 * meant to preserve the original styling of what remains) and the result is written to a
 * new file.
 * <br/>
 * Reference: https://blog.csdn.net/qq_37201321/article/details/91864843
 */
public class HelloWorld {
    /** Document that is read and split. */
    private static final String SOURCE_FILE = "test.docx";

    /** File the extracted section is written to. */
    private static final String TARGET_FILE = "new_test1111.docx";

    /** Text of the heading that opens the section to keep. */
    private static final String START_TITLE = "系统设计";

    /** Text of the heading that immediately follows the section to keep. */
    private static final String END_TITLE = "系统功能的设计与实现";

    /** Value {@link #getTitleLvl} has to return for a paragraph to count as a section heading. */
    private static final String TOP_LEVEL = "0";

    /** Separator line printed between the scan phase and the removal phase. */
    private static final String SEPARATOR = "#################################################################";

    /** Pattern used by {@link #unescapeJava}; compiled once instead of on every call. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s*|\t|\r|\n");

    public static void main(String[] args) throws IOException {
        readAndWriterTest4();
    }

    /**
     * Reads {@code test.docx}, keeps only the section that starts at the heading
     * "系统设计" and ends right before the heading "系统功能的设计与实现", and writes the
     * result to {@code new_test1111.docx}.
     *
     * @throws IOException declared for callers; failures are currently reported on stderr
     *                     and not propagated
     */
    public static void readAndWriterTest4() throws IOException {
        try {
            extractSection(SOURCE_FILE, TARGET_FILE, START_TITLE, END_TITLE);
        } catch (Exception e) {
            // Best-effort demo: a failure is printed and the method returns normally,
            // exactly as the previous implementation did.
            e.printStackTrace();
        }
    }

    /**
     * Keeps the body elements from the {@code startTitle} heading (inclusive) up to, but
     * not including, the {@code endTitle} heading and saves the trimmed document.
     * <p>
     * When a heading occurs more than once, its last occurrence wins. When the start
     * heading is missing the section begins at the first element; when the end heading is
     * missing it runs to the last element.
     *
     * @param sourcePath path of the document to read
     * @param targetPath path of the document to write
     * @param startTitle exact text of the heading that opens the section
     * @param endTitle   exact text of the heading that follows the section
     * @throws IOException if the source cannot be read or the target cannot be written
     */
    private static void extractSection(String sourcePath, String targetPath, String startTitle, String endTitle)
        throws IOException {
        XWPFDocument doc = loadDocument(sourcePath);
        List<IBodyElement> bodyElements = doc.getBodyElements();
        int count = bodyElements.size();
        System.out.println(count);

        // Inclusive index range [start, end] of the elements to keep; the defaults keep everything.
        int start = 0;
        int end = count - 1;
        for (int i = 0; i < count; i++) {
            IBodyElement bodyElement = bodyElements.get(i);
            if (bodyElement.getElementType() != BodyElementType.PARAGRAPH) {
                continue;
            }
            XWPFParagraph paragraph = (XWPFParagraph) bodyElement;
            if (!TOP_LEVEL.equals(getTitleLvl(doc, paragraph))) {
                continue;
            }
            String text = paragraph.getText();
            if (startTitle.equals(text)) {
                start = i;
            }
            if (endTitle.equals(text)) {
                System.out.println(text);
                System.out.println(i);
                // The section ends on the element just before the next heading.
                end = i - 1;
            }
        }
        System.out.println(SEPARATOR);
        System.out.println(SEPARATOR);

        System.out.println("start:" + start);
        System.out.println("end:" + end);

        // Delete from the back so that the indices still to be removed do not shift.
        // Only elements strictly after `end` go: the last element of the section stays.
        for (int i = count - 1; i > end; i--) {
            doc.removeBodyElement(i);
        }
        System.out.println("*****************");
        // Only elements strictly before `start` go: the opening heading itself stays.
        for (int i = start - 1; i >= 0; i--) {
            doc.removeBodyElement(i);
        }

        saveDocument(doc, targetPath);
    }

    /** Loads a document from {@code path}, closing the input stream even when parsing fails. */
    private static XWPFDocument loadDocument(String path) throws IOException {
        FileInputStream in = new FileInputStream(new File(path));
        try {
            return new XWPFDocument(in);
        } finally {
            in.close();
        }
    }

    /** Writes {@code doc} to {@code path}, closing the output stream even when writing fails. */
    private static void saveDocument(XWPFDocument doc, String path) throws IOException {
        FileOutputStream out = new FileOutputStream(new File(path));
        try {
            doc.write(out);
        } finally {
            out.close();
        }
    }

    /**
     * Determines the outline level of a paragraph.
     * <p>
     * In Word the outline level can be defined in three places, which are consulted in
     * this order: (1) directly on the paragraph, (2) on the paragraph's style, (3) on the
     * style that the paragraph's style is based on. If none of them defines a level, the
     * paragraph's style ID is returned instead; if that is absent too, the empty string.
     * <p>
     * Note that the style-ID fallback is not an outline level: a caller comparing the
     * result with {@code "0"} would also match a paragraph whose style ID is literally "0".
     *
     * @param doc  document that owns the paragraph (used to resolve styles)
     * @param para paragraph to inspect
     * @return the outline level as a string, else the style ID, else {@code ""}
     */
    private static String getTitleLvl(XWPFDocument doc, XWPFParagraph para) {
        String level = paragraphOutlineLvl(para);
        if (level == null) {
            level = styleOutlineLvl(doc, para);
        }
        if (level == null) {
            level = baseStyleOutlineLvl(doc, para);
        }
        if (level != null) {
            return level;
        }

        try {
            String styleId = para.getStyleID();
            if (styleId != null) {
                return styleId;
            }
        } catch (Exception ignored) {
            // Same best-effort policy as the lookups above: fall through to the default.
        }
        return "";
    }

    /** Outline level set directly on the paragraph, or {@code null} when it is not set. */
    private static String paragraphOutlineLvl(XWPFParagraph para) {
        try {
            return String.valueOf(para.getCTP().getPPr().getOutlineLvl().getVal());
        } catch (Exception ignored) {
            // Any link of the chain may be null when the property is not defined here;
            // that simply means "not set at this level".
            return null;
        }
    }

    /** Outline level set on the paragraph's style, or {@code null} when it is not set. */
    private static String styleOutlineLvl(XWPFDocument doc, XWPFParagraph para) {
        try {
            return String
                .valueOf(doc.getStyles().getStyle(para.getStyle()).getCTStyle().getPPr().getOutlineLvl().getVal());
        } catch (Exception ignored) {
            // Missing style or missing property: not set at this level.
            return null;
        }
    }

    /** Outline level set on the style the paragraph's style is based on, or {@code null}. */
    private static String baseStyleOutlineLvl(XWPFDocument doc, XWPFParagraph para) {
        try {
            String baseStyleId = doc.getStyles().getStyle(para.getStyle()).getCTStyle().getBasedOn().getVal();
            return String
                .valueOf(doc.getStyles().getStyle(baseStyleId).getCTStyle().getPPr().getOutlineLvl().getVal());
        } catch (Exception ignored) {
            // Missing style, missing base style or missing property: not set at this level.
            return null;
        }
    }

    /**
     * Removes every whitespace character (spaces, tabs, carriage returns, line feeds, ...)
     * from the given string. Despite its name, no escape sequences are decoded.
     *
     * @param str text to clean, may be {@code null}
     * @return the text without whitespace, or {@code ""} when {@code str} is {@code null}
     */
    public static String unescapeJava(String str) {
        if (str == null) {
            return "";
        }
        return WHITESPACE.matcher(str).replaceAll("");
    }

    /** Describes one document to be produced by a split; not referenced elsewhere in this class. */
    class NewWord {
        /** Start index of the new document. */
        int strat;
        /** End index of the new document. */
        int end;
        /** Name of the new document; new documents are exported by outline level. */
        String name;
    }
}

    原文作者：Taowiedong
    原文地址: https://blog.csdn.net/taoweidong1/article/details/119988747
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。