public class DocumentExtractor extends AbstractExtractor implements Closeable
Copyright (c) 2020 xsx All Rights Reserved. x-easypdf-pdfbox is licensed under Mulan PSL v2. You can use this software according to the terms and conditions of the Mulan PSL v2. You may obtain a copy of Mulan PSL v2 at: http://license.coscl.org.cn/MulanPSL2 THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. See the Mulan PSL v2 for more details.
Modifier and Type | Field and Description |
---|---|
protected AbstractBookmarkExtractor | bookmarkExtractor — 书签提取器 |
protected AbstractCommentExtractor | commentExtractor — 评论提取器 |
protected AbstractFormExtractor | formExtractor — 表单提取器 |
protected AbstractImageExtractor | imageExtractor — 图像提取器 |
protected AbstractTextExtractor | textExtractor — 文本提取器 |
Inherited fields: document, log
Constructor and Description |
---|
DocumentExtractor(Document document) — 构造方法 |
Modifier and Type | Method and Description |
---|---|
void | close() — 关闭 |
Map<Integer,org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem> | extractBookmark(int... bookmarkIndexes) — 提取书签 |
Map<Integer,List<String>> | extractComment(int... pageIndexes) — 提取评论 |
Map<Integer,List<String>> | extractCommentByRegex(String regex, int... pageIndexes) — 正则提取评论 |
Map<String,org.apache.pdfbox.pdmodel.interactive.form.PDField> | extractFormField() — 表单提取字段 |
Map<String,BufferedImage> | extractFormImage() — 表单提取图像 |
Map<String,String> | extractFormText() — 表单提取文本 |
Map<Integer,List<BufferedImage>> | extractImage(int... pageIndexes) — 提取图像 |
Map<Integer,List<String>> | extractText(int... pageIndexes) — 提取文本 |
Map<Integer,List<String>> | extractTextByRegex(String regex, int... pageIndexes) — 正则提取文本 |
Map<Integer,Map<String,String>> | extractTextByRegionArea(Map<String,Rectangle> regionArea, int... pageIndexes) — 区域提取文本 |
Map<Integer,Map<String,String>> | extractTextByRegionArea(Map<String,Rectangle> regionArea, String wordSeparator, int... pageIndexes) — 区域提取文本 |
Map<Integer,Map<String,List<List<String>>>> | extractTextForTable(Map<String,Rectangle> regionArea, int... pageIndexes) — 表格提取文本 |
Map<Integer,Map<String,List<List<String>>>> | extractTextForTable(Map<String,Rectangle> regionArea, String wordSeparator, int... pageIndexes) — 表格提取文本 |
Inherited methods: getDocument
protected AbstractTextExtractor textExtractor
protected AbstractImageExtractor imageExtractor
protected AbstractFormExtractor formExtractor
protected AbstractCommentExtractor commentExtractor
protected AbstractBookmarkExtractor bookmarkExtractor
public DocumentExtractor(Document document)
document - 文档
public Map<Integer,List<String>> extractText(int... pageIndexes)
pageIndexes - 页面索引
Returns: key=页面索引,value=提取文本
public Map<Integer,List<String>> extractTextByRegex(String regex, int... pageIndexes)
regex - 正则表达式
pageIndexes - 页面索引
Returns: key=页面索引,value=提取文本
public Map<Integer,Map<String,String>> extractTextByRegionArea(Map<String,Rectangle> regionArea, int... pageIndexes)
regionArea - 区域
pageIndexes - 页面索引
Returns: 一级,key = 页面索引,value = 提取文本字典
二级,key = 区域名称,value = 提取文本
public Map<Integer,Map<String,String>> extractTextByRegionArea(Map<String,Rectangle> regionArea, String wordSeparator, int... pageIndexes)
regionArea - 区域
wordSeparator - 单词分隔符
pageIndexes - 页面索引
Returns: 一级,key = 页面索引,value = 提取文本字典
二级,key = 区域名称,value = 提取文本
public Map<Integer,Map<String,List<List<String>>>> extractTextForTable(Map<String,Rectangle> regionArea, int... pageIndexes)
注:单行单列
regionArea - 区域
pageIndexes - 页面索引
Returns: 一级,key = 页面索引,value = 提取文本字典
二级,key = 区域名称,value = 提取文本
public Map<Integer,Map<String,List<List<String>>>> extractTextForTable(Map<String,Rectangle> regionArea, String wordSeparator, int... pageIndexes)
注:单行单列
regionArea - 区域
wordSeparator - 单词分隔符
pageIndexes - 页面索引
Returns: 一级,key = 页面索引,value = 提取文本字典
二级,key = 区域名称,value = 提取文本
public Map<Integer,List<BufferedImage>> extractImage(int... pageIndexes)
pageIndexes - 页面索引
Returns: key = 页面索引,value = 提取图像
public Map<String,String> extractFormText()
Returns: key = 字段名称,value = 提取文本
public Map<String,BufferedImage> extractFormImage()
Returns: key = 字段名称,value = 提取图像
public Map<String,org.apache.pdfbox.pdmodel.interactive.form.PDField> extractFormField()
Returns: key = 字段名称,value = 提取字段
public Map<Integer,List<String>> extractComment(int... pageIndexes)
pageIndexes - 页面索引
Returns: key=页面索引,value=提取评论
public Map<Integer,List<String>> extractCommentByRegex(String regex, int... pageIndexes)
regex - 正则表达式
pageIndexes - 页面索引
Returns: key=页面索引,value=提取评论
public Map<Integer,org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem> extractBookmark(int... bookmarkIndexes)
bookmarkIndexes - 书签索引
Returns: key=书签索引,value=提取书签
public void close()
Specified by: close in interface Closeable
Specified by: close in interface AutoCloseable
Copyright © 2024. All rights reserved.