public abstract class AbstractTextExtractor extends AbstractExtractor
Copyright (c) 2020 xsx All Rights Reserved. x-easypdf-pdfbox is licensed under Mulan PSL v2. You can use this software according to the terms and conditions of the Mulan PSL v2. You may obtain a copy of Mulan PSL v2 at: http://license.coscl.org.cn/MulanPSL2 THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. See the Mulan PSL v2 for more details.
Modifier and Type | Class and Description |
---|---|
protected static interface |
AbstractTextExtractor.Function<R>
功能接口
|
Modifier and Type | Field and Description |
---|---|
protected static Pattern |
TABLE_PATTERN
表格正则(单行单列)
|
document, log
Constructor and Description |
---|
AbstractTextExtractor(Document document)
有参构造
|
Modifier and Type | Method and Description |
---|---|
abstract Map<Integer,List<String>> |
extractByRegex(String regex,
int... pageIndexes)
正则提取文本
|
abstract Map<Integer,Map<String,String>> |
extractByRegionArea(String wordSeparator,
Map<String,Rectangle> regionArea,
int... pageIndexes)
区域提取文本
|
abstract Map<Integer,Map<String,List<List<String>>>> |
extractByTable(String wordSeparator,
Map<String,Rectangle> regionArea,
int... pageIndexes)
表格提取文本
|
protected <R> Map<Integer,R> |
extractText(AbstractTextExtractor.Function<R> function,
String wordSeparator,
Map<String,Rectangle> regionArea,
int... pageIndexes)
提取文本
|
protected List<String> |
processTextByRegex(String regex,
org.apache.pdfbox.text.PDFTextStripper stripper)
正则处理文本
|
protected Map<String,String> |
processTextByRegionArea(Map<String,Rectangle> regionArea,
String wordSeparator,
org.apache.pdfbox.pdmodel.PDPage page)
区域处理文本
|
protected Map<String,List<List<String>>> |
processTextByTable(Map<String,Rectangle> regionArea,
String wordSeparator,
org.apache.pdfbox.pdmodel.PDPage page)
表格处理文本
|
getDocument
protected static final Pattern TABLE_PATTERN
public AbstractTextExtractor(Document document)
Parameters:
document - 文档
public abstract Map<Integer,List<String>> extractByRegex(String regex, int... pageIndexes)
Parameters:
regex - 正则表达式
pageIndexes - 页面索引
Returns:
key = 页面索引,value = 提取文本
public abstract Map<Integer,Map<String,String>> extractByRegionArea(String wordSeparator, Map<String,Rectangle> regionArea, int... pageIndexes)
Parameters:
wordSeparator - 单词分隔符
regionArea - 区域
pageIndexes - 页面索引
Returns:
key = 页面索引,value = 提取文本
public abstract Map<Integer,Map<String,List<List<String>>>> extractByTable(String wordSeparator, Map<String,Rectangle> regionArea, int... pageIndexes)
Parameters:
wordSeparator - 单词分隔符
regionArea - 区域
pageIndexes - 页面索引
Returns:
key = 页面索引,value = 提取文本
protected List<String> processTextByRegex(String regex, org.apache.pdfbox.text.PDFTextStripper stripper)
Parameters:
regex - 正则表达式
stripper - 文本剥离器
protected Map<String,String> processTextByRegionArea(Map<String,Rectangle> regionArea, String wordSeparator, org.apache.pdfbox.pdmodel.PDPage page)
Parameters:
regionArea - 区域
wordSeparator - 单词分隔符
page - 页面
Returns:
key = 区域名称,value = 提取文本
protected Map<String,List<List<String>>> processTextByTable(Map<String,Rectangle> regionArea, String wordSeparator, org.apache.pdfbox.pdmodel.PDPage page)
Parameters:
regionArea - 区域
wordSeparator - 单词分隔符
page - 页面
Returns:
key = 区域名称,value = 提取文本
protected <R> Map<Integer,R> extractText(AbstractTextExtractor.Function<R> function, String wordSeparator, Map<String,Rectangle> regionArea, int... pageIndexes)
Type Parameters:
R - 返回类型
Parameters:
function - 功能函数
wordSeparator - 单词分隔符
regionArea - 区域
pageIndexes - 页面索引
Returns:
key = 页面索引,value = 提取文本
Copyright © 2024. All rights reserved.