Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit341d3fc

Browse files
authored
[feat] Add pdf loader (#71)
* reorganize folders* automatically setting ids* Add pdf loading functionality* add pdf tests* add deps* Add load_pdf to init* add load_pdf to doc* Add copyright
1 parentc0044a0 commit341d3fc

File tree

8 files changed

+252
-11
lines changed

8 files changed

+252
-11
lines changed

‎docs/api_doc/io.rst

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,28 @@ Load and Export Layout Data
22
================================
33

44

5-
DataFrame and CSV
5+
`Dataframe` and CSV
66
--------------------------------
77

88
..autofunction::layoutparser.io.load_dataframe
99

1010
..autofunction::layoutparser.io.load_csv
1111

1212

13-
Dictionary and JSON
13+
`Dict` and JSON
1414
--------------------------------
1515

1616
..autofunction::layoutparser.io.load_dict
1717

1818
..autofunction::layoutparser.io.load_json
1919

2020

21+
PDF
22+
--------------------------------
23+
24+
..autofunction::layoutparser.io.load_pdf
25+
26+
2127
Other Formats
2228
--------------------------------
2329
Stay tuned! We are working to support more formats.

‎setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
"pillow",
4242
"pyyaml>=5.1",
4343
"iopath",
44+
"pdfplumber",
45+
"pdf2image",
4446
],
4547
extras_require={
4648
"ocr": [

‎src/layoutparser/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@
4141
"load_json",
4242
"load_dict",
4343
"load_csv",
44-
"load_dataframe"
44+
"load_dataframe",
45+
"load_pdf"
4546
],
4647
"file_utils":[
4748
"is_torch_available",

‎src/layoutparser/io/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from .basicimportload_json,load_dict,load_csv,load_dataframe
2+
from .pdfimportload_pdf

‎src/layoutparser/io.pyrenamed to ‎src/layoutparser/io/basic.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,8 @@
1818

1919
importpandasaspd
2020

21-
from .elementsimport (
22-
BaseCoordElement,
21+
from ..elementsimport (
2322
BaseLayoutElement,
24-
Interval,
25-
Rectangle,
26-
Quadrilateral,
2723
TextBlock,
2824
Layout,
2925
BASECOORD_ELEMENT_NAMEMAP,
@@ -144,4 +140,7 @@ def load_dataframe(df: pd.DataFrame, block_type: str = None) -> Layout:
144140
else:
145141
df["block_type"]=block_type
146142

143+
if"id"notindf.columns:
144+
df["id"]=df.index
145+
147146
returnload_dict(df.apply(lambdax:x.dropna().to_dict(),axis=1).to_list())

‎src/layoutparser/io/pdf.py

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
# Copyright 2021 The Layout Parser team. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
fromtypingimportList,Union,Optional,Dict,Tuple
16+
17+
importpdfplumber
18+
importpandasaspd
19+
20+
from ..elementsimportLayout
21+
from .basicimportload_dataframe
22+
23+
DEFAULT_PDF_DPI=72
24+
25+
26+
def extract_words_for_page(
    page: pdfplumber.page.Page,
    x_tolerance: float = 1.5,
    y_tolerance: float = 2,
    keep_blank_chars: bool = False,
    use_text_flow: bool = True,
    horizontal_ltr: bool = True,
    vertical_ttb: bool = True,
    extra_attrs: Optional[List[str]] = None,
) -> Layout:
    """Extract the word tokens of a single pdfplumber page as a Layout.

    All keyword arguments are forwarded unchanged to
    ``page.extract_words``.

    Args:
        page (pdfplumber.page.Page): The page to extract words from.
        x_tolerance (float, optional): Forwarded to ``extract_words``.
            Defaults to 1.5.
        y_tolerance (float, optional): Forwarded to ``extract_words``.
            Defaults to 2.
        keep_blank_chars (bool, optional): Forwarded to ``extract_words``.
            Defaults to False.
        use_text_flow (bool, optional): Forwarded to ``extract_words``.
            Defaults to True.
        horizontal_ltr (bool, optional): Forwarded to ``extract_words``.
            Defaults to True.
        vertical_ttb (bool, optional): Forwarded to ``extract_words``.
            Defaults to True.
        extra_attrs (Optional[List[str]], optional): Extra character
            attributes to request for each word.
            Defaults to ``["fontname", "size"]`` when None.

    Returns:
        Layout: a layout object representing all extracted pdf tokens on
        this page. An empty Layout is returned when the page yields no
        tokens.
    """
    if extra_attrs is None:
        # Build the default per call rather than using a mutable default.
        extra_attrs = ["fontname", "size"]

    tokens = page.extract_words(
        x_tolerance=x_tolerance,
        y_tolerance=y_tolerance,
        keep_blank_chars=keep_blank_chars,
        use_text_flow=use_text_flow,
        horizontal_ltr=horizontal_ltr,
        vertical_ttb=vertical_ttb,
        extra_attrs=extra_attrs,
    )

    if not tokens:
        # A page without extractable words (e.g. blank or image-only) would
        # produce a DataFrame with no coordinate columns, and selecting
        # them below would raise a KeyError.
        return Layout([])

    df = pd.DataFrame(tokens)

    # Keep the token coordinates inside the page boundaries.
    df[["x0", "x1"]] = (
        df[["x0", "x1"]].clip(lower=0, upper=int(page.width)).astype("float")
    )
    df[["top", "bottom"]] = (
        df[["top", "bottom"]].clip(lower=0, upper=int(page.height)).astype("float")
    )

    page_tokens = load_dataframe(
        df.rename(
            columns={
                "x0": "x_1",
                "x1": "x_2",
                "top": "y_1",
                "bottom": "y_2",
                "fontname": "type",  # also loading fontname as "type"
            }
        ),
        block_type="rectangle",
    )

    return page_tokens
77+
78+
79+
def load_pdf(
    filename: str,
    load_images: bool = False,
    x_tolerance: float = 1.5,
    y_tolerance: float = 2,
    keep_blank_chars: bool = False,
    use_text_flow: bool = True,
    horizontal_ltr: bool = True,
    vertical_ttb: bool = True,
    extra_attrs: Optional[List[str]] = None,
    dpi: int = DEFAULT_PDF_DPI,
) -> Union[List[Layout], Tuple[List[Layout], List["Image.Image"]]]:
    """Load all tokens for each page from a PDF file, and save them
    in a list of Layout objects with the original page order.

    Args:
        filename (str): The path to the PDF file.
        load_images (bool, optional):
            Whether to load a screenshot for each page of the PDF file.
            When set to true, the function will return both the layout and
            screenshot image for each page.
            Defaults to False.
        x_tolerance (float, optional):
            The threshold used for extracting "word tokens" from the pdf file.
            It will merge the pdf characters into a word token if the difference
            between the x_2 of one character and the x_1 of the next is less than
            or equal to x_tolerance. See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to 1.5.
        y_tolerance (float, optional):
            The threshold used for extracting "word tokens" from the pdf file.
            It will merge the pdf characters into a word token if the difference
            between the y_2 of one character and the y_1 of the next is less than
            or equal to y_tolerance. See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to 2.
        keep_blank_chars (bool, optional):
            When keep_blank_chars is set to True, blank characters are
            treated as part of a word, not as a space between words. See
            details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to False.
        use_text_flow (bool, optional):
            When use_text_flow is set to True, it will use the PDF's underlying
            flow of characters as a guide for ordering and segmenting the words,
            rather than presorting the characters by x/y position. (This mimics
            how dragging a cursor highlights text in a PDF; as with that, the
            order does not always appear to be logical.) See details in
            `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to True.
        horizontal_ltr (bool, optional):
            When horizontal_ltr is set to True, it means the doc should read
            text from left to right, and vice versa.
            Defaults to True.
        vertical_ttb (bool, optional):
            When vertical_ttb is set to True, it means the doc should read
            text from top to bottom, and vice versa.
            Defaults to True.
        extra_attrs (Optional[List[str]], optional):
            Passing a list of extra_attrs (e.g., ["fontname", "size"]) will
            restrict each word to characters that share exactly the same
            value for each of those `attributes extracted by pdfplumber
            <https://github.com/jsvine/pdfplumber/blob/develop/README.md#char-properties>`_,
            and the resulting word dicts will indicate those attributes.
            See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to `["fontname", "size"]`.
        dpi (int, optional):
            When loading images of the pdf, you can also specify the resolution
            (or `DPI, dots per inch <https://en.wikipedia.org/wiki/Dots_per_inch>`_)
            for rendering the images. Higher DPI values mean clearer images (also
            larger file sizes).
            Setting dpi will also automatically resize the extracted pdf_layout
            to match the sizes of the images. Therefore, when visualizing the
            pdf_layouts, it can be rendered appropriately.
            Defaults to `DEFAULT_PDF_DPI=72`, which is also the default rendering dpi
            from the pdfplumber PDF parser.

    Returns:
        List[Layout]:
            When `load_images=False`, it will only load the pdf_tokens from
            the PDF file. Each element of the list denotes all the tokens that
            appear on a single page, and the list is ordered the same as the
            original PDF page order.
        Tuple[List[Layout], List["Image.Image"]]:
            When `load_images=True`, besides the `all_page_layout`, it will also
            return a list of page images.

    Examples::
        >>> import layoutparser as lp
        >>> pdf_layout = lp.load_pdf("path/to/pdf")
        >>> pdf_layout[0] # the layout for page 0
        >>> pdf_layout, pdf_images = lp.load_pdf("path/to/pdf", load_images=True)
        >>> lp.draw_box(pdf_images[0], pdf_layout[0])
    """

    all_page_layout = []

    # Open the PDF through a context manager so the underlying file handle
    # is released even if token extraction raises on some page.
    with pdfplumber.open(filename) as plumber_pdf_object:
        for page_id, cur_page in enumerate(plumber_pdf_object.pages):
            page_tokens = extract_words_for_page(
                cur_page,
                x_tolerance=x_tolerance,
                y_tolerance=y_tolerance,
                keep_blank_chars=keep_blank_chars,
                use_text_flow=use_text_flow,
                horizontal_ltr=horizontal_ltr,
                vertical_ttb=vertical_ttb,
                extra_attrs=extra_attrs,
            )

            # Adding metadata for the current page
            page_tokens.page_data["width"] = float(cur_page.width)
            page_tokens.page_data["height"] = float(cur_page.height)
            page_tokens.page_data["index"] = page_id

            all_page_layout.append(page_tokens)

    if not load_images:
        return all_page_layout

    # Imported lazily so pdf2image is only needed when images are requested.
    import pdf2image

    pdf_images = pdf2image.convert_from_path(filename, dpi=dpi)

    for page_id, page_image in enumerate(pdf_images):
        image_width, image_height = page_image.size
        page_layout = all_page_layout[page_id]
        layout_width = page_layout.page_data["width"]
        layout_height = page_layout.page_data["height"]
        if image_width != layout_width or image_height != layout_height:
            # The rendered image size differs from the PDF page size, so
            # rescale the token coordinates to the image's pixel grid.
            scale_x = image_width / layout_width
            scale_y = image_height / layout_height
            page_layout = page_layout.scale((scale_x, scale_y))
            page_layout.page_data["width"] = image_width
            page_layout.page_data["height"] = image_height
            all_page_layout[page_id] = page_layout

    return all_page_layout, pdf_images

‎tests/fixtures/io/example.pdf

216 KB
Binary file not shown.

‎tests/test_io.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414

1515
importnumpyasnp
1616
fromlayoutparser.elementsimportInterval,Rectangle,Quadrilateral,TextBlock,Layout
17-
fromlayoutparser.ioimportload_json,load_dict,load_csv
18-
17+
fromlayoutparserimportload_json,load_dict,load_csv,load_pdf
1918

2019
deftest_json():
2120

@@ -67,4 +66,16 @@ def test_csv():
6766
l2=Layout([i2,r2,q2])
6867

6968
_l2=load_csv("tests/fixtures/io/layout_textblock.csv")
70-
assert_l2==l2
69+
assert_l2==l2
70+
71+
72+
def test_pdf():
    """Check that the example PDF loads as one page with page metadata."""
    layouts = load_pdf("tests/fixtures/io/example.pdf")
    assert len(layouts) == 1

    first_page = layouts[0]
    for key in ("width", "height", "index"):
        assert key in first_page.page_data

    # Only three types of font show up in the file
    font_types = {token.type for token in first_page}
    assert len(font_types) == 3

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp