NotificationsYou must be signed in to change notification settings
Fork506
Star5.4k

Commit341d3fc

authored

[feat] Add pdf loader (#71)

* reorganize folders
* automatically setting ids
* Add pdf loading functionality
* add pdf tests
* add deps
* Add load_pdf to init
* add load_pdf to doc
* Add copyright

1 parentc0044a0 commit341d3fcCopy full SHA for 341d3fc

File tree

8 files changed

+252

-11

lines changed

docs/api_doc
- io.rst
setup.py
src/layoutparser
- __init__.py
- io
tests
- fixtures/io
  - example.pdf
- test_io.py

8 files changed

+252

-11

lines changed

`‎docs/api_doc/io.rst`

Lines changed: 8 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -2,22 +2,28 @@ Load and Export Layout Data`
`2`	`2`	`================================`
`3`	`3`
`4`	`4`
`5`		`-DataFrame and CSV`
	`5`	+`Dataframe` and CSV
`6`	`6`	`--------------------------------`
`7`	`7`
`8`	`8`	`.. autofunction:: layoutparser.io.load_dataframe`
`9`	`9`
`10`	`10`	`.. autofunction:: layoutparser.io.load_csv`
`11`	`11`
`12`	`12`
`13`		`-Dictionary and JSON`
	`13`	+`Dict` and JSON
`14`	`14`	`--------------------------------`
`15`	`15`
`16`	`16`	`.. autofunction:: layoutparser.io.load_dict`
`17`	`17`
`18`	`18`	`.. autofunction:: layoutparser.io.load_json`
`19`	`19`
`20`	`20`
	`21`	`+PDF`
	`22`	`+--------------------------------`
	`23`	`+`
	`24`	`+.. autofunction:: layoutparser.io.load_pdf`
	`25`	`+`
	`26`	`+`
`21`	`27`	`Other Formats`
`22`	`28`	`--------------------------------`
`23`	`29`	`Stay tuned! We are working to support more formats.`

`‎setup.py`

Lines changed: 2 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -41,6 +41,8 @@`
`41`	`41`	`"pillow",`
`42`	`42`	`"pyyaml>=5.1",`
`43`	`43`	`"iopath",`
	`44`	`+"pdfplumber",`
	`45`	`+"pdf2image",`
`44`	`46`	`],`
`45`	`47`	`extras_require={`
`46`	`48`	`"ocr": [`

`‎src/layoutparser/init.py`

Lines changed: 2 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,8 @@`
`41`	`41`	`"load_json",`
`42`	`42`	`"load_dict",`
`43`	`43`	`"load_csv",`
`44`		`-"load_dataframe"`
	`44`	`+"load_dataframe",`
	`45`	`+"load_pdf"`
`45`	`46`	`],`
`46`	`47`	`"file_utils":[`
`47`	`48`	`"is_torch_available",`

`‎src/layoutparser/io/init.py`

Lines changed: 2 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+from .basicimportload_json,load_dict,load_csv,load_dataframe`
	`2`	`+from .pdfimportload_pdf`

`‎src/layoutparser/io.pyrenamed to ‎src/layoutparser/io/basic.py`

Lines changed: 4 additions & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -18,12 +18,8 @@`
`18`	`18`
`19`	`19`	`importpandasaspd`
`20`	`20`
`21`		`-from .elementsimport (`
`22`		`-BaseCoordElement,`
	`21`	`+from ..elementsimport (`
`23`	`22`	`BaseLayoutElement,`
`24`		`-Interval,`
`25`		`-Rectangle,`
`26`		`-Quadrilateral,`
`27`	`23`	`TextBlock,`
`28`	`24`	`Layout,`
`29`	`25`	`BASECOORD_ELEMENT_NAMEMAP,`
`@@ -144,4 +140,7 @@ def load_dataframe(df: pd.DataFrame, block_type: str = None) -> Layout:`
`144`	`140`	`else:`
`145`	`141`	`df["block_type"]=block_type`
`146`	`142`
	`143`	`+if"id"notindf.columns:`
	`144`	`+df["id"]=df.index`
	`145`	`+`
`147`	`146`	`returnload_dict(df.apply(lambdax:x.dropna().to_dict(),axis=1).to_list())`

`‎src/layoutparser/io/pdf.py`

Lines changed: 220 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,220 @@`
	`1`	`+# Copyright 2021 The Layout Parser team. All rights reserved.`
	`2`	`+#`
	`3`	`+# Licensed under the Apache License, Version 2.0 (the "License");`
	`4`	`+# you may not use this file except in compliance with the License.`
	`5`	`+# You may obtain a copy of the License at`
	`6`	`+#`
	`7`	`+# http://www.apache.org/licenses/LICENSE-2.0`
	`8`	`+#`
	`9`	`+# Unless required by applicable law or agreed to in writing, software`
	`10`	`+# distributed under the License is distributed on an "AS IS" BASIS,`
	`11`	`+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
	`12`	`+# See the License for the specific language governing permissions and`
	`13`	`+# limitations under the License.`
	`14`	`+`
	`15`	`+fromtypingimportList,Union,Optional,Dict,Tuple`
	`16`	`+`
	`17`	`+importpdfplumber`
	`18`	`+importpandasaspd`
	`19`	`+`
	`20`	`+from ..elementsimportLayout`
	`21`	`+from .basicimportload_dataframe`
	`22`	`+`
	`23`	`+DEFAULT_PDF_DPI=72`
	`24`	`+`
	`25`	`+`
	`26`	`+defextract_words_for_page(`
	`27`	`+page:pdfplumber.page.Page,`
	`28`	`+x_tolerance=1.5,`
	`29`	`+y_tolerance=2,`
	`30`	`+keep_blank_chars=False,`
	`31`	`+use_text_flow=True,`
	`32`	`+horizontal_ltr=True,`
	`33`	`+vertical_ttb=True,`
	`34`	`+extra_attrs=None,`
	`35`	`+)->Layout:`
	`36`	`+"""The helper function used for extracting words from a pdfplumber page`
	`37`	`+ object.`
	`38`	`+`
	`39`	`+ Returns:`
	`40`	`+ Layout: a layout object representing all extracted pdf tokens on this page.`
	`41`	`+ """`
	`42`	`+ifextra_attrsisNone:`
	`43`	`+extra_attrs= ["fontname","size"]`
	`44`	`+`
	`45`	`+tokens=page.extract_words(`
	`46`	`+x_tolerance=x_tolerance,`
	`47`	`+y_tolerance=y_tolerance,`
	`48`	`+keep_blank_chars=keep_blank_chars,`
	`49`	`+use_text_flow=use_text_flow,`
	`50`	`+horizontal_ltr=horizontal_ltr,`
	`51`	`+vertical_ttb=vertical_ttb,`
	`52`	`+extra_attrs=extra_attrs,`
	`53`	`+ )`
	`54`	`+`
	`55`	`+df=pd.DataFrame(tokens)`
	`56`	`+df[["x0","x1"]]= (`
	`57`	`+df[["x0","x1"]].clip(lower=0,upper=int(page.width)).astype("float")`
	`58`	`+ )`
	`59`	`+df[["top","bottom"]]= (`
	`60`	`+df[["top","bottom"]].clip(lower=0,upper=int(page.height)).astype("float")`
	`61`	`+ )`
	`62`	`+`
	`63`	`+page_tokens=load_dataframe(`
	`64`	`+df.rename(`
	`65`	`+columns={`
	`66`	`+"x0":"x_1",`
	`67`	`+"x1":"x_2",`
	`68`	`+"top":"y_1",`
	`69`	`+"bottom":"y_2",`
	`70`	`+"fontname":"type",# also loading fontname as "type"`
	`71`	`+ }`
	`72`	`+ ),`
	`73`	`+block_type="rectangle",`
	`74`	`+ )`
	`75`	`+`
	`76`	`+returnpage_tokens`
	`77`	`+`
	`78`	`+`
	`79`	`+defload_pdf(`
	`80`	`+filename:str,`
	`81`	`+load_images:bool=False,`
	`82`	`+x_tolerance:int=1.5,`
	`83`	`+y_tolerance:int=2,`
	`84`	`+keep_blank_chars:bool=False,`
	`85`	`+use_text_flow:bool=True,`
	`86`	`+horizontal_ltr:bool=True,`
	`87`	`+vertical_ttb:bool=True,`
	`88`	`+extra_attrs:Optional[List[str]]=None,`
	`89`	`+dpi:int=DEFAULT_PDF_DPI,`
	`90`	`+)->Union[List[Layout],Tuple[List[Layout],List["Image.Image"]]]:`
	`91`	`+"""Load all tokens for each page from a PDF file, and save them`
	`92`	`+ in a list of Layout objects with the original page order.`
	`93`	`+`
	`94`	`+ Args:`
	`95`	`+ filename (str): The path to the PDF file.`
	`96`	`+ load_images (bool, optional):`
	`97`	`+ Whether load screenshot for each page of the PDF file.`
	`98`	`+ When set to true, the function will return both the layout and`
	`99`	`+ screenshot image for each page.`
	`100`	`+ Defaults to False.`
	`101`	`+ x_tolerance (int, optional):`
	`102`	`+ The threshold used for extracting "word tokens" from the pdf file.`
	`103`	`+ It will merge the pdf characters into a word token if the difference`
	`104`	`+ between the x_2 of one character and the x_1 of the next is less than`
	`105`	+ or equal to x_tolerance. See details in `pdf2plumber's documentation
	`106`	+ <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
	`107`	`+ Defaults to 1.5.`
	`108`	`+ y_tolerance (int, optional):`
	`109`	`+ The threshold used for extracting "word tokens" from the pdf file.`
	`110`	`+ It will merge the pdf characters into a word token if the difference`
	`111`	`+ between the y_2 of one character and the y_1 of the next is less than`
	`112`	+ or equal to y_tolerance. See details in `pdf2plumber's documentation
	`113`	+ <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
	`114`	`+ Defaults to 2.`
	`115`	`+ keep_blank_chars (bool, optional):`
	`116`	`+ When keep_blank_chars is set to True, it will treat blank characters`
	`117`	`+ are treated as part of a word, not as a space between words. See`
	`118`	+ details in `pdf2plumber's documentation
	`119`	+ <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
	`120`	`+ Defaults to False.`
	`121`	`+ use_text_flow (bool, optional):`
	`122`	`+ When use_text_flow is set to True, it will use the PDF's underlying`
	`123`	`+ flow of characters as a guide for ordering and segmenting the words,`
	`124`	`+ rather than presorting the characters by x/y position. (This mimics`
	`125`	`+ how dragging a cursor highlights text in a PDF; as with that, the`
	`126`	`+ order does not always appear to be logical.) See details in`
	`127`	+ `pdf2plumber's documentation
	`128`	+ <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
	`129`	`+ Defaults to True.`
	`130`	`+ horizontal_ltr (bool, optional):`
	`131`	`+ When horizontal_ltr is set to True, it means the doc should read`
	`132`	`+ text from left to right, vice versa.`
	`133`	`+ Defaults to True.`
	`134`	`+ vertical_ttb (bool, optional):`
	`135`	`+ When vertical_ttb is set to True, it means the doc should read`
	`136`	`+ text from top to bottom, vice versa.`
	`137`	`+ Defaults to True.`
	`138`	`+ extra_attrs (Optional[List[str]], optional):`
	`139`	`+ Passing a list of extra_attrs (e.g., ["fontname", "size"]) will`
	`140`	`+ restrict each words to characters that share exactly the same`
	`141`	+ value for each of those `attributes extracted by pdfplumber
	`142`	+ <https://github.com/jsvine/pdfplumber/blob/develop/README.md#char-properties>`_,
	`143`	`+ and the resulting word dicts will indicate those attributes.`
	`144`	+ See details in `pdf2plumber's documentation
	`145`	+ <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
	`146`	+ Defaults to `["fontname", "size"]`.
	`147`	`+ dpi (int, optional):`
	`148`	`+ When loading images of the pdf, you can also specify the resolution`
	`149`	+ (or `DPI, dots per inch <https://en.wikipedia.org/wiki/Dots_per_inch>`_)
	`150`	`+ for rendering the images. Higher DPI values mean clearer images (also`
	`151`	`+ larger file sizes).`
	`152`	`+ Setting dpi will also automatically resizes the extracted pdf_layout`
	`153`	`+ to match the sizes of the images. Therefore, when visualizing the`
	`154`	`+ pdf_layouts, it can be rendered appropriately.`
	`155`	+ Defaults to `DEFAULT_PDF_DPI=72`, which is also the default rendering dpi
	`156`	`+ from the pdfplumber PDF parser.`
	`157`	`+`
	`158`	`+ Returns:`
	`159`	`+ List[Layout]:`
	`160`	+ When `load_images=False`, it will only load the pdf_tokens from
	`161`	`+ the PDF file. Each element of the list denotes all the tokens appeared`
	`162`	`+ on a single page, and the list is ordered the same as the original PDF`
	`163`	`+ page order.`
	`164`	`+ Tuple[List[Layout], List["Image.Image"]]:`
	`165`	+ When `load_images=True`, besides the `all_page_layout`, it will also
	`166`	`+ return a list of page images.`
	`167`	`+`
	`168`	`+ Examples::`
	`169`	`+ >>> import layoutparser as lp`
	`170`	`+ >>> pdf_layout = lp.load_pdf("path/to/pdf")`
	`171`	`+ >>> pdf_layout[0] # the layout for page 0`
	`172`	`+ >>> pdf_layout, pdf_images = lp.load_pdf("path/to/pdf", load_images=True)`
	`173`	`+ >>> lp.draw_box(pdf_images[0], pdf_layout[0])`
	`174`	`+ """`
	`175`	`+`
	`176`	`+plumber_pdf_object=pdfplumber.open(filename)`
	`177`	`+`
	`178`	`+all_page_layout= []`
	`179`	`+forpage_idinrange(len(plumber_pdf_object.pages)):`
	`180`	`+cur_page=plumber_pdf_object.pages[page_id]`
	`181`	`+`
	`182`	`+page_tokens=extract_words_for_page(`
	`183`	`+cur_page,`
	`184`	`+x_tolerance=x_tolerance,`
	`185`	`+y_tolerance=y_tolerance,`
	`186`	`+keep_blank_chars=keep_blank_chars,`
	`187`	`+use_text_flow=use_text_flow,`
	`188`	`+horizontal_ltr=horizontal_ltr,`
	`189`	`+vertical_ttb=vertical_ttb,`
	`190`	`+extra_attrs=extra_attrs,`
	`191`	`+ )`
	`192`	`+`
	`193`	`+# Adding metadata for the current page`
	`194`	`+page_tokens.page_data["width"]=float(cur_page.width)`
	`195`	`+page_tokens.page_data["height"]=float(cur_page.height)`
	`196`	`+page_tokens.page_data["index"]=page_id`
	`197`	`+`
	`198`	`+all_page_layout.append(page_tokens)`
	`199`	`+`
	`200`	`+ifnotload_images:`
	`201`	`+returnall_page_layout`
	`202`	`+else:`
	`203`	`+importpdf2image`
	`204`	`+`
	`205`	`+pdf_images=pdf2image.convert_from_path(filename,dpi=dpi)`
	`206`	`+`
	`207`	`+forpage_id,page_imageinenumerate(pdf_images):`
	`208`	`+image_width,image_height=page_image.size`
	`209`	`+page_layout=all_page_layout[page_id]`
	`210`	`+layout_width=page_layout.page_data["width"]`
	`211`	`+layout_height=page_layout.page_data["height"]`
	`212`	`+ifimage_width!=layout_widthorimage_height!=layout_height:`
	`213`	`+scale_x=image_width/layout_width`
	`214`	`+scale_y=image_height/layout_height`
	`215`	`+page_layout=page_layout.scale((scale_x,scale_y))`
	`216`	`+page_layout.page_data["width"]=image_width`
	`217`	`+page_layout.page_data["height"]=image_height`
	`218`	`+all_page_layout[page_id]=page_layout`
	`219`	`+`
	`220`	`+returnall_page_layout,pdf_images`

`‎tests/fixtures/io/example.pdf`

216 KB

Binary file not shown.

`‎tests/test_io.py`

Lines changed: 14 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -14,8 +14,7 @@`
`14`	`14`
`15`	`15`	`importnumpyasnp`
`16`	`16`	`fromlayoutparser.elementsimportInterval,Rectangle,Quadrilateral,TextBlock,Layout`
`17`		`-fromlayoutparser.ioimportload_json,load_dict,load_csv`
`18`		`-`
	`17`	`+fromlayoutparserimportload_json,load_dict,load_csv,load_pdf`
`19`	`18`
`20`	`19`	`deftest_json():`
`21`	`20`
`@@ -67,4 +66,16 @@ def test_csv():`
`67`	`66`	`l2=Layout([i2,r2,q2])`
`68`	`67`
`69`	`68`	`_l2=load_csv("tests/fixtures/io/layout_textblock.csv")`
`70`		`-assert_l2==l2`
	`69`	`+assert_l2==l2`
	`70`	`+`
	`71`	`+`
	`72`	`+deftest_pdf():`
	`73`	`+pdf_layout=load_pdf("tests/fixtures/io/example.pdf")`
	`74`	`+assertlen(pdf_layout)==1`
	`75`	`+`
	`76`	`+page_layout=pdf_layout[0]`
	`77`	`+forattr_namein ["width","height","index"]:`
	`78`	`+assertattr_nameinpage_layout.page_data`
	`79`	`+`
	`80`	`+assertlen(set(ele.typeforeleinpage_layout))==3`
	`81`	`+# Only three types of font show-up in the file`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit341d3fc

File tree

8 files changed

8 files changed

`‎docs/api_doc/io.rst`

`‎setup.py`

`‎src/layoutparser/init.py`

`‎src/layoutparser/io/init.py`

`‎src/layoutparser/io.pyrenamed to ‎src/layoutparser/io/basic.py`

`‎src/layoutparser/io/pdf.py`

`‎tests/fixtures/io/example.pdf`

`‎tests/test_io.py`

0 commit comments