In [1]:
Copied!
from natural_pdf import PDF
# Location of the sample document used throughout this walkthrough
PDF_URL = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/0500000US42001.pdf"

# Load the document; the resulting `pdf` object is what the later cells operate on
pdf = PDF(PDF_URL)
from natural_pdf import PDF
# Location of the sample document used throughout this walkthrough
PDF_URL = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/0500000US42001.pdf"

# Load the document; the resulting `pdf` object is what the later cells operate on
pdf = PDF(PDF_URL)
## Accessing Pages
Once you have a PDF object, you can access its pages:
In [2]:
Copied!
# Total number of pages in the document
num_pages = len(pdf)
print(f"This PDF has {num_pages} pages")

# Fetch individual pages by position (0-indexed; -1 addresses the last page)
first_page = pdf.pages[0]
last_page = pdf.pages[-1]

# Report the extracted-text length for each of the first 20 pages.
# The loop body must be indented; `page` stays bound to the last page
# visited once the loop finishes, and the next cell relies on that.
for page in pdf.pages[:20]:
    print(f"Page {page.number} has {len(page.extract_text())} characters")
# Total number of pages in the document
num_pages = len(pdf)
print(f"This PDF has {num_pages} pages")

# Fetch individual pages by position (0-indexed; -1 addresses the last page)
first_page = pdf.pages[0]
last_page = pdf.pages[-1]

# Report the extracted-text length for each of the first 20 pages.
# The loop body must be indented; `page` stays bound to the last page
# visited once the loop finishes, and the next cell relies on that.
for page in pdf.pages[:20]:
    print(f"Page {page.number} has {len(page.extract_text())} characters")
This PDF has 153 pages Page 1 has 985 characters Page 2 has 778 characters Page 3 has 522 characters Page 4 has 984 characters Page 5 has 778 characters Page 6 has 523 characters Page 7 has 982 characters Page 8 has 772 characters Page 9 has 522 characters Page 10 has 1008 characters Page 11 has 796 characters Page 12 has 532 characters Page 13 has 986 characters Page 14 has 780 characters Page 15 has 523 characters Page 16 has 990 characters Page 17 has 782 characters Page 18 has 520 characters Page 19 has 1006 characters Page 20 has 795 characters
## Page Properties
Each `Page` object has useful properties:
In [3]:
Copied!
# `page` here is whatever the preceding cell's loop left bound (its last page)

# Page size, measured in points (1/72 inch)
width_pt, height_pt = page.width, page.height
print(width_pt, height_pt)

# 1-indexed page number, as a PDF viewer would show it
page_label = page.number
print(page_label)

# 0-indexed position of the page within the document
position = page.index
print(position)
# `page` here is whatever the preceding cell's loop left bound (its last page)

# Page size, measured in points (1/72 inch)
width_pt, height_pt = page.width, page.height
print(width_pt, height_pt)

# 1-indexed page number, as a PDF viewer would show it
page_label = page.number
print(page_label)

# 0-indexed position of the page within the document
position = page.index
print(position)
612 792 20 19
## Working Across Pages
Natural PDF makes it easy to work with content across multiple pages:
In [4]:
Copied!
# Extract text from every page in the document
all_text = pdf.extract_text()

# Find matching elements across all pages
all_headings = pdf.find_all('text[size>=14]:bold')


def confidential_header(page):
    """Return the area above the "CONFIDENTIAL" marker on ``page``, or None.

    The page is searched once for the marker text. When it is found, the
    result of calling ``.above()`` on the match is returned; when it is not,
    None is returned (presumably meaning "nothing to exclude on this page" --
    confirm against the ``add_exclusion`` documentation).
    """
    marker = page.find('text:contains("CONFIDENTIAL")')
    return marker.above() if marker else None


# Add exclusion zones to all pages (like headers/footers).
# Kept as the final expression so the cell still displays the call's return value.
pdf.add_exclusion(confidential_header, label="header")
# Extract text from every page in the document
all_text = pdf.extract_text()

# Find matching elements across all pages
all_headings = pdf.find_all('text[size>=14]:bold')


def confidential_header(page):
    """Return the area above the "CONFIDENTIAL" marker on ``page``, or None.

    The page is searched once for the marker text. When it is found, the
    result of calling ``.above()`` on the match is returned; when it is not,
    None is returned (presumably meaning "nothing to exclude on this page" --
    confirm against the ``add_exclusion`` documentation).
    """
    marker = page.find('text:contains("CONFIDENTIAL")')
    return marker.above() if marker else None


# Add exclusion zones to all pages (like headers/footers).
# Kept as the final expression so the cell still displays the call's return value.
pdf.add_exclusion(confidential_header, label="header")
Out[4]:
<natural_pdf.core.pdf.PDF at 0x109bee9b0>
## The Page Collection
The `pdf.pages` object is a `PageCollection` that allows batch operations on pages:
In [5]:
Copied!
# Work with a slice of the page collection rather than the whole document
page_subset = pdf.pages[2:5]

# Text of just those pages
text = page_subset.extract_text()

# Elements matching the selector within just those pages
elements = page_subset.find_all('text:contains("Annual Report")')
# Work with a slice of the page collection rather than the whole document
page_subset = pdf.pages[2:5]

# Text of just those pages
text = page_subset.extract_text()

# Elements matching the selector within just those pages
elements = page_subset.find_all('text:contains("Annual Report")')
2025-05-06T15:29:28.620225Z [warning ] Ignoring unsupported layout keyword argument: 'apply_exclusions' lineno=64 module=natural_pdf.utils.text_extraction
[2025-05-06 11:29:28,620] [ WARNING] text_extraction.py:64 - Ignoring unsupported layout keyword argument: 'apply_exclusions'
2025-05-06T15:29:28.631200Z [warning ] Ignoring unsupported layout keyword argument: 'apply_exclusions' lineno=64 module=natural_pdf.utils.text_extraction
[2025-05-06 11:29:28,631] [ WARNING] text_extraction.py:64 - Ignoring unsupported layout keyword argument: 'apply_exclusions'
2025-05-06T15:29:28.640121Z [warning ] Ignoring unsupported layout keyword argument: 'apply_exclusions' lineno=64 module=natural_pdf.utils.text_extraction
[2025-05-06 11:29:28,640] [ WARNING] text_extraction.py:64 - Ignoring unsupported layout keyword argument: 'apply_exclusions'
## Document Sections Across Pages
You can extract sections that span multiple pages:
In [6]:
Copied!
# Selector for the text elements that mark where each section begins
heading_selector = 'text[size>=14]:bold'

# Build sections over the whole page collection
sections = pdf.pages.get_sections(
    start_elements=heading_selector,
    # presumably lets a section continue past a page boundary -- confirm in the API docs
    new_section_on_page_break=False,
)
# Selector for the text elements that mark where each section begins
heading_selector = 'text[size>=14]:bold'

# Build sections over the whole page collection
sections = pdf.pages.get_sections(
    start_elements=heading_selector,
    # presumably lets a section continue past a page boundary -- confirm in the API docs
    new_section_on_page_break=False,
)
## Next Steps
Now that you know how to navigate PDFs, you can: