Quick Reference

Essential Workflows

Basic Text Extraction

from natural_pdf import PDF

pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
page = pdf.pages[0]
text = page.extract_text()

Find → Extract Pattern

# Find specific elements, then extract
heading = page.find('text:contains("Summary"):bold')
content = heading.below().extract_text()

OCR for Scanned Documents

# Apply OCR first, then extract
page.apply_ocr(engine='easyocr', languages=['en'])
text = page.extract_text()

Layout Analysis → Table Extraction

# Detect layout, then extract tables
page.analyze_layout(engine='yolo')
table_region = page.find('region[type=table]')
data = table_region.extract_table()

Common Selectors

Text Content

page.find('text:contains("Invoice")')           # Contains text
page.find('text:contains("total")', case=False) # Case-insensitive match
page.find('text:contains("\\d+")', regex=True)  # Regex pattern

Text Formatting

page.find_all('text:bold')                      # Bold text
page.find_all('text:italic')                    # Italic text
page.find_all('text:strike')                    # Struck-through text
page.find_all('text:underline')                 # Underlined text
page.find_all('text[size>=12]')                 # Large text
page.find_all('text[fontname*=Arial]')          # Specific font

Spatial Relationships

page.find('text:above("line[width>=2]")')       # Above thick line
page.find('text:below("text:contains("Title")")')  # Below title
page.find('text:near("image")')                 # Near images

Layout Elements

page.find_all('line:horizontal')                # Horizontal lines
page.find_all('rect')                           # Rectangles
page.find_all('region[type=table]')             # Detected tables
page.find_all('region[type=title]')             # Detected titles

OCR and Sources

page.find_all('text[source=ocr]')               # OCR-generated text
page.find_all('text[source=pdf]')               # Original PDF text
page.find_all('text[confidence>=0.8]')          # High-confidence OCR

Essential Methods

Finding Elements

page.find(selector)                             # First match
page.find_all(selector)                         # All matches
element.next()                                  # Next element in reading order
element.previous()                              # Previous element

Spatial Navigation

element.above(height=100)                       # Region above element
element.below(until='line:horizontal')          # Below until boundary
element.left(width=200)                         # Region to the left
element.right()                                 # Region to the right

Text Extraction

page.extract_text()                             # All text from page
page.extract_text(layout=True)                  # Preserve layout
element.extract_text()                          # Text from specific element
region.extract_text()                           # Text from region

Table Extraction

page.extract_table()                            # First table on page
region.extract_table()                          # Table from region
region.extract_table(method='tatr')             # Force TATR method
region.extract_table(method='pdfplumber')       # Force pdfplumber method

OCR

page.apply_ocr()                                # Default OCR
page.apply_ocr(engine='paddle', languages=['en', 'zh-cn'])
page.apply_ocr(engine='easyocr', min_confidence=0.8)
region.apply_ocr()                              # OCR specific region

Layout Analysis

page.analyze_layout()                           # Default YOLO
page.analyze_layout(engine='tatr')              # Table-focused
page.analyze_layout(engine='surya')             # High accuracy
page.clear_detected_layout_regions()           # Clear previous results

Document QA

result = page.ask("What is the total amount?")
print(result.answer)                            # The answer
print(result.confidence)                        # Confidence score
result.show()                                   # Highlight answer location

Structured Data Extraction

# Simple approach
data = page.extract(schema=["company", "date", "total"]).extracted()

# With a Pydantic schema
from pydantic import BaseModel
class Invoice(BaseModel):
    company: str
    total: float
    date: str

data = page.extract(schema=Invoice, client=client).extracted()

Visualization & Debugging

Highlighting

# Simple visualization
elements.show(color="red")                      # Single collection
elements.show(color="blue", label="Headers")    # With label
elements.show(group_by='type')                  # Color by type

# Multiple collections together
with page.highlights() as h:
    h.add(elements1, color="red", label="Type 1")
    h.add(elements2, color="blue", label="Type 2")
    h.show()

Viewing

page.show()                                     # Show page with highlights
element.show()                                  # Show specific element
page.show(width=700)                            # Show page at a specific width
region.show(crop=True)                          # Crop to region only

Interactive Viewer

page.viewer()                                   # Launch interactive viewer (Jupyter)

Exclusion Zones

Page-Level Exclusions

header = page.find('text:contains("CONFIDENTIAL")').above()
page.add_exclusion(header)                      # Exclude from extraction
page.clear_exclusions()                         # Remove exclusions
text = page.extract_text(use_exclusions=False)  # Ignore exclusions

PDF-Level Exclusions

# Exclude headers from all pages (full-width region covering the top 10% of each page's height)
pdf.add_exclusion(
    lambda p: p.create_region(0, 0, p.width, p.height * 0.1),
    label="Header"
)

Configuration Options

OCR Engines

from natural_pdf.ocr import EasyOCROptions, PaddleOCROptions

easy_opts = EasyOCROptions(gpu=True, paragraph=True)
paddle_opts = PaddleOCROptions(lang='en')

Layout Analysis Options

from natural_pdf.analyzers.layout import YOLOOptions

yolo_opts = YOLOOptions(confidence_threshold=0.5)
page.analyze_layout(engine='yolo', options=yolo_opts)

Common Patterns

Extract Inspection Report Data

# Find violation count
violations = page.find('text:contains("Violation Count"):right(width=100)')

# Get inspection number from the header box (regex search)
inspection_num = page.find('text:contains("INS-[A-Z0-9]+")', regex=True)

# Extract inspection date
inspection_date = page.find('text:contains("Date:"):right(width=150)')

# Get site name (text to the right of "Site:")
site_name = page.find('text:contains("Site:"):right(width=300)').extract_text()

Process Forms

# Exclude header/footer (full-width bands, 50 units tall, at the top and bottom of the page)
page.add_exclusion(page.create_region(0, 0, page.width, 50))
page.add_exclusion(page.create_region(0, page.height-50, page.width, page.height))

# Extract form fields
fields = page.find_all('text:bold')
values = [field.right(width=300).extract_text() for field in fields]

Handle Scanned Documents

# Apply OCR with high accuracy
page.apply_ocr(engine='surya', languages=['en'])

# Extract with confidence filtering
text_elements = page.find_all('text[source=ocr][confidence>=0.8]')
clean_text = text_elements.extract_text()

Troubleshooting

Problem	Solution
No text found	Try `page.apply_ocr()` first
Wrong elements selected	Use `elements.show()` to debug selectors
Poor table extraction	Try `page.analyze_layout(engine='tatr')` first
Text extraction includes headers	Use `page.add_exclusion()`
Low OCR accuracy	Try a different engine or increase the resolution
Elements overlap multiple pages	Use page-specific searches

File Formats

Saving Results

# Save as image
page.save_image("output.png", width=700)

# Save table as CSV
import pandas as pd
df = table_data.to_df(header="first")
df.to_csv("table.csv")

# Export searchable PDF
from natural_pdf.exporters import SearchablePDFExporter
exporter = SearchablePDFExporter()
exporter.export(pdf, "searchable.pdf")

Next Steps

New to Natural PDF? → Start with Installation
Learning the basics? → Follow the Tutorials
Solving specific problems? → Check the how-to guides
Need detailed info? → See the API Reference

Quick Reference

Essential Workflows

Basic Text Extraction

Find → Extract Pattern

OCR for Scanned Documents

Layout Analysis → Table Extraction

Common Selectors

Text Content

Text Formatting

Spatial Relationships

Layout Elements

OCR and Sources

Essential Methods

Finding Elements

Spatial Navigation

Text Extraction

Table Extraction

OCR

Layout Analysis

Document QA

Structured Data Extraction

Visualization & Debugging

Highlighting

Viewing

Interactive Viewer

Exclusion Zones

Page-Level Exclusions

PDF-Level Exclusions

Configuration Options

OCR Engines

Layout Analysis Options

Common Patterns

Extract Inspection Report Data

Process Forms

Handle Scanned Documents

Troubleshooting

File Formats

Saving Results

Next Steps