# Quick Reference

## Essential Workflows
### Basic Text Extraction

```python
from natural_pdf import PDF

pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
page = pdf.pages[0]
text = page.extract_text()

# Find specific elements, then extract
heading = page.find('text:contains("Summary"):bold')
content = heading.below().extract_text()
```
### OCR for Scanned Documents

```python
# Apply OCR first, then extract
page.apply_ocr(engine='easyocr', languages=['en'])
text = page.extract_text()
```
### Layout Analysis and Tables

```python
# Detect layout, then extract tables
page.analyze_layout(engine='yolo')
table_region = page.find('region[type=table]')
data = table_region.extract_table()
```
## Common Selectors

### Text Content

```python
page.find('text:contains("Invoice")')             # Contains text
page.find('text:contains("total")', case=False)   # Case insensitive
page.find('text:contains("\\d+")', regex=True)    # Regex pattern
```

### Text Formatting

```python
page.find_all('text:bold')               # Bold text
page.find_all('text:italic')             # Italic text
page.find_all('text:strike')             # Struck-through text
page.find_all('text:underline')          # Underlined text
page.find_all('text[size>=12]')          # Large text
page.find_all('text[fontname*=Arial]')   # Specific font
```

### Spatial Relationships

```python
page.find('text:above("line[width>=2]")')           # Above thick line
page.find('text:below("text:contains("Title")")')   # Below title
page.find('text:near("image")')                     # Near images
```

### Layout Elements

```python
page.find_all('line:horizontal')      # Horizontal lines
page.find_all('rect')                 # Rectangles
page.find_all('region[type=table]')   # Detected tables
page.find_all('region[type=title]')   # Detected titles
```

### OCR and Sources

```python
page.find_all('text[source=ocr]')        # OCR-generated text
page.find_all('text[source=pdf]')        # Original PDF text
page.find_all('text[confidence>=0.8]')   # High-confidence OCR
```
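Attribute filters and pseudo-classes can be chained in one selector, as the `text:contains("Summary"):bold` and `text[source=ocr][confidence>=0.8]` examples above already do. A small sketch using that pattern to pull out likely section headings (the size threshold is an arbitrary choice for illustration):

```python
# Large bold text is a reasonable heuristic for section headings
headings = page.find_all('text[size>=12]:bold')
for heading in headings:
    print(heading.extract_text())
```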
## Essential Methods

### Finding Elements

```python
page.find(selector)       # First match
page.find_all(selector)   # All matches
element.next()            # Next element in reading order
element.previous()        # Previous element
```

### Spatial Navigation

```python
element.above(height=100)                # Region above element
element.below(until='line:horizontal')   # Below until boundary
element.left(width=200)                  # Region to the left
element.right()                          # Region to the right
```
### Text Extraction

```python
page.extract_text()              # All text from page
page.extract_text(layout=True)   # Preserve layout
element.extract_text()           # Text from specific element
region.extract_text()            # Text from region
```
### Table Extraction

```python
page.extract_table()                        # First table on page
region.extract_table()                      # Table from region
region.extract_table(method='tatr')         # Force TATR method
region.extract_table(method='pdfplumber')   # Force pdfplumber method
```
### OCR

```python
page.apply_ocr()                                             # Default OCR
page.apply_ocr(engine='paddle', languages=['en', 'zh-cn'])
page.apply_ocr(engine='easyocr', min_confidence=0.8)
region.apply_ocr()                                           # OCR specific region
```
### Layout Analysis

```python
page.analyze_layout()                  # Default YOLO
page.analyze_layout(engine='tatr')     # Table-focused
page.analyze_layout(engine='surya')    # High accuracy
page.clear_detected_layout_regions()   # Clear previous results
```
### Document QA

```python
result = page.ask("What is the total amount?")
print(result.answer)       # The answer
print(result.confidence)   # Confidence score
result.show()              # Highlight answer location
```
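The same call works well in a loop when a document has several fields of interest; a minimal sketch reusing `page.ask()` and the `answer`/`confidence` attributes shown above (the question list and the 0.8 cutoff are just illustrative):

```python
questions = [
    "What is the site name?",
    "What is the inspection date?",
    "How many violations were found?",
]

answers = {}
for question in questions:
    result = page.ask(question)
    if result.confidence >= 0.8:   # keep only confident answers
        answers[question] = result.answer

print(answers)
```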
### Structured Data Extraction

```python
# Simple approach
data = page.extract(schema=["company", "date", "total"]).extracted()

# With a Pydantic schema
from pydantic import BaseModel

class Invoice(BaseModel):
    company: str
    total: float
    date: str

# client is an LLM client you configure separately
data = page.extract(schema=Invoice, client=client).extracted()
```
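To apply the same schema to every page and collect the results in one table, one option is a simple loop. This is a sketch built on the `extract()`/`extracted()` calls above; it assumes `extracted()` returns either a plain dict or a Pydantic model instance, hence the `model_dump()` fallback:

```python
import pandas as pd

rows = []
for p in pdf.pages:
    record = p.extract(schema=Invoice, client=client).extracted()
    # Normalize to a plain dict whether a dict or an Invoice instance came back
    rows.append(record if isinstance(record, dict) else record.model_dump())

df = pd.DataFrame(rows)
df.to_csv("invoices.csv", index=False)
```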
## Visualization & Debugging

### Highlighting

```python
elements.highlight(color="red")                     # Simple highlight
elements.highlight(color="blue", label="Headers")   # With label
elements.highlight(group_by='type')                 # Color by type
page.clear_highlights()                             # Remove highlights
```

### Viewing

```python
page.show()                  # Show page with highlights
element.show()               # Show specific element
page.to_image(width=700)     # Generate image
region.to_image(crop=True)   # Crop to region only
```

### Interactive Viewer

```python
page.viewer()   # Launch interactive viewer (Jupyter)
```
## Exclusion Zones

### Page-Level Exclusions

```python
header = page.find('text:contains("CONFIDENTIAL")').above()
page.add_exclusion(header)                       # Exclude from extraction
page.clear_exclusions()                          # Remove exclusions
text = page.extract_text(use_exclusions=False)   # Ignore exclusions
```

### PDF-Level Exclusions

```python
# Exclude headers from all pages
pdf.add_exclusion(
    lambda p: p.create_region(0, 0, p.width, p.height * 0.1),
    label="Header"
)
```
## Configuration Options

### OCR Engines

```python
from natural_pdf.ocr import EasyOCROptions, PaddleOCROptions

easy_opts = EasyOCROptions(gpu=True, paragraph=True)
paddle_opts = PaddleOCROptions(lang='en')
```

### Layout Analysis Options

```python
from natural_pdf.analyzers.layout import YOLOOptions

yolo_opts = YOLOOptions(confidence_threshold=0.5)
page.analyze_layout(engine='yolo', options=yolo_opts)
```
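The OCR options objects above are presumably handed to `apply_ocr()` the same way the layout options are handed to `analyze_layout()`; treat the `options=` parameter name in this sketch as an assumption based on that parallel rather than confirmed API:

```python
# Assumed: apply_ocr() accepts an options object, mirroring analyze_layout(options=...)
page.apply_ocr(engine='easyocr', options=easy_opts)
page.apply_ocr(engine='paddle', options=paddle_opts)
```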
## Common Patterns

### Extract Values Next to Labels

```python
# Find violation count (text to the right of the label)
violations = page.find('text:contains("Violation Count")').right(width=100).extract_text()

# Get inspection number from the header box (regex search)
inspection_num = page.find('text:contains("INS-[A-Z0-9]+")', regex=True)

# Extract inspection date
inspection_date = page.find('text:contains("Date:")').right(width=150).extract_text()

# Get site name (text to the right of "Site:")
site_name = page.find('text:contains("Site:")').right(width=300).extract_text()
```

### Exclude Headers and Footers

```python
page.add_exclusion(page.create_region(0, 0, page.width, 50))
page.add_exclusion(page.create_region(0, page.height - 50, page.width, page.height))
```

### Extract Form Fields

```python
fields = page.find_all('text:bold')
values = [field.right(width=300).extract_text() for field in fields]
```
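To keep each label with its value, the two lists above can be zipped into a dictionary; a minimal sketch building on the `fields` and `values` variables (the whitespace and trailing-colon cleanup is an illustrative choice, not library behavior):

```python
# Pair each bold label with the text found to its right
form_data = {
    field.extract_text().strip().rstrip(':'): value.strip()
    for field, value in zip(fields, values)
}
print(form_data)
```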
### Handle Scanned Documents

```python
# Apply OCR with high accuracy
page.apply_ocr(engine='surya', languages=['en'])

# Extract with confidence filtering
text_elements = page.find_all('text[source=ocr][confidence>=0.8]')
clean_text = text_elements.extract_text()
```
## Troubleshooting

| Problem | Solution |
|---------|----------|
| No text found | Try `page.apply_ocr()` first |
| Wrong elements selected | Use `elements.show()` to debug selectors |
| Poor table extraction | Try `page.analyze_layout(engine='tatr')` first |
| Text extraction includes headers | Use `page.add_exclusion()` |
| Low OCR accuracy | Try a different engine or increase resolution |
| Elements overlap multiple pages | Use page-specific searches |
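When a selector comes back empty or noisy, it usually helps to check the page step by step. A short debugging sketch combining the fixes from the table above (the bold-text selector and the 50-point header band are placeholders for whatever you are actually hunting):

```python
# 1. Any text at all? If not, the page is probably scanned; apply OCR.
if not page.extract_text().strip():
    page.apply_ocr()

# 2. What did the selector actually match? Show it on the page.
matches = page.find_all('text:bold')
matches.show()

# 3. Still picking up headers? Exclude them and re-extract.
page.add_exclusion(page.create_region(0, 0, page.width, 50))
print(page.extract_text())
```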
## Saving Results

```python
# Save as image
page.save_image("output.png", width=700)

# Save a table as CSV (table_data comes from extract_table(), e.g. table_data = region.extract_table())
import pandas as pd
df = table_data.to_df(header="first")
df.to_csv("table.csv")

# Export a searchable PDF
from natural_pdf.exporters import SearchablePDFExporter
exporter = SearchablePDFExporter()
exporter.export(pdf, "searchable.pdf")
```
## Next Steps

- New to Natural PDF? → Start with Installation
- Learning the basics? → Follow the Tutorials
- Solving specific problems? → Check the how-to guides
- Need detailed info? → See the API Reference