Working with Regions¶
Regions are rectangular areas on a page that let you focus on specific parts of a document. They're perfect for extracting text from defined areas, finding elements within certain boundaries, and working with document sections.
In [1]:
Copied!
#%pip install "natural-pdf[all]"
#%pip install "natural-pdf[all]"
In [2]:
Copied!
from natural_pdf import PDF

# Open the practice PDF from its GitHub URL and take the first page
PDF_URL = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf"
pdf = PDF(PDF_URL)
page = pdf.pages[0]

# Bounding box for a band near the top of the page,
# inset 50 from the left and right page edges
x0, y0 = 50, 100               # left, top
x1, y1 = page.width - 50, 200  # right, bottom
top_region = page.create_region(x0, y0, x1, y1)

# Last expression in the cell, so the notebook renders the result
top_region.show(color="blue")
from natural_pdf import PDF

# Open the practice PDF from its GitHub URL and take the first page
PDF_URL = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf"
pdf = PDF(PDF_URL)
page = pdf.pages[0]

# Bounding box for a band near the top of the page,
# inset 50 from the left and right page edges
x0, y0 = 50, 100               # left, top
x1, y1 = page.width - 50, 200  # right, bottom
top_region = page.create_region(x0, y0, x1, y1)

# Last expression in the cell, so the notebook renders the result
top_region.show(color="blue")
Out[2]:
In [3]:
Copied!
# Pull out the text that falls inside top_region and echo it as the cell result
top_region_text = top_region.extract_text()
top_region_text
# Pull out the text that falls inside top_region and echo it as the cell result
top_region_text = top_region.extract_text()
top_region_text
Out[3]:
'Date: February 3, 1905\nViolation Count: 7\nSummary: Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.\nThese people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary\nvisitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in\nsome of which there were open vats near the level of the floor, their peculiar trouble was that they fell'
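Creating Regions from Elements¶
Regions can also be created relative to an element you have already found: `.below()`, `.right()`, and `.above()` each return a region positioned next to that element.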
In [4]:
Copied!
# Anchor element: the text containing "Jungle Health"
title = page.find('text:contains("Jungle Health")')

# Three regions positioned relative to the anchor
below_title = title.below(height=100)
right_of_title = title.right(width=200)
above_title = title.above(height=50)

# Reset existing highlights, then add one highlight per region (same order as listed)
page.clear_highlights()
for region, color, label in [
    (below_title, "green", "Below"),
    (right_of_title, "red", "Right"),
    (above_title, "orange", "Above"),
]:
    region.highlight(color=color, label=label)

# Render the page as the cell result
page.to_image()
# Anchor element: the text containing "Jungle Health"
title = page.find('text:contains("Jungle Health")')

# Three regions positioned relative to the anchor
below_title = title.below(height=100)
right_of_title = title.right(width=200)
above_title = title.above(height=50)

# Reset existing highlights, then add one highlight per region (same order as listed)
page.clear_highlights()
for region, color, label in [
    (below_title, "green", "Below"),
    (right_of_title, "red", "Right"),
    (above_title, "orange", "Above"),
]:
    region.highlight(color=color, label=label)

# Render the page as the cell result
page.to_image()
Out[4]:
In [5]:
Copied!
# Text contained in the region directly below the title, echoed as the cell result
below_title_text = below_title.extract_text()
below_title_text
# Text contained in the region directly below the title, echoed as the cell result
below_title_text = below_title.extract_text()
below_title_text
Out[5]:
'INS-UP70N51NCL41R\nSite: Durham’s Meatpacking Chicago, Ill.\nDate: February 3, 1905\nViolation Count: 7'
Finding Elements Within Regions¶
In [6]:
Copied!
# Create a region for a specific document section
form_region = page.create_region(50, 100, page.width - 50, 300)

# Find elements only within this region (the search is called on the region, not the page)
labels = form_region.find_all('text:contains(":")')

# Visualize the region and the elements found.
# A notebook only renders a cell's LAST expression automatically, so the region
# image must be passed to display() explicitly or it is silently discarded.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(
    form_region.show(
        color=(0, 0, 1, 0.2),
        label="Form Region",
    )
)

# Last expression: rendered as the cell's output
labels.show(color="purple", label="Labels")
# Create a region for a specific document section
form_region = page.create_region(50, 100, page.width - 50, 300)

# Find elements only within this region (the search is called on the region, not the page)
labels = form_region.find_all('text:contains(":")')

# Visualize the region and the elements found.
# A notebook only renders a cell's LAST expression automatically, so the region
# image must be passed to display() explicitly or it is silently discarded.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(
    form_region.show(
        color=(0, 0, 1, 0.2),
        label="Form Region",
    )
)

# Last expression: rendered as the cell's output
labels.show(color="purple", label="Labels")
Out[6]:
Expanding and Adjusting Regions¶
In [7]:
Copied!
# Element whose surroundings we want to capture
element = page.find('text:contains("Summary:")')

# Zero-sized expansion: a region that hugs the element exactly
tight_region = element.expand(0, 0, 0, 0)

# Grow the tight region by a different amount on each side (in points)
padding = {
    "left": 10,     # 10 points to the left
    "right": 200,   # 200 points to the right
    "top": 5,       # 5 points above
    "bottom": 100,  # 100 points below
}
expanded_region = tight_region.expand(**padding)

# Highlight both regions on a clean page, then render the page as the cell result
page.clear_highlights()
for region, color, label in [
    (tight_region, "red", "Original"),
    (expanded_region, "blue", "Expanded"),
]:
    region.highlight(color=color, label=label)
page.to_image()
# Element whose surroundings we want to capture
element = page.find('text:contains("Summary:")')

# Zero-sized expansion: a region that hugs the element exactly
tight_region = element.expand(0, 0, 0, 0)

# Grow the tight region by a different amount on each side (in points)
padding = {
    "left": 10,     # 10 points to the left
    "right": 200,   # 200 points to the right
    "top": 5,       # 5 points above
    "bottom": 100,  # 100 points below
}
expanded_region = tight_region.expand(**padding)

# Highlight both regions on a clean page, then render the page as the cell result
page.clear_highlights()
for region, color, label in [
    (tight_region, "red", "Original"),
    (expanded_region, "blue", "Expanded"),
]:
    region.highlight(color=color, label=label)
page.to_image()
Out[7]:
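Creating Bounded Regions¶
Instead of giving a fixed size, you can bound a region by two elements: `start_elem.until(end_elem)` creates a region from a start element to an end element.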
In [8]:
Copied!
# Find two elements to serve as boundaries
start_elem = page.find('text:contains("Summary:")')
end_elem = page.find('text:contains("Violations")')

# Create a region from start to end element
# NOTE(review): the output recorded for this cell begins with the page title
# rather than with "Summary:" — confirm that until() accepts an element here
# and bounds the region as intended.
bounded_region = start_elem.until(end_elem)

# Visualize the bounded region.
# A notebook only renders a cell's LAST expression automatically, so the image
# must be passed to display() explicitly or it is silently discarded.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(bounded_region.show(color="green", label="Bounded Region"))

# Extract text from this bounded region (first 200 characters as a preview)
bounded_region.extract_text()[:200] + "..."
# Find two elements to serve as boundaries
start_elem = page.find('text:contains("Summary:")')
end_elem = page.find('text:contains("Violations")')

# Create a region from start to end element
# NOTE(review): the output recorded for this cell begins with the page title
# rather than with "Summary:" — confirm that until() accepts an element here
# and bounds the region as intended.
bounded_region = start_elem.until(end_elem)

# Visualize the bounded region.
# A notebook only renders a cell's LAST expression automatically, so the image
# must be passed to display() explicitly or it is silently discarded.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(bounded_region.show(color="green", label="Bounded Region"))

# Extract text from this bounded region (first 200 characters as a preview)
bounded_region.extract_text()[:200] + "..."
Out[8]:
'Jungle Health and Safety Inspection Service\nINS-UP70N51NCL41R\nSite: Durham’s Meatpacking Chicago, Ill.\nDate: February 3, 1905\nViolation Count: 7\nSummary: Worst of any, however, were the fertilizer men...'
Working with Multiple Regions¶
In [9]:
Copied!
# Define multiple regions to extract different parts of the document
header_region = page.create_region(0, 0, page.width, 100)
# NOTE(review): the recorded output for "main" starts mid-word ('ruary 3, 1905'),
# so the x=100 left edge appears to clip characters — confirm the inset is intended.
main_region = page.create_region(100, 100, page.width - 100, page.height - 150)
footer_region = page.create_region(0, page.height - 50, page.width, page.height)

# Visualize all regions.
# A notebook only renders a cell's LAST expression automatically, so each image
# must be passed to display() explicitly or it is silently discarded.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(header_region.show(color="blue", label="Header"))
display(main_region.show(color="green", label="Main Content"))
display(footer_region.show(color="red", label="Footer"))

# Extract content from each region (main content is shortened to a 100-character preview)
document_parts = {
    "header": header_region.extract_text(),
    "main": main_region.extract_text()[:100] + "...",
    "footer": footer_region.extract_text(),
}

# Show what we extracted
document_parts
# Define multiple regions to extract different parts of the document
header_region = page.create_region(0, 0, page.width, 100)
# NOTE(review): the recorded output for "main" starts mid-word ('ruary 3, 1905'),
# so the x=100 left edge appears to clip characters — confirm the inset is intended.
main_region = page.create_region(100, 100, page.width - 100, page.height - 150)
footer_region = page.create_region(0, page.height - 50, page.width, page.height)

# Visualize all regions.
# A notebook only renders a cell's LAST expression automatically, so each image
# must be passed to display() explicitly or it is silently discarded.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(header_region.show(color="blue", label="Header"))
display(main_region.show(color="green", label="Main Content"))
display(footer_region.show(color="red", label="Footer"))

# Extract content from each region (main content is shortened to a 100-character preview)
document_parts = {
    "header": header_region.extract_text(),
    "main": main_region.extract_text()[:100] + "...",
    "footer": footer_region.extract_text(),
}

# Show what we extracted
document_parts
Out[9]:
{'header': 'Jungle Health and Safety Inspection Service\nINS-UP70N51NCL41R\nSite: Durham’s Meatpacking Chicago, Ill.', 'main': 'ruary 3, 1905\nCount: 7\nWorst of any, however, were the fertilizer men, and those who served in the c...', 'footer': 'Jungle Health and Safety Inspection Service'}
Creating an Image of a Region¶
In [10]:
Copied!
# Find a region of interest: the 100-high area directly below the "Statute" text
table_header = page.find('text:contains("Statute")')
table_region = table_header.below(height=100)

# Visualize the region.
# A notebook only renders a cell's LAST expression automatically, so the image
# must be passed to display() explicitly or it is silently discarded.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(table_region.show(color="purple", label="Table Region"))

# Create an image of just this region (last expression: rendered as the cell's output)
table_region.to_image(resolution=150)
# Find a region of interest: the 100-high area directly below the "Statute" text
table_header = page.find('text:contains("Statute")')
table_region = table_header.below(height=100)

# Visualize the region.
# A notebook only renders a cell's LAST expression automatically, so the image
# must be passed to display() explicitly or it is silently discarded.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(table_region.show(color="purple", label="Table Region"))

# Create an image of just this region (last expression: rendered as the cell's output)
table_region.to_image(resolution=150)
Out[10]:
Regions allow you to precisely target specific parts of a document for extraction and analysis. They're essential for handling complex document layouts and isolating the exact content you need.