# Install required packages
!pip install --upgrade --quiet 'natural-pdf[all]>=0.5.0'
print('✓ Packages installed!')
Slides: slides.pdf
from natural_pdf import PDF
# Open the sample PDF straight from its GitHub URL.
pdf = PDF("https://github.com/jsoma/ire25-natural-pdf/raw/refs/heads/main/Atlanta_Public_Schools_GA_sample.pdf")
# Bare expression: when it ends a notebook cell, the PDF object's repr is displayed.
pdf
# Everything below works on the first page (index 0) only.
page = pdf.pages[0]
# Preview the page so its layout can be checked by eye.
page.show()
We'll add two exclusion zones because we aren't interested in the top and bottom of the pages.
# Exclusions are registered on the PDF as callables that receive a page.
# Top of the page: the area above the match for 'line[width>=2]'.
pdf.add_exclusion(lambda page: page.find('line[width>=2]').above())
# Bottom of the page: the area below the last item returned by find_all('line').
pdf.add_exclusion(lambda page: page.find_all('line')[-1].below())
# Inspect every text element on the page (presumably to discover attributes
# such as font_variant and size that the selectors below rely on).
page.find_all('text').inspect()
# The same selector picks out the title elements and marks where each
# region stops, so it is spelled once and reused.
title_selector = 'text[font_variant=AAAAAB][size=10]'

titles = page.find_all(title_selector)
titles.show()

# From each title, extend downward until the next title match; the starting
# title is kept (include_source=True), the stopping one is not
# (include_endpoint=False).
books = titles.below(
    until=title_selector,
    include_source=True,
    include_endpoint=False,
)
books.show()
# Exploratory lookup: below the 'Site' label, clipped to the book regions,
# keep size-10 text whose x0 is less than 47. The same chain feeds the
# 'site' column when the table is built.
books.find('text:contains(Site)').below().clip(books).find_all('text[x0<47][size=10]')
# For a single book (index 3): look upward until text with size > 10 and
# preview the element where that search stopped.
books[3].above(until='text[size>10]').endpoint.show()
# Same upward search for every book, previewing all the stopping elements.
books.above(until='text[size>10]').endpoints.show()
import pandas as pd
def _label_text(selector):
    """Return extract_each_text() of whatever books.find(selector) matches."""
    return books.find(selector).extract_each_text()


def _text_under(selector, pad_right):
    """Return text from the strip under each label match.

    The strip is taken with below(width='element', height=12) and then
    expanded to the right by pad_right before the text is extracted.
    """
    strip = books.find(selector).below(width='element', height=12)
    return strip.expand(right=pad_right).extract_each_text()


def _site_text(area):
    """Return the text of the size-10 elements with x0 < 47 inside area."""
    return area.find_all('text[x0<47][size=10]').extract_text()


# Columns are filled in the order they appear in the resulting table.
columns = {}
columns['author'] = _label_text('text:contains(Author)')
columns['isbn'] = _label_text('text:contains(ISBN)')
columns['published'] = _label_text('text:contains(Published)')

# 'site' values sit below their label, clipped to the book regions.
site_areas = books.find('text:contains(Site)').below().clip(books)
columns['site'] = site_areas.apply(_site_text)

# These values sit directly under their labels; each needs a different
# amount of extra room on the right.
columns['barcode'] = _text_under('text:contains(Barcode)', 50)
columns['price'] = _text_under('text:contains(Price)', 50)
columns['acquired'] = _text_under('text:contains(Acquired)', 10)
columns['removed_by'] = _text_under('text:contains(Removed By)', 40)

# The removal date comes from the element that stops an upward search for
# text with size > 10.
columns['date_removed'] = books.above(until='text[size>10]').endpoints.extract_each_text()

df = pd.DataFrame(columns)
df
# Optional check: uncomment to preview the elements matched by the section selector.
#page.find_all('text[size>10]').show()
# Split the page with get_sections, passing 'text[size>10]' as the selector
# (presumably the larger text acts as section headings; confirm in the library docs).
sections = page.get_sections('text[size>10]')
# Preview the first section.
sections[0].show()