Download notebook

In [ ]:

# Install natural-pdf (0.6.4 or newer) together with its "all" extras,
# upgrading any existing copy and keeping pip's output quiet.
# The leading "!" is notebook shell-escape syntax, so this cell only works
# inside a Jupyter/IPython session, not as a plain Python script.
!pip install --upgrade --quiet 'natural-pdf[all]>=0.6.4'

# NOTE(review): a "!" command does not appear to raise when it fails, so this
# message is presumably printed even if the install did not succeed — confirm.
print('✓ Packages installed!')

In [ ]:

from natural_pdf import PDF

# Sample document hosted on GitHub; the URL is handed straight to PDF().
sample_url = "https://github.com/jsoma/ire25-natural-pdf/raw/refs/heads/main/Atlanta_Public_Schools_GA_sample.pdf"
pdf = PDF(sample_url)

# Leaving the object as the cell's last expression makes the notebook show it.
pdf

In [ ]:

# Work with the first page of the document (index 0).
page = pdf.pages[0]
# show() presumably renders a preview of the page for a visual check —
# confirm against the natural-pdf documentation.
page.show()

We'll add two exclusion zones because we aren't interested in the top and bottom of the pages.

In [ ]:

# Register two exclusion zones on the PDF. Each lambda receives a page and
# returns the region of that page to ignore.
# Top zone: the area above the 'line' element matched by width>=2
# (find() presumably returns the first match — confirm).
pdf.add_exclusion(lambda page: page.find('line[width>=2]').above())
# Bottom zone: the area below the last 'line' element returned by find_all().
pdf.add_exclusion(lambda page: page.find_all('line')[-1].below())

In [ ]:

# Look at every text element on the page. inspect() presumably lists each
# element with its attributes (confirm), which helps to discover values such
# as font_variant and size that the selectors in later cells rely on.
page.find_all('text').inspect()

In [ ]:

# Select the text elements whose font_variant is AAAAAB and whose size is 10;
# judging by the variable name, these are the book titles.
titles = page.find_all('text[font_variant=AAAAAB][size=10]')
# Display the matched elements to check the selection visually.
titles.show()

In [ ]:

# Build one section per title: start at the title and extend downward until
# the next element matching the same title selector.
next_title = 'text[font_variant=AAAAAB][size=10]'
books = titles.below(
    until=next_title,
    include_source=True,     # the starting title is part of the section
    include_endpoint=False,  # the next title is not
)
books.show()

In [ ]:

# Find the "Site" label in every section, take what lies below it, and clip
# that to the section it belongs to.
site_labels = books.find('text:contains(Site)')
site_areas = site_labels.below().clip(books)
# Keep only size-10 text whose left edge (x0) is less than 47.
site_areas.find_all('text[x0<47][size=10]')

In [ ]:

# For one section (index 3): look upward until a text element with size > 10
# is reached, then display the element at which that search stopped.
books[3].above(until='text[size>10]').endpoint.show()

In [ ]:

# Same upward search applied to every section at once: display the stopping
# element (text with size > 10) found above each one.
books.above(until='text[size>10]').endpoints.show()

In [ ]:

import pandas as pd

# Each column below is expected to hold one value per section in `books`;
# pandas requires all columns to have the same length.

# Fields read from the text of the element that contains the label.
authors = books.find('text:contains(Author)').extract_each_text()
isbns = books.find('text:contains(ISBN)').extract_each_text()
published = books.find('text:contains(Published)').extract_each_text()

# Site: everything below the "Site" label, clipped to its section, reduced to
# size-10 text with x0 < 47 and joined into one string per section.
sites = (
    books.find('text:contains(Site)')
    .below()
    .clip(books)
    .apply(lambda region: region.find_all('text[x0<47][size=10]').extract_text())
)

# Fields read from a 12-unit-high box directly beneath the label, as wide as
# the label and then extended to the right by a per-field amount.
barcodes = (
    books.find('text:contains(Barcode)')
    .below(width='element', height=12)
    .expand(right=50)
    .extract_each_text()
)
prices = (
    books.find('text:contains(Price)')
    .below(width='element', height=12)
    .expand(right=50)
    .extract_each_text()
)
acquired = (
    books.find('text:contains(Acquired)')
    .below(width='element', height=12)
    .expand(right=10)
    .extract_each_text()
)
removed_by = (
    books.find('text:contains(Removed By)')
    .below(width='element', height=12)
    .expand(right=40)
    .extract_each_text()
)

# Date removed: the text element with size > 10 found above each section.
dates_removed = books.above(until='text[size>10]').endpoints.extract_each_text()

df = pd.DataFrame({
    'author': authors,
    'isbn': isbns,
    'published': published,
    'site': sites,
    'barcode': barcodes,
    'price': prices,
    'acquired': acquired,
    'removed_by': removed_by,
    'date_removed': dates_removed,
})
df