Extract Data from Forms and Invoices

You have a stack of invoices, forms, or structured documents where you need to pull out specific pieces of information - invoice numbers, totals, dates, names, etc. Here's how to automate that extraction.

The Problem

Manual data entry from PDFs is slow and error-prone. You need to: - Extract the same fields from hundreds of similar documents - Handle slight variations in layout between documents
- Get structured data you can actually work with - Maintain accuracy while processing quickly

Quick Solution: List the Fields You Want

Don't overthink it - just tell Natural PDF what information you're looking for:

from natural_pdf import PDF

pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
page = pdf.pages[0]

# Extract data using a simple list that matches the inspection report columns
data = page.extract(schema=["site", "violation count", "date", "inspection number", "summary"]).extracted()

# Access the extracted information
print(f"Site: {data.site}")
print(f"Violations: {data.violation_count}")
print(f"Date: {data.date}")
print(f"Inspection #: {data.inspection_number}")

# Check confidence levels
print(f"Confidence – Site: {data.site_confidence:.2f}")
print(f"Confidence – Violations: {data.violation_count_confidence:.2f}")

This works completely offline using document question-answering models.

For Complex Data: Use Pydantic Schemas

When you need more control over data types and validation:

from pydantic import BaseModel, Field
from openai import OpenAI

# Define exactly what you want to extract for the inspection report
class InspectionReport(BaseModel):
    site_name: str = Field(description="Name of the inspection site")
    violation_count: int = Field(description="Number of violations found")
    inspection_date: str = Field(description="Inspection date in any format")
    inspection_number: str = Field(description="Inspection reference ID")
    summary: str = Field(description="Inspection summary paragraph")

# Set up LLM client (using Anthropic here)
client = OpenAI(
    api_key="your-api-key",
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

# Extract structured data
page.extract(schema=InspectionReport, client=client, model="gemini-2.0-flash")

try:
    report_data = page.extracted()
    print(f"Site: {report_data.site_name}")
    print(f"Violations: {report_data.violation_count}")
    print(f"Inspection #: {report_data.inspection_number}")
except Exception as e:
    print("Extraction failed with error", e)

Handle Different Document Layouts

For documents that vary in structure, use spatial hints:

# Sometimes data is in specific areas of the page
header_region = page.create_region(0, 0, page.width, page.height * 0.3)
footer_region = page.create_region(0, page.height * 0.7, page.width, page.height)

# Extract company info from header
company_data = header_region.extract(
    schema=["company name", "address", "phone"]
).extracted()

# Extract totals from footer  
totals_data = footer_region.extract(
    schema=["subtotal", "tax", "total"]
).extracted()

Process Multiple Documents

Batch process a folder of similar documents:

import os
from pathlib import Path

# Define your extraction schema
class FormData(BaseModel):
    applicant_name: str
    application_date: str  
    reference_number: str
    status: str = Field(default="unknown")

# Process all PDFs in a folder
form_results = []
pdf_folder = Path("forms/")

for pdf_file in pdf_folder.glob("*.pdf"):
    print(f"Processing {pdf_file.name}...")

    pdf = PDF(str(pdf_file))
    page = pdf.pages[0]  # Assuming single-page forms

    # Extract data
    page.extract(schema=FormData, client=client)
    data = page.extracted()

    # Add filename for tracking
    result = {
        "filename": pdf_file.name,
        "applicant_name": data.applicant_name,
        "application_date": data.application_date,
        "reference_number": data.reference_number,
        "status": data.status
    }
    form_results.append(result)

    pdf.close()  # Clean up

# Save results to CSV
import pandas as pd
df = pd.DataFrame(form_results)
df.to_csv("extracted_form_data.csv", index=False)
print(f"Processed {len(form_results)} forms")

Handle Scanned Documents

For image-based PDFs, apply OCR first:

# Apply OCR before extraction
page.apply_ocr(engine='easyocr', languages=['en'])

# Filter out low-confidence OCR text to avoid noise
reliable_text = page.find_all('text[source=ocr][confidence>=0.8]')
print(f"Using {len(reliable_text)} high-confidence OCR elements")

# Now extract data (works on OCR'd text)
data = page.extract(schema=["invoice number", "total", "date"]).extracted()

Common Form Patterns

Validation and Error Handling

Check your extracted data for common issues:

def validate_invoice_data(data):
    issues = []

    # Check for missing required fields
    if not data.invoice_number or data.invoice_number.strip() == "":
        issues.append("Missing invoice number")

    # Validate amounts
    if data.total_amount <= 0:
        issues.append("Invalid total amount")

    # Check date format
    try:
        from datetime import datetime
        datetime.strptime(data.invoice_date, "%Y-%m-%d")
    except ValueError:
        # Try common date formats
        common_formats = ["%m/%d/%Y", "%d/%m/%Y", "%B %d, %Y"]
        date_valid = False
        for fmt in common_formats:
            try:
                datetime.strptime(data.invoice_date, fmt)
                date_valid = True
                break
            except ValueError:
                continue
        if not date_valid:
            issues.append(f"Invalid date format: {data.invoice_date}")

    return issues

# Validate extracted data
validation_issues = validate_invoice_data(invoice_data)
if validation_issues:
    print("Data quality issues found:")
    for issue in validation_issues:
        print(f"- {issue}")
else:
    print("Data validation passed!")

Improve Accuracy with Context

Give the AI more context for better extraction:

# Add context about the document type
extraction_prompt = """
This is a medical insurance claim form. 
Extract the following information, paying attention to:
- Policy numbers are usually 10-12 digits
- Claim amounts should be in dollars
- Dates should be in MM/DD/YYYY format
- Provider names are usually at the top of the form
"""

class InsuranceClaim(BaseModel):
    policy_number: str = Field(description="Insurance policy number (10-12 digits)")
    claim_amount: float = Field(description="Total claim amount in USD")
    service_date: str = Field(description="Date of service in MM/DD/YYYY format")
    provider_name: str = Field(description="Healthcare provider name")
    patient_name: str = Field(description="Patient full name")

# Use custom prompt for better results
page.extract(
    schema=InsuranceClaim, 
    client=client,
    prompt=extraction_prompt
)

Debug Extraction Issues

When extraction isn't working well:

# 1. Check what text the AI can actually see
extracted_text = page.extract_text()
print("Available text:")
print(extracted_text[:500])  # First 500 characters

# 2. Try extracting with lower confidence threshold
data = page.extract(
    schema=["invoice number", "total"], 
    min_confidence=0.5  # Lower threshold
).extracted()

# 3. Check confidence scores for each field
for field_name in data.__fields__:
    confidence_field = f"{field_name}_confidence"
    if hasattr(data, confidence_field):
        confidence = getattr(data, confidence_field)
        value = getattr(data, field_name)
        print(f"{field_name}: '{value}' (confidence: {confidence:.2f})")

# 4. Try vision mode if text mode fails
if any(getattr(data, f"{field}_confidence", 0) < 0.7 for field in ["invoice_number", "total"]):
    print("Low confidence detected, trying vision mode...")
    page.extract(schema=["invoice number", "total"], client=client, using='vision')
    data = page.extracted()