Skip to content

Messy Tables

Extract and clean tables with multi-line cells, continuation rows, merged data, and missing values.

When to use this pattern:

  • Police/incident logs with multi-line descriptions
  • Government data dumps with inconsistent formatting
  • Legacy database exports
  • Any table where one logical row spans multiple visual rows

The Problem

You have a table where:

  • Descriptions wrap across multiple lines
  • Some rows are "continuations" of the previous row
  • Multiple values appear in one cell (e.g., multiple officers)
  • Some cells are empty or marked with placeholders

Sample PDF

This tutorial uses pdfs/cookbook/incident_log.pdf - an incident log with all these issues.

import natural_pdf as npdf

pdf = npdf.PDF("pdfs/cookbook/incident_log.pdf")
pdf.pages[0].show()

Step 1: Extract the Raw Table

import natural_pdf as npdf
import pandas as pd

pdf = npdf.PDF("pdfs/cookbook/incident_log.pdf")
page = pdf.pages[0]

# Extract the table
table = page.extract_table()
df = table.to_df()

print(df)
pdf.close()

Raw output (before cleaning):

    ID    Date       Officer(s)      Location                             Description
0  001   01/15        Smith, J.    100 Main St   Traffic stop - vehicle observed running red...
1  002   01/15  Smith, J.\nJohnson, M.  250 Oak Ave   Disturbance call at residential address....
2        (continued)                              Additional notes: Neighbor reported ongoing...
3  003   01/16        Davis, R.  500 Park Blvd   Welfare check requested by family member....
4  004   01/16  Davis, R.\nSmith, J.\nWilson, K.  75 Commerce Dr   Shoplifting report at retail store....
5        (continued)                              Suspect identified as J. Doe (DOB: 03/15/1990)...
6  005   01/17      Johnson, M.    300 River Rd   Vehicle accident - two cars, minor damage....
7  006   01/17                  425 Industrial Way   Anonymous tip received. Area checked,...

Step 2: Identify Continuation Rows

Continuation rows usually have empty ID fields or contain markers like "(continued)":

def is_continuation(row):
    """Check if this row is a continuation of the previous."""
    # Empty ID indicates continuation
    if pd.isna(row['ID']) or row['ID'] == '':
        return True
    # Explicit continuation marker
    if '(continued)' in str(row.get('Date', '')).lower():
        return True
    return False

Step 3: Merge Continuation Rows

def merge_continuations(df):
    """Merge continuation rows into their parent rows."""
    merged_rows = []
    current_row = None

    for _, row in df.iterrows():
        if is_continuation(row):
            if current_row is not None:
                # Append description to current row
                current_desc = current_row.get('Description', '')
                new_desc = row.get('Description', '')
                if new_desc:
                    current_row['Description'] = f"{current_desc} {new_desc}".strip()
        else:
            # Save previous row and start new one
            if current_row is not None:
                merged_rows.append(current_row)
            current_row = row.to_dict()

    # Don't forget the last row
    if current_row is not None:
        merged_rows.append(current_row)

    return pd.DataFrame(merged_rows)

df_merged = merge_continuations(df)
print(df_merged)

Step 4: Handle Multi-Value Cells

Some cells contain multiple values (like multiple officers). Split them into lists:

def parse_officers(officer_str):
    """Parse officer string into list of names."""
    if pd.isna(officer_str) or officer_str == '':
        return []

    # Split on newlines or commas
    officers = []
    for part in str(officer_str).replace('\n', ',').split(','):
        name = part.strip()
        if name:
            officers.append(name)
    return officers

df_merged['officers_list'] = df_merged['Officer(s)'].apply(parse_officers)
df_merged['officer_count'] = df_merged['officers_list'].apply(len)

Step 5: Clean Empty/Placeholder Values

def clean_value(val, empty_markers=None):
    """Clean a value, treating certain markers as empty."""
    if empty_markers is None:
        empty_markers = ['', 'N/A', 'n/a', '-', '--', 'None', 'null']

    if pd.isna(val):
        return None
    val_str = str(val).strip()
    if val_str in empty_markers:
        return None
    return val_str

# Apply to all columns
for col in df_merged.columns:
    if col not in ['officers_list', 'officer_count']:
        df_merged[col] = df_merged[col].apply(clean_value)

Step 6: Validate and Flag Issues

def validate_row(row):
    """Check for data quality issues."""
    issues = []

    if not row.get('ID'):
        issues.append('missing_id')
    if not row.get('Date'):
        issues.append('missing_date')
    if row.get('officer_count', 0) == 0:
        issues.append('no_officer')

    return issues if issues else None

df_merged['data_issues'] = df_merged.apply(validate_row, axis=1)

# Show rows with issues
issues_df = df_merged[df_merged['data_issues'].notna()]
print(f"Rows with issues: {len(issues_df)}")

Complete Example

import natural_pdf as npdf
import pandas as pd

def is_continuation(row):
    """Check if this row is a continuation of the previous."""
    if pd.isna(row['ID']) or row['ID'] == '':
        return True
    if '(continued)' in str(row.get('Date', '')).lower():
        return True
    return False

def merge_continuations(df, text_column='Description'):
    """Merge continuation rows into their parent rows."""
    merged_rows = []
    current_row = None

    for _, row in df.iterrows():
        if is_continuation(row):
            if current_row is not None:
                current_desc = current_row.get(text_column, '')
                new_desc = row.get(text_column, '')
                if new_desc:
                    current_row[text_column] = f"{current_desc} {new_desc}".strip()
        else:
            if current_row is not None:
                merged_rows.append(current_row)
            current_row = row.to_dict()

    if current_row is not None:
        merged_rows.append(current_row)

    return pd.DataFrame(merged_rows)

def parse_multi_value(value, separators=None):
    """Split a cell with multiple values into a list."""
    if separators is None:
        separators = ['\n', ';', ' and ']

    if pd.isna(value) or value == '':
        return []

    result = str(value)
    for sep in separators:
        result = result.replace(sep, ',')

    items = [item.strip() for item in result.split(',')]
    return [item for item in items if item]

def clean_incident_log(pdf_path):
    """Extract and clean an incident log table."""
    pdf = npdf.PDF(pdf_path)
    page = pdf.pages[0]

    # Extract raw table
    table = page.extract_table()
    df = table.to_df()

    # Merge continuation rows
    df = merge_continuations(df, text_column='Description')

    # Parse multi-value officer column
    df['officers'] = df['Officer(s)'].apply(parse_multi_value)
    df['officer_count'] = df['officers'].apply(len)

    # Clean up
    df = df.drop(columns=['Officer(s)'])
    df = df.rename(columns={'officers': 'Officer(s)'})

    # Reset index
    df = df.reset_index(drop=True)

    pdf.close()
    return df

# Usage
df = clean_incident_log("pdfs/cookbook/incident_log.pdf")
print(df[['ID', 'Date', 'Officer(s)', 'officer_count']])

Common Messy Table Patterns

Pattern: Detecting Headers Mid-Table

Some tables repeat headers on each page:

def is_header_row(row, expected_headers):
    """Check if row contains header values."""
    row_values = [str(v).strip().lower() for v in row.values if pd.notna(v)]
    header_values = [h.lower() for h in expected_headers]
    return len(set(row_values) & set(header_values)) > len(expected_headers) / 2

# Filter out repeated headers
headers = ['ID', 'Date', 'Officer(s)', 'Location', 'Description']
df = df[~df.apply(lambda r: is_header_row(r, headers), axis=1)]

Pattern: One-to-Many Records

When one incident has multiple related sub-records:

# Explode the officers list into separate rows
df_exploded = df.explode('Officer(s)')
# Now each officer-incident combination is a row

Pattern: Handling Footnotes and Annotations

def remove_footnotes(text):
    """Remove footnote markers like [1], *, etc."""
    import re
    if pd.isna(text):
        return text
    # Remove [1], [2], etc.
    text = re.sub(r'\[\d+\]', '', text)
    # Remove standalone asterisks
    text = re.sub(r'\s*\*+\s*', ' ', text)
    return text.strip()

df['Description'] = df['Description'].apply(remove_footnotes)

Troubleshooting

"Table extraction misses columns"

Try extracting with explicit settings:

# Use layout analysis first
page.analyze_layout()
tables = page.find_all('region[type=table]')
for t in tables:
    df = t.extract_table().to_df()

"Cells are merged incorrectly"

Check if the PDF uses visual alignment vs actual table structure:

# View the raw elements
page.find_all('text').show()

# If it's whitespace-aligned, not a real table,
# you may need to extract by position

"Text wrapping creates garbage"

Normalize whitespace after extraction:

def normalize_whitespace(text):
    """Clean up wrapped text."""
    if pd.isna(text):
        return text
    # Replace multiple whitespace with single space
    import re
    return re.sub(r'\s+', ' ', text).strip()

Next Steps