Messy Tables
Extract and clean tables with multi-line cells, continuation rows, merged data, and missing values.
When to use this pattern:
- Police/incident logs with multi-line descriptions
- Government data dumps with inconsistent formatting
- Legacy database exports
- Any table where one logical row spans multiple visual rows
The Problem
You have a table where:
- Descriptions wrap across multiple lines
- Some rows are "continuations" of the previous row
- Multiple values appear in one cell (e.g., multiple officers)
- Some cells are empty or marked with placeholders
Sample PDF
This tutorial uses pdfs/cookbook/incident_log.pdf - an incident log with all these issues.
import natural_pdf as npdf
pdf = npdf.PDF("pdfs/cookbook/incident_log.pdf")
pdf.pages[0].show()
Step 1: Extract the Raw Table
import natural_pdf as npdf
import pandas as pd
pdf = npdf.PDF("pdfs/cookbook/incident_log.pdf")
page = pdf.pages[0]
# Extract the table
table = page.extract_table()
df = table.to_df()
print(df)
pdf.close()
Raw output (before cleaning):
ID Date Officer(s) Location Description
0 001 01/15 Smith, J. 100 Main St Traffic stop - vehicle observed running red...
1 002 01/15 Smith, J.\nJohnson, M. 250 Oak Ave Disturbance call at residential address....
2 (continued) Additional notes: Neighbor reported ongoing...
3 003 01/16 Davis, R. 500 Park Blvd Welfare check requested by family member....
4 004 01/16 Davis, R.\nSmith, J.\nWilson, K. 75 Commerce Dr Shoplifting report at retail store....
5 (continued) Suspect identified as J. Doe (DOB: 03/15/1990)...
6 005 01/17 Johnson, M. 300 River Rd Vehicle accident - two cars, minor damage....
7 006 01/17 425 Industrial Way Anonymous tip received. Area checked,...
Step 2: Identify Continuation Rows
Continuation rows usually have empty ID fields or contain markers like "(continued)":
def is_continuation(row):
"""Check if this row is a continuation of the previous."""
# Empty ID indicates continuation
if pd.isna(row['ID']) or row['ID'] == '':
return True
# Explicit continuation marker
if '(continued)' in str(row.get('Date', '')).lower():
return True
return False
Step 3: Merge Continuation Rows
def merge_continuations(df):
"""Merge continuation rows into their parent rows."""
merged_rows = []
current_row = None
for _, row in df.iterrows():
if is_continuation(row):
if current_row is not None:
# Append description to current row
current_desc = current_row.get('Description', '')
new_desc = row.get('Description', '')
if new_desc:
current_row['Description'] = f"{current_desc} {new_desc}".strip()
else:
# Save previous row and start new one
if current_row is not None:
merged_rows.append(current_row)
current_row = row.to_dict()
# Don't forget the last row
if current_row is not None:
merged_rows.append(current_row)
return pd.DataFrame(merged_rows)
df_merged = merge_continuations(df)
print(df_merged)
Step 4: Handle Multi-Value Cells
Some cells contain multiple values (like multiple officers). Split them into lists:
def parse_officers(officer_str):
"""Parse officer string into list of names."""
if pd.isna(officer_str) or officer_str == '':
return []
# Split on newlines or commas
officers = []
for part in str(officer_str).replace('\n', ',').split(','):
name = part.strip()
if name:
officers.append(name)
return officers
df_merged['officers_list'] = df_merged['Officer(s)'].apply(parse_officers)
df_merged['officer_count'] = df_merged['officers_list'].apply(len)
Step 5: Clean Empty/Placeholder Values
def clean_value(val, empty_markers=None):
"""Clean a value, treating certain markers as empty."""
if empty_markers is None:
empty_markers = ['', 'N/A', 'n/a', '-', '--', 'None', 'null']
if pd.isna(val):
return None
val_str = str(val).strip()
if val_str in empty_markers:
return None
return val_str
# Apply to all columns
for col in df_merged.columns:
if col not in ['officers_list', 'officer_count']:
df_merged[col] = df_merged[col].apply(clean_value)
Step 6: Validate and Flag Issues
def validate_row(row):
"""Check for data quality issues."""
issues = []
if not row.get('ID'):
issues.append('missing_id')
if not row.get('Date'):
issues.append('missing_date')
if row.get('officer_count', 0) == 0:
issues.append('no_officer')
return issues if issues else None
df_merged['data_issues'] = df_merged.apply(validate_row, axis=1)
# Show rows with issues
issues_df = df_merged[df_merged['data_issues'].notna()]
print(f"Rows with issues: {len(issues_df)}")
Complete Example
import natural_pdf as npdf
import pandas as pd
def is_continuation(row):
"""Check if this row is a continuation of the previous."""
if pd.isna(row['ID']) or row['ID'] == '':
return True
if '(continued)' in str(row.get('Date', '')).lower():
return True
return False
def merge_continuations(df, text_column='Description'):
"""Merge continuation rows into their parent rows."""
merged_rows = []
current_row = None
for _, row in df.iterrows():
if is_continuation(row):
if current_row is not None:
current_desc = current_row.get(text_column, '')
new_desc = row.get(text_column, '')
if new_desc:
current_row[text_column] = f"{current_desc} {new_desc}".strip()
else:
if current_row is not None:
merged_rows.append(current_row)
current_row = row.to_dict()
if current_row is not None:
merged_rows.append(current_row)
return pd.DataFrame(merged_rows)
def parse_multi_value(value, separators=None):
"""Split a cell with multiple values into a list."""
if separators is None:
separators = ['\n', ';', ' and ']
if pd.isna(value) or value == '':
return []
result = str(value)
for sep in separators:
result = result.replace(sep, ',')
items = [item.strip() for item in result.split(',')]
return [item for item in items if item]
def clean_incident_log(pdf_path):
"""Extract and clean an incident log table."""
pdf = npdf.PDF(pdf_path)
page = pdf.pages[0]
# Extract raw table
table = page.extract_table()
df = table.to_df()
# Merge continuation rows
df = merge_continuations(df, text_column='Description')
# Parse multi-value officer column
df['officers'] = df['Officer(s)'].apply(parse_multi_value)
df['officer_count'] = df['officers'].apply(len)
# Clean up
df = df.drop(columns=['Officer(s)'])
df = df.rename(columns={'officers': 'Officer(s)'})
# Reset index
df = df.reset_index(drop=True)
pdf.close()
return df
# Usage
df = clean_incident_log("pdfs/cookbook/incident_log.pdf")
print(df[['ID', 'Date', 'Officer(s)', 'officer_count']])
Common Messy Table Patterns
Pattern: Detecting Headers Mid-Table
Some tables repeat headers on each page:
def is_header_row(row, expected_headers):
"""Check if row contains header values."""
row_values = [str(v).strip().lower() for v in row.values if pd.notna(v)]
header_values = [h.lower() for h in expected_headers]
return len(set(row_values) & set(header_values)) > len(expected_headers) / 2
# Filter out repeated headers
headers = ['ID', 'Date', 'Officer(s)', 'Location', 'Description']
df = df[~df.apply(lambda r: is_header_row(r, headers), axis=1)]
Pattern: One-to-Many Records
When one incident has multiple related sub-records:
# Explode the officers list into separate rows
df_exploded = df.explode('Officer(s)')
# Now each officer-incident combination is a row
Pattern: Handling Footnotes and Annotations
def remove_footnotes(text):
"""Remove footnote markers like [1], *, etc."""
import re
if pd.isna(text):
return text
# Remove [1], [2], etc.
text = re.sub(r'\[\d+\]', '', text)
# Remove standalone asterisks
text = re.sub(r'\s*\*+\s*', ' ', text)
return text.strip()
df['Description'] = df['Description'].apply(remove_footnotes)
Troubleshooting
"Table extraction misses columns"
Try extracting with explicit settings:
# Use layout analysis first
page.analyze_layout()
tables = page.find_all('region[type=table]')
for t in tables:
df = t.extract_table().to_df()
"Cells are merged incorrectly"
Check if the PDF uses visual alignment vs actual table structure:
# View the raw elements
page.find_all('text').show()
# If it's whitespace-aligned, not a real table,
# you may need to extract by position
"Text wrapping creates garbage"
Normalize whitespace after extraction:
def normalize_whitespace(text):
"""Clean up wrapped text."""
if pd.isna(text):
return text
# Replace multiple whitespace with single space
import re
return re.sub(r'\s+', ' ', text).strip()
Next Steps
- Simple Table Extraction - Basics of table extraction
- One Page = One Row - Handle forms with embedded tables
- Multipage Content - Tables spanning multiple pages