# Install required packages
# The leading "!" hands this line to the shell from inside the notebook;
# --quiet trims pip's output so only the confirmation below is shown.
!pip install --quiet gradio ipywidgets pymupdf rapidocr opencv-python-headless onnxruntime
print('✓ Packages installed!')
Gradio is a wonderful Python library that helps you make tiny little tools with simple components. It's very popular on Hugging Face Spaces for AI demos.
import gradio as gr
# We do the work in here
def greet(name):
    """Return the greeting text for ``name``, formatted as "Hello, <name>!"."""
    return f"Hello, {name}!"
with gr.Blocks() as demo:
    gr.Markdown("## A simple Gradio app")

    # Everything prefixed with gr. is a piece of the user interface
    with gr.Row():
        # First column: the input box and the button that submits it
        with gr.Column():
            name = gr.Textbox(label="Name")
            btn = gr.Button("Submit", variant="primary")
        # Second column: where the greeting ends up
        with gr.Column():
            output = gr.Textbox(label="Output")

    # Wiring: a click on the button calls greet with the current value of
    # name and writes whatever greet returns into output.
    btn.click(fn=greet, inputs=name, outputs=output)

demo.launch()
Let's say we're talking about OCR and I've been bragging about how good RapidOCR is compared to Tesseract.
Let's see how it works on this document.
from pathlib import Path
from tempfile import TemporaryDirectory
import fitz
from rapidocr import RapidOCR
import fitz
import requests
# Download the sample PDF. The explicit timeout keeps a stalled connection
# from hanging the cell: requests waits indefinitely when none is given.
print("Downloading PDF...")
url = "https://raw.githubusercontent.com/jsoma/workshop-newsroom-ai-infra/main/data/pdfs/265897-lawsuit-complaint.pdf"
response = requests.get(url, timeout=60)
response.raise_for_status()

print("Initializing OCR...")
ocr = RapidOCR()

all_text = []
# Open the PDF straight from the downloaded bytes. The with-block closes the
# document once the pages have been processed instead of leaving it open.
with fitz.open(stream=response.content, filetype="pdf") as pdf:
    # The rendered page images are only needed while OCR runs, so they live
    # in a temporary directory that is removed afterwards.
    with TemporaryDirectory() as image_dir:
        image_dir = Path(image_dir)
        for page_number, page in enumerate(pdf, start=1):
            # Only the first three pages are processed
            if page_number > 3:
                print("Reached page limit, stopping.")
                break
            print(f"Processing page {page_number}...")
            image_path = image_dir / f"page-{page_number}.png"
            # Render the page at 3x zoom, without an alpha channel, and save
            # it as a PNG for the OCR engine to read.
            pixmap = page.get_pixmap(matrix=fitz.Matrix(3, 3), alpha=False)
            pixmap.save(image_path)
            result = ocr(str(image_path))
            # Fall back to an empty list when result.txts is falsy
            text = "\n".join(result.txts or [])
            all_text.append(f"## Page {page_number}\n\n{text}")

print("## FINAL TEXT\n\n")
print("\n\n".join(all_text))
I have some awful PDFs and am really interested in trying this out. Why should you need to email them to me?
from pathlib import Path
from tempfile import TemporaryDirectory
import fitz
import gradio as gr
from rapidocr import RapidOCR
# Rendering resolution in dots per inch; render_pdf_pages divides it by 72
# to get the zoom factor for each page.
DPI = 200
# A single RapidOCR instance created once at module level and reused by
# ocr_pdf for every page image.
ocr_engine = RapidOCR()
def render_pdf_pages(pdf_path: str, image_dir: Path, max_pages: int = 3) -> list[Path]:
    """Render the leading pages of a PDF to PNG files.

    Args:
        pdf_path: Location of the PDF to open.
        image_dir: Directory the PNG files are written into.
        max_pages: Maximum number of pages to render, counted from the first
            page. Defaults to 3, the limit that used to be hard-coded here.

    Returns:
        The paths of the written images in page order, each named
        ``page-<n>.png`` with ``n`` starting at 1.
    """
    # Scale from PyMuPDF's 72 dpi baseline (zoom 1) up to the configured DPI.
    zoom = DPI / 72
    matrix = fitz.Matrix(zoom, zoom)

    image_paths = []
    with fitz.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf, start=1):
            if page_number > max_pages:
                break
            pixmap = page.get_pixmap(matrix=matrix, alpha=False)
            image_path = image_dir / f"page-{page_number}.png"
            pixmap.save(image_path)
            image_paths.append(image_path)
    return image_paths
def ocr_pdf(pdf_file: str) -> str:
    """Run OCR over the leading pages of a PDF and return the text.

    Args:
        pdf_file: Filesystem path of the uploaded PDF. May be ``None`` (or
            empty) when the callback fires without a file selected.

    Returns:
        The recognised text of each rendered page, stripped of surrounding
        whitespace, with pages separated by a blank line. An empty string
        when no file was supplied.
    """
    # Clicking the button with nothing uploaded delivers no path; answer with
    # empty text rather than passing a missing path on to the PDF renderer.
    if not pdf_file:
        return ""

    # The page images are only needed while OCR runs, so keep them in a
    # temporary directory that is cleaned up on exit.
    with TemporaryDirectory() as image_dir:
        image_paths = render_pdf_pages(pdf_file, Path(image_dir))
        page_texts = []
        for image_path in image_paths:
            result = ocr_engine(str(image_path))
            # Fall back to an empty sequence when result.txts is falsy
            page_texts.append("\n".join(result.txts or ()).strip())
    return "\n\n".join(page_texts)
with gr.Blocks(title="PDF OCR demo") as demo:
    gr.Markdown("# PDF OCR demo")

    # Upload widget restricted to .pdf files; the callback receives a file path
    pdf = gr.File(label="PDF", file_types=[".pdf"], type="filepath")
    button = gr.Button("OCR PDF", variant="primary")
    output = gr.Textbox(label="OCR text", lines=24)

    # Clicking the button sends the uploaded file to ocr_pdf and shows the
    # returned text in the output box.
    button.click(fn=ocr_pdf, inputs=pdf, outputs=output)

demo.launch()