Download notebook

In [ ]:

# Install required packages.
# The leading "!" is notebook shell-escape syntax, so this cell only works when
# run inside Jupyter/IPython, not as a plain Python script.
# Note: the pymupdf package is the one imported as `fitz` in later cells.
!pip install --quiet gradio ipywidgets pymupdf rapidocr opencv-python-headless onnxruntime

print('✓ Packages installed!')

Welcome to Gradio

Gradio is a wonderful Python library that helps you make tiny little tools with simple components. It's very popular on Hugging Face Spaces for AI demos.

In [ ]:

import gradio as gr

# We do the work in here
def greet(name):
    """Return the greeting text for ``name``.

    Whatever is passed in is formatted into the string, so the result is
    always of the form ``"Hello, <name>!"``.
    """
    return f"Hello, {name}!"

# Build the app layout. Components created inside the `with` blocks are
# attached to the enclosing Blocks/Row/Column container.
with gr.Blocks() as demo:
    gr.Markdown("## A simple Gradio app")

    # One row holding two columns: input controls first, output second.
    with gr.Row():
        # These gr. bits are all of our user interface
        with gr.Column():
            name = gr.Textbox(label="Name")
            btn = gr.Button("Submit", variant='primary')
        with gr.Column():
            output = gr.Textbox(label="Output")

    # When the button is clicked...
    # run greet...
    # with the value of name as input
    # and put the result in output.
    btn.click(greet, inputs=name, outputs=output)

In [ ]:

# Launch the Gradio app built in the previous cell.
demo.launch()

Why we would use Gradio in real life

Let's say we're talking about OCR and I've been bragging about how good RapidOCR is as an alternative to Tesseract.

Let's see how it works on this document.

In [ ]:

from pathlib import Path
from tempfile import TemporaryDirectory

import fitz
from rapidocr import RapidOCR

import fitz
import requests

print("Downloading PDF...")
url = "https://raw.githubusercontent.com/jsoma/workshop-newsroom-ai-infra/main/data/pdfs/265897-lawsuit-complaint.pdf"

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
response.raise_for_status()

print("Initializing OCR...")
ocr = RapidOCR()

# One markdown-style section of recognized text per processed page.
all_text = []

# Opening the document in a `with` block closes it once OCR is finished (or
# has failed). The page images are only needed until OCR has read them, so
# they are written to a temporary directory that is removed afterwards.
with fitz.open(stream=response.content, filetype="pdf") as pdf:
    with TemporaryDirectory() as image_dir:
        image_dir = Path(image_dir)

        for page_number, page in enumerate(pdf, start=1):
            # Only the first 3 pages are processed.
            if page_number > 3:
                print("Reached page limit, stopping.")
                break

            print(f"Processing page {page_number}...")
            image_path = image_dir / f"page-{page_number}.png"

            # Render the page at 3x scale in each direction, without an
            # alpha channel, and save it as a PNG for the OCR engine.
            pixmap = page.get_pixmap(matrix=fitz.Matrix(3, 3), alpha=False)
            pixmap.save(image_path)

            result = ocr(str(image_path))
            # `txts` may be empty/None, in which case the page yields no text.
            text = "\n".join(result.txts or [])

            all_text.append(f"## Page {page_number}\n\n{text}")

print("## FINAL TEXT\n\n")
print("\n\n".join(all_text))

Now imagine you say, "I have some awful PDFs and I'm really interested in trying this out." Why should you need to email them to me?

Building the Gradio demo

In [ ]:

from pathlib import Path
from tempfile import TemporaryDirectory

import fitz
import gradio as gr
from rapidocr import RapidOCR


# Resolution used when rendering PDF pages to images; render_pdf_pages divides
# it by 72 to get the zoom factor.
DPI = 200
# Created once at module level and reused by ocr_pdf for every page.
ocr_engine = RapidOCR()


def render_pdf_pages(pdf_path: str, image_dir: Path, max_pages: int = 3) -> list[Path]:
    """Render the leading pages of a PDF to PNG files.

    Args:
        pdf_path: Path of the PDF to open.
        image_dir: Directory that receives the ``page-N.png`` files.
        max_pages: Maximum number of pages to render, counted from the first
            page. Defaults to 3. Zero or a negative value renders nothing.

    Returns:
        Paths of the images that were written, in page order.
    """
    # Scale factor applied in both directions, derived from the module DPI.
    zoom = DPI / 72
    matrix = fitz.Matrix(zoom, zoom)
    image_paths = []

    with fitz.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf, start=1):
            # Stop as soon as the page limit has been reached.
            if page_number > max_pages:
                break
            pixmap = page.get_pixmap(matrix=matrix, alpha=False)
            image_path = image_dir / f"page-{page_number}.png"
            pixmap.save(image_path)
            image_paths.append(image_path)

    return image_paths


def ocr_pdf(pdf_file: str) -> str:
    """Run OCR on the leading pages of a PDF and return the recognized text.

    Args:
        pdf_file: Path of the PDF to process. May be empty or ``None`` when
            the button is clicked before any file has been uploaded.

    Returns:
        The text of each rendered page, with pages separated by a blank line.
        An empty string is returned when no file was provided.
    """
    # Nothing to process: return early instead of handing an empty path on.
    if not pdf_file:
        return ""

    with TemporaryDirectory() as image_dir:
        image_paths = render_pdf_pages(pdf_file, Path(image_dir))

        # OCR has to happen inside the `with` block: the rendered images are
        # deleted together with the temporary directory on exit.
        page_texts = [
            "\n".join(ocr_engine(str(image_path)).txts or []).strip()
            for image_path in image_paths
        ]

    return "\n\n".join(page_texts)


# Build the OCR app: a PDF upload, a button, and a text box for the result.
with gr.Blocks(title="PDF OCR demo") as demo:
    gr.Markdown("# PDF OCR demo")

    # type="filepath" makes the upload arrive as a path string, which is what
    # ocr_pdf's pdf_file parameter is annotated as.
    pdf = gr.File(label="PDF", file_types=[".pdf"], type="filepath")
    button = gr.Button("OCR PDF", variant="primary")
    output = gr.Textbox(label="OCR text", lines=24)

    # Clicking the button runs ocr_pdf on the uploaded file and puts the
    # returned text in the output box.
    button.click(ocr_pdf, pdf, output)

In [ ]:

# Launch the PDF OCR app built in the previous cell.
demo.launch()