Run in Colab Download notebook
In [ ]:
# Install required packages
!pip install --upgrade --quiet python-dotenv
!pip install --upgrade --quiet pydantic-ai
!pip install --upgrade --quiet perplexityai

print('✓ Packages installed!')

General setup

API keys and environment configuration

In [ ]:
import nest_asyncio
from dotenv import load_dotenv
import os

# Load keys from .env (if present) and patch the event loop so async
# libraries can run inside the notebook's already-running loop.
load_dotenv()
nest_asyncio.apply()

# Fill in any keys NOT already provided via .env.
# setdefault avoids clobbering values loaded above with empty strings
# (the original assigned "" unconditionally, wiping out .env values).
# Get your Braintrust API key from https://www.braintrust.dev
os.environ.setdefault("BRAINTRUST_API_URL", "https://api.braintrust.dev")
os.environ.setdefault("BRAINTRUST_API_KEY", "")
os.environ.setdefault("TAVILY_API_KEY", "")
os.environ.setdefault("ANTHROPIC_API_KEY", "")
os.environ.setdefault("OPENAI_API_KEY", "")
# The original set PERPLEXITY_API_KEY twice in a row; once is enough.
os.environ.setdefault("PERPLEXITY_API_KEY", "")

tavily_key = os.getenv('TAVILY_API_KEY')
In [ ]:
# Enable Braintrust's OpenTelemetry compatibility mode. Set before the tracing
# setup below so the span processor sees it — TODO confirm against Braintrust docs.
os.environ["BRAINTRUST_OTEL_COMPAT"] = "true"

Instrument Pydantic AI with Braintrust

In [ ]:
%pip install --upgrade --quiet pydantic-ai
In [ ]:
import braintrust
from braintrust.otel import BraintrustSpanProcessor
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from pydantic_ai import Agent

# Route OpenTelemetry spans to Braintrust, then enable Pydantic AI's
# built-in instrumentation for every agent in this process.
tracer_provider = TracerProvider()
tracer_provider.add_span_processor(BraintrustSpanProcessor())
trace.set_tracer_provider(tracer_provider)

braintrust.init_logger(project="default-otel-project")
Agent.instrument_all()

Set up Pydantic agent

In [ ]:
from pydantic import BaseModel
from pydantic_ai import WebSearchTool, WebSearchUserLocation
from pydantic_ai.common_tools.tavily import tavily_search_tool
from perplexity import Perplexity

# Triage agent backed by Claude Haiku, with Tavily web search exposed as a tool.
# Model list: https://ai.pydantic.dev/api/models/base/#pydantic_ai.models.KnownModelName
agent = Agent(
    'anthropic:claude-haiku-4-5',
    retries=3,
    tools=[tavily_search_tool(tavily_key)],
    # Optionally enable the provider-native web search instead:
    # builtin_tools=[WebSearchTool(location=WebSearchUserLocation.US)]
)

# Perplexity client used by the research tool registered below.
client = Perplexity()

@agent.tool_plain
async def perplexity_research(query: str):
    """Do research on a topic using Perplexity.

    Args:
        query: what to search for
    """
    # NOTE: the docstring above is sent to the model as the tool
    # description, so it is kept verbatim.
    response = client.search.create(query=query, max_results=10)
    return response.results

# Define the instructions for the agent.
# This is a plain string with no interpolation, so the original f-prefix
# was unnecessary (and risky: a literal "{" would have raised at runtime).
# NOTE(review): the OVERRIDE at the end deliberately disables tool use —
# presumably to keep demo runs fast/cheap; remove it to enable real research.
instructions = """
You are an investigative journalism assistant that triages news tips.

Your task is to:
1. Evaluate the tip for credibility and newsworthiness (score 1-10)
2. Determine if it is valid (score >= 5) or spam/invalid (score < 5)
3. Categorize it into the most appropriate beat from this list: local news, crime, politics, business, health, technology, environment, education, culture, sports, international news
4. Write a concise 2-3 sentence summary
5. Use Perplexity and/or Tavily search to conduct background research on the topic, entities, and location mentioned
6. Research the tipster's name and email to assess credibility (look for public profiles, previous journalism involvement, potential biases)
7. Generate 3-5 specific follow-up questions for further investigation
8. Compose a professional email response:
   - If valid: Thank them, indicate you're reviewing it, and ask the follow-up questions
   - If invalid: Politely decline and thank them for their interest

Be thorough in your research but efficient. Focus on verifiable facts and credible sources. Understand that people don't really know how to write tips, and you should follow up on most of the ones that are not obviously crazy people.

IMPORTANT OVERRIDE: Don't use any tools. **DO NOT** do any research. Just provide your response immediately. Be terse in the response.
"""

# Define the output from the agent — structured schema enforced via output_type.
class ProcessedTip(BaseModel):
    is_valid: bool                  # True when validity score >= 5 (per the instructions)
    credibility: int                # 1-10
    newsworthiness: int             # 1-10
    validity: int                   # 1-10; drives is_valid
    beat: str                       # one of the beats listed in the instructions
    summary: str                    # concise 2-3 sentence summary of the tip
    research: str                   # background research notes
    follow_up_questions: list[str]  # 3-5 questions for further investigation
    email_response: str             # drafted reply to the tipster

@braintrust.traced
def triage_tip(name, email, tip):
    """Run the triage agent on one submitted tip; return the ProcessedTip as a dict."""
    submission = f"""
    Submitter: {name} - {email}
    Tip details: {tip}
    """
    run = agent.run_sync(
        submission,
        instructions=instructions,
        output_type=ProcessedTip,
    )
    return run.output.model_dump()

# Smoke-test the pipeline with a single example submission.
name = "Jonathan Soma"
email = "jonathan.soma@gmail.com"
tip = """
The mayor of Fayetteville, NC has been embezzling. I'm the treasurer,
I know it. I have documents, I can prove it.
"""

result = triage_tip(name, email, tip)
print(result)

Datasets

Datasets are created on the Braintrust console (they can also be created programmatically, but the console is the typical workflow).

In [ ]:
import pprint

# Iterating a Braintrust dataset yields one record per row.
dataset = braintrust.init_dataset(
    project="default-otel-project",
    name="tiny-tips",
)

for record in dataset:
    pprint.pprint(record)

Building a code-based scorer

Scorers can also be hosted on Braintrust itself, but if you are already working in Python it is simpler to keep them in the notebook.

In [ ]:
def credible_match(output, expected):
    """Binary scorer: 1.0 when the agent's validity flag matches the label.

    Returns 0.0 when either dict is missing the relevant key.
    """
    try:
        return float(output["is_valid"] == expected["worth_researching"])
    except KeyError:
        return 0.0
In [ ]:
from braintrust import Eval, EvalAsync

eval_result = await EvalAsync(
    "default-otel-project",
    data=dataset,
    task=lambda input_row: triage_tip(
        input_row["name"],
        input_row["email"],
        input_row["tip"],
    ),
    scores=[credible_match], # ADD THE SCORERS HERE
    experiment_name="tip-eval-with-judge", # NAME IS IMPORTANT
    max_concurrency=5
)

print(f"Summary: {eval_result.summary}")

LLM-based scorer

In [ ]:
# Small, cheap model is fine for judging; retries guard against transient failures.
judge_agent = Agent("openai:gpt-5-mini", retries=2)

def email_politeness(input_row, output, expected):
    """
    LLM judge: is the email response well-written and polite?

    Returns float in [0,1]. If you prefer a boolean, set a threshold
    (e.g., >= 0.6 => 1.0 else 0.0).
    """
    tip_text = input_row.get("tip", "")
    email_response = output.get("email_response", "")
    prompt = f"""
    You are grading an email response to a news tip.

    Tip:
    {tip_text}

    Email Response:
    {email_response}

    Score how well the email response is well-written and polite on a 0-1 scale.
    Elements to consider:
    - Tone: Is it professional and respectful?
    - Clarity: Is it clear and concise?
    - Relevance: Does it address all the necessary points?
    - Follow-up: Does it include all the necessary follow-up questions?
    
    Return only a number between 0 and 1. Be as objective and critical as possible.
    """
    result = judge_agent.run_sync(prompt, output_type=float)
    # Models occasionally return values slightly outside the requested range;
    # clamp so downstream score aggregation always stays in [0, 1].
    return max(0.0, min(1.0, result.output))
In [ ]:
from braintrust import Eval, EvalAsync

eval_result = await EvalAsync(
    "default-otel-project",
    data=dataset,
    task=lambda input_row: triage_tip(
        input_row["name"],
        input_row["email"],
        input_row["tip"],
    ),
    scores=[credible_match, email_politeness], # ADD THE SCORERS HERE
    experiment_name="tip-eval-with-judge", # NAME IS IMPORTANT
    max_concurrency=5
)

print(f"Summary: {eval_result.summary}")