# Install required packages (Jupyter shell magics; safe to re-run).
# python-dotenv: load API keys from .env; pydantic-ai: agent framework;
# perplexityai: Perplexity search client used as an agent tool below.
!pip install --upgrade --quiet python-dotenv
!pip install --upgrade --quiet pydantic-ai
!pip install --upgrade --quiet perplexityai
print('✓ Packages installed!')
Slides: from-vibes-to-benchmarks.pdf
API keys and environment setup
import nest_asyncio
from dotenv import load_dotenv
import os

# Pull API keys from .env, then allow nested asyncio event loops so
# agent.run_sync() can be called inside the notebook's running loop.
load_dotenv()
nest_asyncio.apply()

# These should all be in .env
# Get your Braintrust API key from https://www.braintrust.dev
# BUG FIX: the original assigned empty strings unconditionally, which
# clobbered any values just loaded from .env by load_dotenv(). setdefault
# only installs the placeholder when the variable is not already set.
# (The original also set PERPLEXITY_API_KEY twice; the duplicate is removed.)
os.environ.setdefault("BRAINTRUST_API_URL", "https://api.braintrust.dev")
os.environ.setdefault("BRAINTRUST_API_KEY", "")
os.environ.setdefault("TAVILY_API_KEY", "")
os.environ.setdefault("ANTHROPIC_API_KEY", "")
os.environ.setdefault("OPENAI_API_KEY", "")
os.environ.setdefault("PERPLEXITY_API_KEY", "")

tavily_key = os.getenv("TAVILY_API_KEY")

# Added: enable Braintrust's OTel compatibility mode — presumably required
# by the BraintrustSpanProcessor set up below; confirm against Braintrust docs.
os.environ["BRAINTRUST_OTEL_COMPAT"] = "true"
# Notebook magic: (re)install pydantic-ai so instrumentation support is present.
%pip install --upgrade --quiet pydantic-ai
import braintrust
from braintrust.otel import BraintrustSpanProcessor
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from pydantic_ai import Agent

# Wire OpenTelemetry into Braintrust: spans emitted by instrumented agents
# are exported through BraintrustSpanProcessor into the project below.
provider = TracerProvider()
trace.set_tracer_provider(provider)
provider.add_span_processor(BraintrustSpanProcessor())
# Braintrust project that receives the logs/traces.
braintrust.init_logger(project="default-otel-project")
# Instrument every pydantic-ai Agent so each run shows up as a trace.
Agent.instrument_all()
from pydantic import BaseModel
from pydantic_ai import WebSearchTool, WebSearchUserLocation
from pydantic_ai.common_tools.tavily import tavily_search_tool
from perplexity import Perplexity
# Model list at https://ai.pydantic.dev/api/models/base/#pydantic_ai.models.KnownModelName
# Triage agent: Claude Haiku with Tavily web search exposed as a tool;
# retries=3 lets pydantic-ai re-ask the model on validation failures.
agent = Agent(
    'anthropic:claude-haiku-4-5',
    tools=[tavily_search_tool(tavily_key)],
    retries=3,
    # Should we add this?
    # builtin_tools=[WebSearchTool(location=WebSearchUserLocation.US)]
)
# Set up Perplexity tool
# NOTE(review): Perplexity() presumably reads PERPLEXITY_API_KEY from the
# environment set above — confirm against the perplexityai client docs.
client = Perplexity()
@agent.tool_plain
async def perplexity_research(query: str):
    """Do research on a topic using Perplexity.
    Args:
    query: what to search for
    """
    # Single round-trip to the Perplexity Search API, capped at 10 hits;
    # the docstring above doubles as the tool description shown to the model.
    response = client.search.create(query=query, max_results=10)
    return response.results
# Define the instructions (system prompt) for the triage agent.
# FIX: the original used an f-string with no placeholders (ruff F541);
# a plain string is equivalent and avoids accidental interpolation if
# braces are ever added to the text.
# NOTE(review): the final "IMPORTANT OVERRIDE" line disables all tool use
# and research — presumably a deliberate demo of instruction overrides;
# delete that line to restore the full research workflow.
instructions = """
You are an investigative journalism assistant that triages news tips.
Your task is to:
1. Evaluate the tip for credibility and newsworthiness (score 1-10)
2. Determine if it is valid (score >= 5) or spam/invalid (score < 5)
3. Categorize it into the most appropriate beat from this list: local news, crime, politics, business, health, technology, environment, education, culture, sports, international news
4. Write a concise 2-3 sentence summary
5. Use Perplexity and/or Tavily search to conduct background research on the topic, entities, and location mentioned
6. Research the tipster's name and email to assess credibility (look for public profiles, previous journalism involvement, potential biases)
7. Generate 3-5 specific follow-up questions for further investigation
8. Compose a professional email response:
- If valid: Thank them, indicate you're reviewing it, and ask the follow-up questions
- If invalid: Politely decline and thank them for their interest
Be thorough in your research but efficient. Focus on verifiable facts and credible sources. Understand that people don't really know how to write tips, and you should follow up on most of the ones that are not obviously crazy people.
IMPORTANT OVERRIDE: Don't use any tools. **DO NOT** do any research. Just provide your response immediately. Be terse in the response.
"""
# Define the output from the agent: the structured result every triage must
# produce (pydantic-ai validates the model's response against this schema).
class ProcessedTip(BaseModel):
    # True when the tip is worth pursuing (validity score >= 5 per the instructions)
    is_valid: bool
    # 1-10 scores, as defined in the instructions
    credibility: int
    newsworthiness: int
    validity: int
    # One beat from the closed list in the instructions (e.g. "politics")
    beat: str
    # Concise 2-3 sentence summary of the tip
    summary: str
    # Background research notes (may be minimal when the override disables tools)
    research: str
    # 3-5 follow-up questions for the tipster
    follow_up_questions: list[str]
    # Drafted reply email: thanks + questions if valid, polite decline otherwise
    email_response: str
@braintrust.traced
def triage_tip(name, email, tip):
    """Run the triage agent on one submitted tip.

    Args:
        name: tipster's name.
        email: tipster's email address.
        tip: free-text tip body.

    Returns:
        dict: the agent's ProcessedTip output, dumped to plain Python types.
    """
    submission = f"""
Submitter: {name} - {email}
Tip details: {tip}
"""
    run = agent.run_sync(
        submission,
        instructions=instructions,
        output_type=ProcessedTip,
    )
    return run.output.model_dump()
# Smoke test: run the full triage pipeline on one hand-written example tip.
name = "Jonathan Soma"
email = "jonathan.soma@gmail.com"
tip = """
The mayor of Fayetteville, NC has been embezzling. I'm the treasurer,
I know it. I have documents, I can prove it.
"""
result = triage_tip(name, email, tip)
print(result)
Datasets are created in the Braintrust console (they can also be created programmatically, but the console is the usual workflow).
import pprint

# Pull the evaluation dataset from Braintrust and display its rows.
dataset = braintrust.init_dataset(
    project="default-otel-project",
    name="tiny-tips")
for row in dataset:
    pprint.pprint(row)
Scorers can also be defined in the Braintrust console, but if you're already working in Python it's simpler to keep the scoring code here alongside the rest of the notebook.
def credible_match(output, expected):
    """Binary match on credible flag.

    Compares the task's `is_valid` against the dataset's `worth_researching`.
    Returns 1.0 on agreement, 0.0 on disagreement or when either key is absent.
    """
    has_both = "is_valid" in output and "worth_researching" in expected
    if not has_both:
        return 0.0
    return float(output["is_valid"] == expected["worth_researching"])
from braintrust import Eval, EvalAsync
eval_result = await EvalAsync(
"default-otel-project",
data=dataset,
task=lambda input_row: triage_tip(
input_row["name"],
input_row["email"],
input_row["tip"],
),
scores=[credible_match], # ADD THE SCORERS HERE
experiment_name="tip-eval-with-judge", # NAME IS IMPORTANT
max_concurrency=5
)
print(f"Summary: {eval_result.summary}")
# LLM-as-judge: a small, cheap model grades the drafted email response.
judge_agent = Agent("openai:gpt-5-mini", retries=2)


def email_politeness(input_row, output, expected):
    """
    LLM judge: is the email response well-written and polite?

    Args:
        input_row: dataset row; only its "tip" field is read.
        output: task output dict; only "email_response" is read.
        expected: unused, kept for the Braintrust scorer signature.

    Returns float in [0,1]. If you prefer a boolean, set a threshold
    (e.g., >= 0.6 => 1.0 else 0.0).
    """
    tip_text = input_row.get("tip", "")
    email_response = output.get("email_response", "")
    prompt = f"""
You are grading an email response to a news tip.
Tip:
{tip_text}
Email Response:
{email_response}
Score how well the email response is well-written and polite on a 0-1 scale.
Elements to consider:
- Tone: Is it professional and respectful?
- Clarity: Is it clear and concise?
- Relevance: Does it address all the necessary points?
- Follow-up: Does it include all the necessary follow-up questions?
Return only a number between 0 and 1. Be as objective and critical as possible.
"""
    result = judge_agent.run_sync(prompt, output_type=float)
    # ROBUSTNESS FIX: the model can return a value outside [0, 1] despite the
    # prompt; clamp so downstream score aggregation stays well-defined.
    return max(0.0, min(1.0, result.output))
# (Re-import is redundant — Eval/EvalAsync were imported above — but harmless.)
from braintrust import Eval, EvalAsync

# Second eval pass: same task, now scored by both the binary matcher and the
# LLM politeness judge, logged under its own experiment name for comparison.
eval_result = await EvalAsync(
    "default-otel-project",
    data=dataset,
    task=lambda input_row: triage_tip(
        input_row["name"],
        input_row["email"],
        input_row["tip"],
    ),
    scores=[credible_match, email_politeness],  # ADD THE SCORERS HERE
    experiment_name="tip-eval-with-judge",  # NAME IS IMPORTANT
    max_concurrency=5
)
print(f"Summary: {eval_result.summary}")