Prompt Optimization with Evidently¶
Attribution & License
This notebook is adapted from: evidentlyai/community-examples, licensed under the Apache License, Version 2.0. © Original authors.
Modifications: by Simeon Harrison/EuroCC Austria, © 2025.
This notebook demonstrates how to use Evidently's PromptOptimizer API for optimizing prompts for LLM judges.
Code Review Quality Classifier¶
We'll walk through optimizing a prompt that classifies the quality of code reviews written for junior developers.
What you'll learn:¶
- How to set up a dataset for LLM evaluation
- How to define an LLM judge with a prompt template
- How to run the prompt optimization loop
- How to retrieve and inspect the best performing prompt
Bazzite-AI Setup Required
Run D0_00_Bazzite_AI_Setup.ipynb first to configure Ollama and verify GPU access.
# If you haven't installed the required packages yet:
# !pip install evidently openai pandas
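Before running the imports below, you can optionally confirm that the Ollama server is reachable. This is a minimal sketch, assuming the default host http://ollama:11434 (configured by the Bazzite-AI setup notebook) and Ollama's /api/tags endpoint, which lists locally available models:
# Optional: check that the Ollama server is reachable (assumes the default host below)
import os, json, urllib.request
host = os.getenv("OLLAMA_HOST", "http://ollama:11434")
try:
    with urllib.request.urlopen(f"{host}/api/tags", timeout=10) as resp:
        models = [m["name"] for m in json.load(resp).get("models", [])]
    print(f"Ollama reachable at {host}; available models: {models}")
except Exception as e:
    print(f"Could not reach Ollama at {host}: {e}")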
import pandas as pd
from evidently import Dataset, DataDefinition, LLMClassification
from evidently.llm.templates import BinaryClassificationPromptTemplate
from evidently.descriptors import LLMEval
from evidently.llm.optimization import PromptOptimizer
from evidently.descriptors import HuggingFace, HuggingFaceToxicity
# === Datasets Path ===
from pathlib import Path
DATASETS_DIR = Path("./datasets")
print(f"Datasets: {[f.name for f in DATASETS_DIR.glob('*.csv')]}")
import os
from typing import Dict, Any, List, Optional
from evidently.llm.utils.wrapper import OpenAIOptions, OpenAIWrapper, LLMMessage, LLMResult
# === Ollama Configuration via OpenAI-Compatible API ===
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://ollama:11434")
# === Model Configuration ===
HF_LLM_MODEL = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO-GGUF"
OLLAMA_LLM_MODEL = f"hf.co/{HF_LLM_MODEL}:Q4_K_M"
OLLAMA_OPTIONS = OpenAIOptions(
api_key="ollama",
api_url=f"{OLLAMA_HOST}/v1"
)
# === Patch OpenAIWrapper for smart JSON mode detection ===
# Evidently's OpenAI wrapper doesn't enable JSON mode by default.
# This patch detects when JSON output is expected and enables it.
_original_openai_complete = OpenAIWrapper.complete
async def _json_aware_complete(self, messages: List[LLMMessage], seed: Optional[int] = None) -> LLMResult[str]:
import openai
from openai.types.chat.chat_completion import ChatCompletion
message_text = " ".join(m.content for m in messages if m.content)
needs_json = "json" in message_text.lower() or '"category"' in message_text
needs_xml = "<new_prompt>" in message_text
formatted_messages = [{"role": msg.role, "content": msg.content} for msg in messages]
try:
kwargs = {"model": self.model, "messages": formatted_messages, "seed": seed}
if needs_json and not needs_xml:
kwargs["response_format"] = {"type": "json_object"}
response: ChatCompletion = await self.client.chat.completions.create(**kwargs)
except openai.RateLimitError as e:
from evidently.llm.utils.wrapper import LLMRateLimitError
raise LLMRateLimitError(e.message) from e
except openai.APIError as e:
from evidently.llm.utils.wrapper import LLMRequestError
raise LLMRequestError(f"Failed to call OpenAI complete API: {e.message}", original_error=e) from e
content = response.choices[0].message.content
assert content is not None
if response.usage is None:
return LLMResult(content, 0, 0)
return LLMResult(content, response.usage.prompt_tokens, response.usage.completion_tokens)
OpenAIWrapper.complete = _json_aware_complete
print(f"Ollama host: {OLLAMA_HOST}")
print(f"Model: {OLLAMA_LLM_MODEL}")
print(f"Using OpenAI-compatible API with smart JSON mode detection")
# Load your dataset
review_dataset = pd.read_csv(DATASETS_DIR / "code_review_dataset.csv")
review_dataset.head()
   Generated review                                     Expert label  Expert comment
0  This implementation appears to work, but the a...    bad           The tone is slighly condescending, no actionab...
1  Great job! Keep it up!                                bad           Not actionable
2  It would be advisable to think about modularit...    bad           there is a suggestion, but no real guidance
3  You’ve structured the class very well, and the...    good          Good tone, actionable
4  Great job! This is clean and well-organized. T...    bad           Pure praise
# Define how Evidently should interpret your dataset
dd = DataDefinition(
text_columns=["Generated review", "Expert comment"],
categorical_columns=["Expert label"],
llm=LLMClassification(input="Generated review", target="Expert label", reasoning="Expert comment")
)
# Convert your pandas DataFrame into an Evidently Dataset
dataset = Dataset.from_pandas(review_dataset, data_definition=dd)
# Define a prompt template and judge for classifying code review quality
criteria = '''A review is GOOD when it's actionable and constructive.
A review is BAD when it is non-actionable or overly critical.'''
feedback_quality = BinaryClassificationPromptTemplate(
pre_messages=[("system", "You are evaluating the quality of code reviews given to junior developers.")],
criteria=criteria,
target_category="bad",
non_target_category="good",
uncertainty="unknown",
include_reasoning=True,
)
judge = LLMEval(
alias="Code Review Judge",
provider="openai", # Use OpenAI provider with Ollama's OpenAI-compatible API
model=OLLAMA_LLM_MODEL,
column_name="Generated review",
template=feedback_quality
)
# Initialize the optimizer with Ollama via OpenAI-compatible API
optimizer = PromptOptimizer(
"code_review_example",
strategy="feedback",
provider="openai", # Use OpenAI provider with Ollama's OpenAI-compatible API
model=OLLAMA_LLM_MODEL
)
optimizer.set_input_dataset(dataset)
await optimizer.arun(judge, "accuracy", options=OLLAMA_OPTIONS)
# Show the best-performing prompt template found by the optimizer
print(optimizer.best_prompt())
A code review is GOOD when it's actionable and provides specific suggestions for improvement, addressing both positive aspects and potential issues while maintaining a constructive tone. A code review is BAD when it lacks actionable feedback or offers vague advice, only providing praise or criticism without suggestions for resolution.
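The optimized criteria can be plugged back into a fresh template if you want to reuse the improved judge outside the optimizer. The sketch below only reuses classes already imported above; it assumes optimizer.best_prompt() corresponds to the criteria text printed above (str(...) is applied in case a prompt object is returned):
# Reuse the optimized criteria in a new judge (sketch; assumes best_prompt()
# yields the improved criteria shown above)
optimized_criteria = str(optimizer.best_prompt())
optimized_template = BinaryClassificationPromptTemplate(
    pre_messages=[("system", "You are evaluating the quality of code reviews given to junior developers.")],
    criteria=optimized_criteria,
    target_category="bad",
    non_target_category="good",
    uncertainty="unknown",
    include_reasoning=True,
)
optimized_judge = LLMEval(
    alias="Code Review Judge (optimized)",
    provider="openai",
    model=OLLAMA_LLM_MODEL,
    column_name="Generated review",
    template=optimized_template,
)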
Example 2: Bookings Query Classifier¶
In this tutorial, we'll optimize a prompt for classifying different types of customer service queries (like Booking, Payment, or Technical issues) using an LLM classifier.
What you'll learn:¶
- How to load a dataset for LLM classification
- How to define a multiclass classification prompt
- How to run prompt optimization with Evidently
- How to retrieve the best performing prompt
import pandas as pd
from evidently import Dataset, DataDefinition, LLMClassification
from evidently.descriptors import LLMEval
from evidently.llm.templates import MulticlassClassificationPromptTemplate
from evidently.llm.optimization import PromptOptimizer
Load Your Dataset¶
data = pd.read_csv(DATASETS_DIR / "booking_queries_dataset.csv")
data.head()
   query                                               label
0  booked a trip for 4 ppl, want to add a 5th now      Booking
1  hello team, please confirm if my hotel reserva...   Booking
2  i can’t see the payment options, dropdown just...   Technical
3  I heard airlines sometimes overbook, what’s yo...   Policy
4  wanna reschedule my train ride to next week         Booking
Define Data Structure for Evidently¶
dd = DataDefinition(
text_columns=["query"],
categorical_columns=["label"],
llm=LLMClassification(input="query", target="label")
)
dataset = Dataset.from_pandas(data, data_definition=dd)
Define a Multiclass Prompt and LLM Judge¶
base_prompt = "Classify inquiries by categories"
t = MulticlassClassificationPromptTemplate(
pre_messages=[("system", "You are classifying user queries.")],
criteria=base_prompt,
category_criteria={
"Booking": "bookings",
"Technical": "technical questions",
"Policy": "questions about policies",
"Payment": "payment questions",
"Escalation": "escalation requests"
},
uncertainty="unknown",
include_reasoning=True,
)
judge = LLMEval(
alias="bookings",
provider="openai", # Use OpenAI provider with Ollama's OpenAI-compatible API
model=OLLAMA_LLM_MODEL,
column_name="query",
template=t
)
Run the Prompt Optimizer¶
optimizer = PromptOptimizer(
"bookings_example",
strategy="feedback",
provider="openai", # Use OpenAI provider with Ollama's OpenAI-compatible API
model=OLLAMA_LLM_MODEL
)
optimizer.set_input_dataset(dataset)
await optimizer.arun(judge, "accuracy", options=OLLAMA_OPTIONS)
View the Best Optimized Prompt¶
print(optimizer.best_prompt())
Example 3: Tweet Generation Example¶
This tutorial shows how to optimize prompts for generating engaging tweets using Evidently's PromptOptimizer API. We'll iteratively improve a tweet generation prompt to maximize how engaging LLM-generated tweets are, according to a classifier.
What you'll learn:¶
- How to define a tweet generation function with OpenAI
- How to set up an LLM judge to classify tweet engagement
- How to optimize a tweet generation prompt based on feedback
- How to inspect the best optimized prompt
# Install packages if needed
# !pip install evidently openai pandas
import pandas as pd
import ollama
from evidently.descriptors import LLMEval
from evidently.llm.templates import BinaryClassificationPromptTemplate
from evidently.llm.optimization import PromptOptimizer, PromptExecutionLog, Params
Define a Tweet Generation Function¶
import ollama
def basic_tweet_generation(topic, model=OLLAMA_LLM_MODEL, instructions=""):
response = ollama.chat(
model=model,
messages=[
{"role": "system", "content": instructions},
{"role": "user", "content": f"Write a short paragraph about {topic}"}
]
)
return response['message']['content']
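Before kicking off the full optimization loop, it can be useful to smoke-test the generation function with a single call (one request against the local Ollama model):
# Quick sanity check of the generation function defined above
sample_tweet = basic_tweet_generation(
    "testing in AI engineering is as important as in development",
    instructions="You are a tweet generator",
)
print(sample_tweet)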
Define a Tweet Quality Judge¶
tweet_quality = BinaryClassificationPromptTemplate(
pre_messages=[("system", "You are evaluating the quality of tweets")],
criteria="""
Text is ENGAGING if it meets at least one of the following:
- Strong hook (question, surprise, bold statement)
- Uses emotion, humor, or opinion
- Encourages interaction
- Shows personality or distinct tone
- Includes vivid language or emojis
- Sparks curiosity or insight
Text is NEUTRAL if it lacks these qualities.
""",
target_category="ENGAGING",
non_target_category="NEUTRAL",
uncertainty="non_target",
include_reasoning=True,
)
judge = LLMEval("basic_tweet_generation.result", template=tweet_quality,
provider="openai", model=OLLAMA_LLM_MODEL, alias="Tweet quality")
Define a Prompt Execution Function¶
def run_prompt(generation_prompt: str, context) -> PromptExecutionLog:
    """Generate tweets for a fixed set of topics using the candidate prompt."""
    my_topics = [
        "testing in AI engineering is as important as in development",
        "CI/CD is applicable in AI",
        "Collaboration of subject matter experts and AI engineers improves product",
        "Start LLM apps development from test cases generation",
        "evidently is a great tool for LLM testing"
    ]
    # Generate three tweets per topic (15 samples) for the judge to score
    tweets = [basic_tweet_generation(topic, model=OLLAMA_LLM_MODEL, instructions=generation_prompt) for topic in my_topics * 3]
    return tweets
Run the Prompt Optimizer¶
optimizer = PromptOptimizer(
"tweet_gen_example",
strategy="feedback",
provider="openai", # Use OpenAI provider with Ollama's OpenAI-compatible API
model=OLLAMA_LLM_MODEL
)
optimizer.set_param(Params.BasePrompt, "You are a tweet generator")
await optimizer.arun(run_prompt, scorer=judge, options=OLLAMA_OPTIONS)
View the Best Optimized Prompt¶
print(optimizer.best_prompt())
You are a master tweet generator specializing in engaging and fun content. Create a series of tweets that promote the importance and benefits of collaboration between subject matter experts and AI engineers, using examples from various industries and real-life scenarios. Use humor, emojis, and strong opinions to make them more appealing and captivating.
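To try the optimized instructions, you can feed them back into the generation function defined earlier (str(...) is applied in case best_prompt() returns a prompt object rather than plain text):
# Generate one tweet with the optimized instructions
best_instructions = str(optimizer.best_prompt())
print(basic_tweet_generation("evidently is a great tool for LLM testing", instructions=best_instructions))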
# === Unload Ollama Model & Shutdown Kernel ===
# Unloads the model from GPU memory before shutting down
try:
import ollama
print(f"Unloading Ollama model: {OLLAMA_LLM_MODEL}")
ollama.generate(model=OLLAMA_LLM_MODEL, prompt="", keep_alive=0)
print("Model unloaded from GPU memory")
except Exception as e:
print(f"Model unload skipped: {e}")
# Shut down the kernel to fully release resources
import IPython
app = IPython.Application.instance()
app.kernel.do_shutdown(restart=False)