Prompt Optimization with Evidently¶
Attribution & License
This notebook is adapted from: evidentlyai/community-examples, licensed under the Apache License, Version 2.0. © Original authors.
Modifications: by Simeon Harrison/EuroCC Austria, © 2025.
This notebook demonstrates how to use Evidently's PromptOptimizer API for optimizing prompts for LLM judges.
Code Review Quality Classifier¶
We'll walk through optimizing a prompt that classifies the quality of code reviews written for junior developers.
What you'll learn:¶
- How to set up a dataset for LLM evaluation
- How to define an LLM judge with a prompt template
- How to run the prompt optimization loop
- How to retrieve and inspect the best performing prompt
Bazzite-AI Setup Required
Run D0_00_Bazzite_AI_Setup.ipynb first to configure Ollama and verify GPU access.
# If you haven't installed the required packages yet:
# !pip install evidently openai pandas
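Before running the imports below, you can optionally confirm that the Ollama server is reachable. This is a minimal sketch, assuming the default host http://ollama:11434 (configured by the Bazzite-AI setup notebook) and Ollama's /api/tags endpoint, which lists locally available models:
# Optional: check that the Ollama server is reachable (assumes the default host below)
import os, json, urllib.request
host = os.getenv("OLLAMA_HOST", "http://ollama:11434")
try:
    with urllib.request.urlopen(f"{host}/api/tags", timeout=10) as resp:
        models = [m["name"] for m in json.load(resp).get("models", [])]
    print(f"Ollama reachable at {host}; available models: {models}")
except Exception as e:
    print(f"Could not reach Ollama at {host}: {e}")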
import pandas as pd
from evidently import Dataset, DataDefinition, LLMClassification
from evidently.llm.templates import BinaryClassificationPromptTemplate
from evidently.descriptors import LLMEval
from evidently.llm.optimization import PromptOptimizer
from evidently.descriptors import HuggingFace, HuggingFaceToxicity
# === Datasets Path ===
from pathlib import Path
DATASETS_DIR = Path("./datasets")
print(f"Datasets: {[f.name for f in DATASETS_DIR.glob('*.csv')]}")
import os
from typing import Dict, Any, List, Optional
from evidently.llm.utils.wrapper import OpenAIOptions, OpenAIWrapper, LLMMessage, LLMResult
# === Ollama Configuration via OpenAI-Compatible API ===
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://ollama:11434")
# === Model Configuration ===
HF_LLM_MODEL = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO-GGUF"
OLLAMA_LLM_MODEL = f"hf.co/{HF_LLM_MODEL}:Q4_K_M"
OLLAMA_OPTIONS = OpenAIOptions(
api_key="ollama",
api_url=f"{OLLAMA_HOST}/v1"
)
# === Patch OpenAIWrapper for smart JSON mode detection ===
# Evidently's OpenAI wrapper doesn't enable JSON mode by default.
# This patch detects when JSON output is expected and enables it.
_original_openai_complete = OpenAIWrapper.complete
async def _json_aware_complete(self, messages: List[LLMMessage], seed: Optional[int] = None) -> LLMResult[str]:
import openai
from openai.types.chat.chat_completion import ChatCompletion
message_text = " ".join(m.content for m in messages if m.content)
needs_json = "json" in message_text.lower() or '"category"' in message_text
needs_xml = "<new_prompt>" in message_text
formatted_messages = [{"role": msg.role, "content": msg.content} for msg in messages]
try:
kwargs = {"model": self.model, "messages": formatted_messages, "seed": seed}
if needs_json and not needs_xml:
kwargs["response_format"] = {"type": "json_object"}
response: ChatCompletion = await self.client.chat.completions.create(**kwargs)
except openai.RateLimitError as e:
from evidently.llm.utils.wrapper import LLMRateLimitError
raise LLMRateLimitError(e.message) from e
except openai.APIError as e:
from evidently.llm.utils.wrapper import LLMRequestError
raise LLMRequestError(f"Failed to call OpenAI complete API: {e.message}", original_error=e) from e
content = response.choices[0].message.content
assert content is not None
if response.usage is None:
return LLMResult(content, 0, 0)
return LLMResult(content, response.usage.prompt_tokens, response.usage.completion_tokens)
OpenAIWrapper.complete = _json_aware_complete
print(f"Ollama host: {OLLAMA_HOST}")
print(f"Model: {OLLAMA_LLM_MODEL}")
print(f"Using OpenAI-compatible API with smart JSON mode detection")
# Load your dataset
review_dataset = pd.read_csv(DATASETS_DIR / "code_review_dataset.csv")
review_dataset.head()
   Generated review                                     Expert label  Expert comment
0  This implementation appears to work, but the a...    bad           The tone is slighly condescending, no actionab...
1  Great job! Keep it up!                                bad           Not actionable
2  It would be advisable to think about modularit...    bad           there is a suggestion, but no real guidance
3  You’ve structured the class very well, and the...    good          Good tone, actionable
4  Great job! This is clean and well-organized. T...    bad           Pure praise
# Define how Evidently should interpret your dataset
dd = DataDefinition(
text_columns=["Generated review", "Expert comment"],
categorical_columns=["Expert label"],
llm=LLMClassification(input="Generated review", target="Expert label", reasoning="Expert comment")
)
# Convert your pandas DataFrame into an Evidently Dataset
dataset = Dataset.from_pandas(review_dataset, data_definition=dd)
# Define a prompt template and judge for classifying code review quality
criteria = '''A review is GOOD when it's actionable and constructive.
A review is BAD when it is non-actionable or overly critical.'''
feedback_quality = BinaryClassificationPromptTemplate(
pre_messages=[("system", "You are evaluating the quality of code reviews given to junior developers.")],
criteria=criteria,
target_category="bad",
non_target_category="good",
uncertainty="unknown",
include_reasoning=True,
)
judge = LLMEval(
alias="Code Review Judge",
provider="openai", # Use OpenAI provider with Ollama's OpenAI-compatible API
model=OLLAMA_LLM_MODEL,
column_name="Generated review",
template=feedback_quality
)
# Initialize the optimizer with Ollama via OpenAI-compatible API
optimizer = PromptOptimizer(
"code_review_example",
strategy="feedback",
provider="openai", # Use OpenAI provider with Ollama's OpenAI-compatible API
model=OLLAMA_LLM_MODEL
)
optimizer.set_input_dataset(dataset)
await optimizer.arun(judge, "accuracy", options=OLLAMA_OPTIONS)
# Show the best-performing prompt template found by the optimizer
print(optimizer.best_prompt())
A code review is GOOD when it's actionable and provides specific suggestions for improvement, addressing both positive aspects and potential issues while maintaining a constructive tone. A code review is BAD when it lacks actionable feedback or offers vague advice, only providing praise or criticism without suggestions for resolution.
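The optimized criteria can be plugged back into a fresh template if you want to reuse the improved judge outside the optimizer. The sketch below only reuses classes already imported above; it assumes optimizer.best_prompt() corresponds to the criteria text printed above (str(...) is applied in case a prompt object is returned):
# Reuse the optimized criteria in a new judge (sketch; assumes best_prompt()
# yields the improved criteria shown above)
optimized_criteria = str(optimizer.best_prompt())
optimized_template = BinaryClassificationPromptTemplate(
    pre_messages=[("system", "You are evaluating the quality of code reviews given to junior developers.")],
    criteria=optimized_criteria,
    target_category="bad",
    non_target_category="good",
    uncertainty="unknown",
    include_reasoning=True,
)
optimized_judge = LLMEval(
    alias="Code Review Judge (optimized)",
    provider="openai",
    model=OLLAMA_LLM_MODEL,
    column_name="Generated review",
    template=optimized_template,
)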
Example 2: Bookings Query Classifier¶
In this tutorial, we'll optimize a prompt for classifying different types of customer service queries (like Booking, Payment, or Technical issues) using an LLM classifier.
What you'll learn:¶
- How to load a dataset for LLM classification
- How to define a multiclass classification prompt
- How to run prompt optimization with Evidently
- How to retrieve the best performing prompt
import pandas as pd
from evidently import Dataset, DataDefinition, LLMClassification
from evidently.descriptors import LLMEval
from evidently.llm.templates import MulticlassClassificationPromptTemplate
from evidently.llm.optimization import PromptOptimizer
Load Your Dataset¶
data = pd.read_csv(DATASETS_DIR / "booking_queries_dataset.csv")
data.head()
   query                                               label
0  booked a trip for 4 ppl, want to add a 5th now      Booking
1  hello team, please confirm if my hotel reserva...   Booking
2  i can’t see the payment options, dropdown just...   Technical
3  I heard airlines sometimes overbook, what’s yo...   Policy
4  wanna reschedule my train ride to next week         Booking
Define Data Structure for Evidently¶
dd = DataDefinition(
text_columns=["query"],
categorical_columns=["label"],
llm=LLMClassification(input="query", target="label")
)
dataset = Dataset.from_pandas(data, data_definition=dd)
Define a Multiclass Prompt and LLM Judge¶
base_prompt = "Classify inquiries by categories"
t = MulticlassClassificationPromptTemplate(
pre_messages=[("system", "You are classifying user queries.")],
criteria=base_prompt,
category_criteria={
"Booking": "bookings",
"Technical": "technical questions",
"Policy": "questions about policies",
"Payment": "payment questions",
"Escalation": "escalation requests"
},
uncertainty="unknown",
include_reasoning=True,
)
judge = LLMEval(
alias="bookings",
provider="openai", # Use OpenAI provider with Ollama's OpenAI-compatible API
model=OLLAMA_LLM_MODEL,
column_name="query",
template=t
)
Run the Prompt Optimizer¶
optimizer = PromptOptimizer(
"bookings_example",
strategy="feedback",
provider="openai", # Use OpenAI provider with Ollama's OpenAI-compatible API
model=OLLAMA_LLM_MODEL
)
optimizer.set_input_dataset(dataset)
await optimizer.arun(judge, "accuracy", options=OLLAMA_OPTIONS)
View the Best Optimized Prompt¶
print(optimizer.best_prompt())
Example 3: Tweet Generation Example¶
This tutorial shows how to optimize prompts for generating engaging tweets using Evidently's PromptOptimizer API. We'll iteratively improve a tweet generation prompt to maximize how engaging LLM-generated tweets are, according to a classifier.
What you'll learn:¶
- How to define a tweet generation function with OpenAI
- How to set up an LLM judge to classify tweet engagement
- How to optimize a tweet generation prompt based on feedback
- How to inspect the best optimized prompt
# Install packages if needed
# !pip install evidently openai pandas
import pandas as pd
import ollama
from evidently.descriptors import LLMEval
from evidently.llm.templates import BinaryClassificationPromptTemplate
from evidently.llm.optimization import PromptOptimizer, PromptExecutionLog, Params
Define a Tweet Generation Function¶
import ollama
def basic_tweet_generation(topic, model=OLLAMA_LLM_MODEL, instructions=""):
response = ollama.chat(
model=model,
messages=[
{"role": "system", "content": instructions},
{"role": "user", "content": f"Write a short paragraph about {topic}"}
]
)
return response['message']['content']
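Before kicking off the full optimization loop, it can be useful to smoke-test the generation function with a single call (one request against the local Ollama model):
# Quick sanity check of the generation function defined above
sample_tweet = basic_tweet_generation(
    "testing in AI engineering is as important as in development",
    instructions="You are a tweet generator",
)
print(sample_tweet)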
Define a Tweet Quality Judge¶
tweet_quality = BinaryClassificationPromptTemplate(
pre_messages=[("system", "You are evaluating the quality of tweets")],
criteria="""
Text is ENGAGING if it meets at least one of the following:
- Strong hook (question, surprise, bold statement)
- Uses emotion, humor, or opinion
- Encourages interaction
- Shows personality or distinct tone
- Includes vivid language or emojis
- Sparks curiosity or insight
Text is NEUTRAL if it lacks these qualities.
""",
target_category="ENGAGING",
non_target_category="NEUTRAL",
uncertainty="non_target",
include_reasoning=True,
)
judge = LLMEval("basic_tweet_generation.result", template=tweet_quality,
provider="openai", model=OLLAMA_LLM_MODEL, alias="Tweet quality")
Define a Prompt Execution Function¶
def run_prompt(generation_prompt: str, context) -> PromptExecutionLog:
    """Generate tweets for a fixed set of topics using the candidate prompt."""
    my_topics = [
        "testing in AI engineering is as important as in development",
        "CI/CD is applicable in AI",
        "Collaboration of subject matter experts and AI engineers improves product",
        "Start LLM apps development from test cases generation",
        "evidently is a great tool for LLM testing"
    ]
    # Generate three tweets per topic (15 samples) for the judge to score
    tweets = [basic_tweet_generation(topic, model=OLLAMA_LLM_MODEL, instructions=generation_prompt) for topic in my_topics * 3]
    return tweets
Run the Prompt Optimizer¶
optimizer = PromptOptimizer(
"tweet_gen_example",
strategy="feedback",
provider="openai", # Use OpenAI provider with Ollama's OpenAI-compatible API
model=OLLAMA_LLM_MODEL
)
optimizer.set_param(Params.BasePrompt, "You are a tweet generator")
await optimizer.arun(run_prompt, scorer=judge, options=OLLAMA_OPTIONS)
View the Best Optimized Prompt¶
print(optimizer.best_prompt())
You are a master tweet generator specializing in engaging and fun content. Create a series of tweets that promote the importance and benefits of collaboration between subject matter experts and AI engineers, using examples from various industries and real-life scenarios. Use humor, emojis, and strong opinions to make them more appealing and captivating.
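To try the optimized instructions, you can feed them back into the generation function defined earlier (str(...) is applied in case best_prompt() returns a prompt object rather than plain text):
# Generate one tweet with the optimized instructions
best_instructions = str(optimizer.best_prompt())
print(basic_tweet_generation("evidently is a great tool for LLM testing", instructions=best_instructions))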
# === Unload Ollama Model & Shutdown Kernel ===
# Unloads the model from GPU memory before shutting down
try:
import ollama
print(f"Unloading Ollama model: {OLLAMA_LLM_MODEL}")
ollama.generate(model=OLLAMA_LLM_MODEL, prompt="", keep_alive=0)
print("Model unloaded from GPU memory")
except Exception as e:
print(f"Model unload skipped: {e}")
# Shut down the kernel to fully release resources
import IPython
app = IPython.Application.instance()
app.kernel.do_shutdown(restart=False)