Evaluation API Reference

PromptMetrics

@dataclass
class PromptMetrics:
    """Metrics for evaluating expanded prompt quality."""

    expansion_ratio: float = 0.0
    structure_score: float = 0.0
    specificity_score: float = 0.0
    completeness_score: float = 0.0
    readability_score: float = 0.0
    overall_score: float = 0.0
    details: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Convert metrics to dictionary."""

Attributes

expansion_ratio (float): Ratio of expanded to original length
structure_score (float): Score for structural elements (0-1)
specificity_score (float): Score for specific instructions (0-1)
completeness_score (float): Score for section coverage (0-1)
readability_score (float): Score for readability (0-1)
overall_score (float): Weighted overall score (0-1)
details (dict): Detailed breakdown of scores
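
Because PromptMetrics is a dataclass, its fields can be read directly or serialized with to_dict(). A minimal sketch (the field values below are illustrative only):

from prompt_amplifier.evaluation import PromptMetrics

metrics = PromptMetrics(
    expansion_ratio=4.2,
    structure_score=0.8,
    specificity_score=0.7,
    completeness_score=0.75,
    readability_score=0.9,
    overall_score=0.79,
)
print(metrics.overall_score)  # 0.79
print(metrics.to_dict())      # plain dict, e.g. for JSON logging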

RetrievalMetrics

@dataclass
class RetrievalMetrics:
    """Metrics for evaluating retrieval quality."""

    precision_at_k: float = 0.0
    recall_at_k: float = 0.0
    mrr: float = 0.0
    ndcg: float = 0.0
    average_score: float = 0.0
    relevance_scores: list[float] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Convert metrics to dictionary."""

Attributes

precision_at_k (float): Precision at k documents
recall_at_k (float): Recall at k documents
mrr (float): Mean Reciprocal Rank
ndcg (float): Normalized Discounted Cumulative Gain
average_score (float): Average similarity score
relevance_scores (list[float]): Individual relevance scores
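
For reference, precision@k and reciprocal rank are conventionally defined over the positions of relevant results among the top k retrieved. The sketch below illustrates those standard definitions; it is not necessarily the library's exact implementation:

def precision_at_k(relevant: set[int], k: int) -> float:
    # Fraction of the top-k retrieved positions (0..k-1) that are relevant.
    return sum(1 for i in range(k) if i in relevant) / k

def reciprocal_rank(relevant: set[int], k: int) -> float:
    # 1 / rank of the first relevant hit within the top k, else 0.
    for rank, i in enumerate(range(k), start=1):
        if i in relevant:
            return 1.0 / rank
    return 0.0

print(precision_at_k({0, 1, 4}, k=5))   # 0.6
print(reciprocal_rank({0, 1, 4}, k=5))  # 1.0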

Functions

calculate_expansion_quality

def calculate_expansion_quality(
    original_prompt: str,
    expanded_prompt: str,
    weights: Optional[dict] = None,
) -> PromptMetrics:
    """
    Calculate quality metrics for an expanded prompt.

    Args:
        original_prompt: The original short prompt.
        expanded_prompt: The expanded prompt.
        weights: Optional weights for overall score. Default:
            {"structure": 0.25, "specificity": 0.25, 
             "completeness": 0.25, "readability": 0.25}

    Returns:
        PromptMetrics with all calculated scores.
    """

Example

from prompt_amplifier.evaluation import calculate_expansion_quality

metrics = calculate_expansion_quality(
    original_prompt="Summarize data",
    expanded_prompt="**GOAL:** Generate summary...",
    weights={"structure": 0.4, "specificity": 0.3, 
             "completeness": 0.2, "readability": 0.1}
)
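
The returned PromptMetrics can then be inspected field by field or serialized for logging:

print(f"Overall:   {metrics.overall_score:.2f}")
print(f"Structure: {metrics.structure_score:.2f}")
print(metrics.details)  # per-metric breakdown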

calculate_retrieval_metrics

def calculate_retrieval_metrics(
    retrieved_scores: list[float],
    relevant_indices: Optional[list[int]] = None,
    k: int = 5,
) -> RetrievalMetrics:
    """
    Calculate retrieval quality metrics.

    Args:
        retrieved_scores: Similarity scores of retrieved documents.
        relevant_indices: Indices of known relevant documents.
        k: Number of retrieved documents to consider.

    Returns:
        RetrievalMetrics with calculated scores.
    """

Example

from prompt_amplifier.evaluation import calculate_retrieval_metrics

metrics = calculate_retrieval_metrics(
    retrieved_scores=[0.9, 0.8, 0.6, 0.4, 0.3],
    relevant_indices=[0, 1, 4],
    k=5
)
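
The individual metrics are available as attributes or via to_dict():

print(f"P@5:  {metrics.precision_at_k:.2f}")
print(f"MRR:  {metrics.mrr:.2f}")
print(f"NDCG: {metrics.ndcg:.2f}")
print(metrics.to_dict())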

calculate_diversity_score

def calculate_diversity_score(embeddings: list[list[float]]) -> float:
    """
    Calculate diversity score for embeddings.

    A higher score indicates more diverse (less mutually similar) embeddings.

    Args:
        embeddings: List of embedding vectors.

    Returns:
        Diversity score (0-1).
    """

Example

from prompt_amplifier.evaluation import calculate_diversity_score

score = calculate_diversity_score([
    [0.1, 0.2, 0.3],
    [0.4, 0.5, 0.6],
])
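
A common way to formulate such a score is the average pairwise cosine distance between vectors. The sketch below illustrates that idea; it is not necessarily the exact formula used internally:

import math

def avg_pairwise_cosine_distance(embeddings: list[list[float]]) -> float:
    # Diversity as the mean of (1 - cosine similarity) over all vector pairs.
    def cos(a: list[float], b: list[float]) -> float:
        dot = sum(x * y for x, y in zip(a, b))
        na = math.sqrt(sum(x * x for x in a))
        nb = math.sqrt(sum(x * x for x in b))
        return dot / (na * nb) if na and nb else 0.0

    n = len(embeddings)
    if n < 2:
        return 0.0
    pairs = [(i, j) for i in range(n) for j in range(i + 1, n)]
    return sum(1 - cos(embeddings[i], embeddings[j]) for i, j in pairs) / len(pairs)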

calculate_coherence_score

def calculate_coherence_score(
    prompt: str,
    context_chunks: list[str],
) -> float:
    """
    Calculate coherence between prompt and context.

    Args:
        prompt: The expanded prompt.
        context_chunks: Context strings used for expansion.

    Returns:
        Coherence score (0-1).
    """

Example

from prompt_amplifier.evaluation import calculate_coherence_score

score = calculate_coherence_score(
    prompt="Analyze Q4 sales in North America...",
    context_chunks=["Q4 sales reached $1.2M in North America"]
)
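
A simple way to act on the score is to gate expansions that drift too far from the retrieved context (the 0.5 threshold here is an arbitrary example):

if score < 0.5:
    print("Warning: expanded prompt is weakly grounded in the context")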

compare_embedders

def compare_embedders(
    texts: list[str],
    queries: list[str],
    embedders: list[Any],
    embedder_names: Optional[list[str]] = None,
) -> dict[str, dict]:
    """
    Compare multiple embedders on the same data.

    Args:
        texts: Documents to embed.
        queries: Queries to test retrieval.
        embedders: List of embedder instances.
        embedder_names: Optional names for embedders.

    Returns:
        Dictionary with comparison results:
        {
            "EmbedderName": {
                "embedding_time_ms": float,
                "query_time_ms": float,
                "dimension": int,
                "avg_query_scores": list[float],
                "error": str  # if failed
            }
        }
    """

Example

from prompt_amplifier.evaluation import compare_embedders
from prompt_amplifier.embedders import TFIDFEmbedder, SentenceTransformerEmbedder

results = compare_embedders(
    texts=["doc1", "doc2", "doc3"],
    queries=["query1", "query2"],
    embedders=[TFIDFEmbedder(), SentenceTransformerEmbedder()],
    embedder_names=["TF-IDF", "Sentence Transformers"]
)
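
Each entry in the returned dictionary follows the structure documented above, so the comparison can be printed as a small summary:

for name, stats in results.items():
    if "error" in stats:
        print(f"{name}: failed ({stats['error']})")
        continue
    print(f"{name}: dim={stats['dimension']}, "
          f"embed={stats['embedding_time_ms']:.1f} ms, "
          f"query={stats['query_time_ms']:.1f} ms")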

benchmark_generators

def benchmark_generators(
    prompt: str,
    context: str,
    generators: list[Any],
    generator_names: Optional[list[str]] = None,
    num_runs: int = 1,
) -> dict[str, dict]:
    """
    Benchmark multiple generators on the same prompt.

    Args:
        prompt: The prompt to expand.
        context: Context for expansion.
        generators: List of generator instances.
        generator_names: Optional names for generators.
        num_runs: Number of runs to average.

    Returns:
        Dictionary with benchmark results:
        {
            "GeneratorName": {
                "avg_time_ms": float,
                "avg_expansion_ratio": float,
                "avg_quality_score": float,
                "outputs": list[str],
                "error": str  # if failed
            }
        }
    """

Example

from prompt_amplifier.evaluation import benchmark_generators
from prompt_amplifier.generators import OpenAIGenerator, AnthropicGenerator

results = benchmark_generators(
    prompt="Summarize performance",
    context="Q4 revenue was $5.2M...",
    generators=[OpenAIGenerator(), AnthropicGenerator()],
    generator_names=["GPT-4", "Claude"],
    num_runs=3
)
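
The per-generator results can be ranked directly, for example by average quality score:

ranked = sorted(
    (item for item in results.items() if "error" not in item[1]),
    key=lambda item: item[1]["avg_quality_score"],
    reverse=True,
)
for name, stats in ranked:
    print(f"{name}: quality={stats['avg_quality_score']:.2f}, "
          f"time={stats['avg_time_ms']:.0f} ms")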

EvaluationSuite

class EvaluationSuite:
    """Comprehensive evaluation suite for Prompt Amplifier."""

    def __init__(self):
        self.test_cases: list[dict] = []

    def add_test_case(
        self,
        name: str,
        prompt: str,
        relevant_docs: Optional[list[str]] = None,
        expected_keywords: Optional[list[str]] = None,
    ) -> None:
        """Add a test case to the suite."""

    def run(self, forge: Any) -> list[dict]:
        """Run all test cases against a PromptForge instance."""

    def print_report(self, results: list[dict]) -> None:
        """Print a formatted report of results."""

Methods

add_test_case

def add_test_case(
    self,
    name: str,
    prompt: str,
    relevant_docs: Optional[list[str]] = None,
    expected_keywords: Optional[list[str]] = None,
) -> None:
    """
    Add a test case.

    Args:
        name: Name of the test case.
        prompt: The prompt to test.
        relevant_docs: Optional list of relevant document contents.
        expected_keywords: Optional keywords to check in output.
    """

run

def run(self, forge: Any) -> list[dict]:
    """
    Run all test cases.

    Args:
        forge: PromptForge instance to test.

    Returns:
        List of result dictionaries:
        [
            {
                "name": str,
                "prompt": str,
                "success": bool,
                "time_ms": float,
                "expanded_prompt": str,
                "expansion_ratio": float,
                "quality_metrics": dict,
                "keyword_coverage": float,  # if keywords provided
                "error": str  # if failed
            }
        ]
    """

print_report

def print_report(self, results: list[dict]) -> None:
    """
    Print formatted report.

    Args:
        results: Results from run().
    """

Complete Example

from prompt_amplifier import PromptForge
from prompt_amplifier.evaluation import EvaluationSuite

# Setup
forge = PromptForge()
forge.add_texts(["POC Health tracking...", "Winscore metrics..."])

# Create suite
suite = EvaluationSuite()
suite.add_test_case(
    name="Deal Status",
    prompt="How's the deal?",
    expected_keywords=["POC", "health"]
)
suite.add_test_case(
    name="Metrics",
    prompt="What metrics to track?",
    expected_keywords=["Winscore"]
)

# Run and report
results = suite.run(forge)
suite.print_report(results)
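
Because run() returns plain dictionaries, the results can also be post-processed directly, for example to compute a pass rate and per-case keyword coverage:

passed = [r for r in results if r["success"]]
print(f"{len(passed)}/{len(results)} test cases succeeded")
for r in results:
    if "keyword_coverage" in r:
        print(f"{r['name']}: keyword coverage {r['keyword_coverage']:.0%}")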