# Prompt Optimization Guide

## Systematic Refinement Process

### 1. Baseline Establishment

```python
def establish_baseline(prompt, test_cases):
    results = {
        'accuracy': 0,
        'avg_tokens': 0,
        'avg_latency': 0,
        'success_rate': 0
    }

    for test_case in test_cases:
        response = llm.complete(prompt.format(**test_case['input']))

        results['accuracy'] += evaluate_accuracy(response, test_case['expected'])
        results['avg_tokens'] += count_tokens(response)
        results['avg_latency'] += measure_latency(response)
        results['success_rate'] += is_valid_response(response)

    # Average across test cases
    n = len(test_cases)
    return {k: v / n for k, v in results.items()}
```
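
A minimal usage sketch (the test-case shape and the helpers above — `llm`, `evaluate_accuracy`, `count_tokens`, and so on — are assumed to be provided by the surrounding harness):

```python
# Hypothetical test cases; 'input' keys must match the prompt's format fields.
test_cases = [
    {'input': {'text': 'The cat sat on the mat.'}, 'expected': 'animals'},
    {'input': {'text': 'Stocks fell 2% today.'}, 'expected': 'finance'},
]

baseline = establish_baseline("Classify the topic of: {text}", test_cases)
print(baseline)  # e.g. {'accuracy': 0.5, 'avg_tokens': 12.0, ...}
```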

### 2. Iterative Refinement Workflow

```
Initial Prompt → Test → Analyze Failures → Refine → Test → Repeat
```

```python
class PromptOptimizer:
    def __init__(self, initial_prompt, test_suite):
        self.prompt = initial_prompt
        self.test_suite = test_suite
        self.history = []

    def optimize(self, max_iterations=10):
        for i in range(max_iterations):
            # Test current prompt
            results = self.evaluate_prompt(self.prompt)
            self.history.append({
                'iteration': i,
                'prompt': self.prompt,
                'results': results
            })

            # Stop if good enough
            if results['accuracy'] > 0.95:
                break

            # Analyze failures
            failures = self.analyze_failures(results)

            # Generate refinement suggestions
            refinements = self.generate_refinements(failures)

            # Apply best refinement
            self.prompt = self.select_best_refinement(refinements)

        return self.get_best_prompt()
```
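
A usage sketch; note that `evaluate_prompt`, `analyze_failures`, `generate_refinements`, `select_best_refinement`, and `get_best_prompt` are left abstract above and must be supplied, for example by subclassing:

```python
optimizer = PromptOptimizer(
    initial_prompt="Classify the sentiment of: {text}",
    test_suite=test_cases,  # same shape as in establish_baseline
)
best_prompt = optimizer.optimize(max_iterations=5)
```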

### 3. A/B Testing Framework

```python
import random

import numpy as np
from scipy import stats


class PromptABTest:
    def __init__(self, variant_a, variant_b):
        self.variant_a = variant_a
        self.variant_b = variant_b

    def run_test(self, test_queries, metrics=('accuracy', 'latency')):
        results = {
            'A': {m: [] for m in metrics},
            'B': {m: [] for m in metrics}
        }

        for query in test_queries:
            # Randomly assign variant (50/50 split)
            variant = 'A' if random.random() < 0.5 else 'B'
            prompt = self.variant_a if variant == 'A' else self.variant_b

            response, metrics_data = self.execute_with_metrics(
                prompt.format(query=query['input'])
            )

            for metric in metrics:
                results[variant][metric].append(metrics_data[metric])

        return self.analyze_results(results)

    def analyze_results(self, results):
        analysis = {}
        for metric in results['A']:
            a_values = results['A'][metric]
            b_values = results['B'][metric]

            # Statistical significance test (two-sample t-test)
            t_stat, p_value = stats.ttest_ind(a_values, b_values)

            analysis[metric] = {
                'A_mean': np.mean(a_values),
                'B_mean': np.mean(b_values),
                'improvement': (np.mean(b_values) - np.mean(a_values)) / np.mean(a_values),
                'statistically_significant': p_value < 0.05,
                'p_value': p_value,
                # Assumes higher is better; invert for latency-style metrics
                'winner': 'B' if np.mean(b_values) > np.mean(a_values) else 'A'
            }

        return analysis
```
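
By default `stats.ttest_ind` assumes equal variances; passing `equal_var=False` runs Welch's t-test, which is generally safer when the two variants produce metrics with different spread.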

## Optimization Strategies

### Token Reduction

```python
def optimize_for_tokens(prompt):
    optimizations = [
        # Remove redundant phrases
        ('in order to', 'to'),
        ('due to the fact that', 'because'),
        ('at this point in time', 'now'),

        # Consolidate instructions
        ('First, ...\nThen, ...\nFinally, ...', 'Steps: 1) ... 2) ... 3) ...'),

        # Use abbreviations (after first definition)
        ('Natural Language Processing (NLP)', 'NLP'),

        # Remove filler words
        (' actually ', ' '),
        (' basically ', ' '),
        (' really ', ' ')
    ]

    optimized = prompt
    for old, new in optimizations:
        optimized = optimized.replace(old, new)

    return optimized
```
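
For example:

```python
prompt = "Please rewrite this in order to make it clearer. It is basically a draft."
print(optimize_for_tokens(prompt))
# Please rewrite this to make it clearer. It is a draft.
```

Note that these are case-sensitive literal replacements; a production version would likely use regular expressions with word boundaries.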

### Latency Reduction

```python
def optimize_for_latency(prompt):
    # Each helper is assumed to return a modified prompt/configuration
    strategies = {
        'shorter_prompt': reduce_token_count(prompt),
        'streaming': enable_streaming_response(prompt),
        'caching': add_cacheable_prefix(prompt),
        'early_stopping': add_stop_sequences(prompt)
    }

    # Test each strategy and keep the fastest
    best_strategy = None
    best_latency = float('inf')

    for name, modified_prompt in strategies.items():
        latency = measure_average_latency(modified_prompt)
        if latency < best_latency:
            best_latency = latency
            best_strategy = modified_prompt

    return best_strategy
```

### Accuracy Improvement

```python
def improve_accuracy(prompt, failure_cases):
    improvements = []

    # Add constraints for common failures
    if has_format_errors(failure_cases):
        improvements.append("Output must be valid JSON with no additional text.")

    # Add examples for edge cases
    edge_cases = identify_edge_cases(failure_cases)
    if edge_cases:
        improvements.append(f"Examples of edge cases:\n{format_examples(edge_cases)}")

    # Add verification step
    if has_logical_errors(failure_cases):
        improvements.append("Before responding, verify your answer is logically consistent.")

    # Strengthen instructions
    if has_ambiguity_errors(failure_cases):
        improvements.append(clarify_ambiguous_instructions(prompt))

    return integrate_improvements(prompt, improvements)
```

## Performance Metrics

### Core Metrics

```python
from collections import Counter, defaultdict

import numpy as np


class PromptMetrics:
    @staticmethod
    def accuracy(responses, ground_truth):
        return sum(r == gt for r, gt in zip(responses, ground_truth)) / len(responses)

    @staticmethod
    def consistency(responses):
        # Measure how often identical inputs produce identical outputs
        input_responses = defaultdict(list)

        for inp, resp in responses:
            input_responses[inp].append(resp)

        consistency_scores = []
        for inp, resps in input_responses.items():
            if len(resps) > 1:
                # Fraction of responses matching the most common response
                most_common_count = Counter(resps).most_common(1)[0][1]
                consistency_scores.append(most_common_count / len(resps))

        return np.mean(consistency_scores) if consistency_scores else 1.0

    @staticmethod
    def token_efficiency(prompt, responses):
        avg_prompt_tokens = np.mean([count_tokens(prompt.format(**r['input'])) for r in responses])
        avg_response_tokens = np.mean([count_tokens(r['output']) for r in responses])
        return avg_prompt_tokens + avg_response_tokens

    @staticmethod
    def latency_p95(latencies):
        return np.percentile(latencies, 95)
```

### Automated Evaluation

```python
import time

import numpy as np


def evaluate_prompt_comprehensively(prompt, test_suite):
    results = {
        'accuracy': [],
        'consistency': [],
        'latency': [],
        'tokens': [],
        'success_rate': []
    }

    # Run each test case multiple times for consistency measurement
    for test_case in test_suite:
        runs = []
        for _ in range(3):  # 3 runs per test case
            start = time.time()
            response = llm.complete(prompt.format(**test_case['input']))
            latency = time.time() - start

            runs.append(response)
            results['latency'].append(latency)
            results['tokens'].append(count_tokens(prompt) + count_tokens(response))

        # Accuracy (best of 3 runs)
        accuracies = [evaluate_accuracy(r, test_case['expected']) for r in runs]
        results['accuracy'].append(max(accuracies))

        # Consistency (how similar are the 3 runs?)
        results['consistency'].append(calculate_similarity(runs))

        # Success rate (all runs successful?)
        results['success_rate'].append(all(is_valid(r) for r in runs))

    return {
        'avg_accuracy': np.mean(results['accuracy']),
        'avg_consistency': np.mean(results['consistency']),
        'p95_latency': np.percentile(results['latency'], 95),
        'avg_tokens': np.mean(results['tokens']),
        'success_rate': np.mean(results['success_rate'])
    }
```

## Failure Analysis

### Categorizing Failures

```python
class FailureAnalyzer:
    def categorize_failures(self, test_results):
        categories = {
            'format_errors': [],
            'factual_errors': [],
            'logic_errors': [],
            'incomplete_responses': [],
            'hallucinations': [],
            'off_topic': []
        }

        for result in test_results:
            if not result['success']:
                category = self.determine_failure_type(
                    result['response'],
                    result['expected']
                )
                categories[category].append(result)

        return categories

    def generate_fixes(self, categorized_failures):
        fixes = []

        if categorized_failures['format_errors']:
            fixes.append({
                'issue': 'Format errors',
                'fix': 'Add explicit format examples and constraints',
                'priority': 'high'
            })

        if categorized_failures['hallucinations']:
            fixes.append({
                'issue': 'Hallucinations',
                'fix': 'Add grounding instruction: "Base your answer only on provided context"',
                'priority': 'critical'
            })

        if categorized_failures['incomplete_responses']:
            fixes.append({
                'issue': 'Incomplete responses',
                'fix': 'Add: "Ensure your response fully addresses all parts of the question"',
                'priority': 'medium'
            })

        return fixes
```
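
`determine_failure_type` is left abstract above; a minimal heuristic sketch (the checks are illustrative placeholders, assuming JSON output is expected):

```python
import json

def determine_failure_type(response: str, expected: str) -> str:
    """Rough heuristic routing of a failed response into a category."""
    try:
        json.loads(response)
    except (json.JSONDecodeError, TypeError):
        return 'format_errors'
    if len(response) < 0.5 * len(expected):
        return 'incomplete_responses'
    # Real systems often use an LLM judge or rubric for the remaining cases
    return 'logic_errors'
```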

## Versioning and Rollback

### Prompt Version Control

```python
from datetime import datetime


class PromptVersionControl:
    def __init__(self, storage_path):
        self.storage = storage_path
        self.versions = []

    def save_version(self, prompt, metadata):
        version = {
            'id': len(self.versions),
            'prompt': prompt,
            'timestamp': datetime.now(),
            'metrics': metadata.get('metrics', {}),
            'description': metadata.get('description', ''),
            'parent_id': metadata.get('parent_id')
        }
        self.versions.append(version)
        self.persist()
        return version['id']

    def rollback(self, version_id):
        if version_id < len(self.versions):
            return self.versions[version_id]['prompt']
        raise ValueError(f"Version {version_id} not found")

    def compare_versions(self, v1_id, v2_id):
        v1 = self.versions[v1_id]
        v2 = self.versions[v2_id]

        return {
            'diff': generate_diff(v1['prompt'], v2['prompt']),
            'metrics_comparison': {
                metric: {
                    'v1': v1['metrics'].get(metric),
                    'v2': v2['metrics'].get(metric),
                    'change': v2['metrics'].get(metric, 0) - v1['metrics'].get(metric, 0)
                }
                for metric in set(v1['metrics'].keys()) | set(v2['metrics'].keys())
            }
        }
```
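
A brief usage sketch (`persist` and `generate_diff` are assumed helpers from the surrounding codebase):

```python
vc = PromptVersionControl("prompts/versions.json")
v0 = vc.save_version("Summarize: {text}", {'description': 'initial'})
v1 = vc.save_version("Summarize in 3 bullets: {text}",
                     {'description': 'add length constraint', 'parent_id': v0})

# Revert if the new version regresses
prompt = vc.rollback(v0)
```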

## Best Practices

1. **Establish a Baseline**: Always measure initial performance
2. **Change One Thing at a Time**: Isolate variables for clear attribution
3. **Test Thoroughly**: Use diverse, representative test cases
4. **Track Metrics**: Log all experiments and results
5. **Validate Significance**: Use statistical tests for A/B comparisons
6. **Document Changes**: Keep detailed notes on what changed and why
7. **Version Everything**: Enable rollback to previous versions
8. **Monitor Production**: Continuously evaluate deployed prompts

## Common Optimization Patterns

### Pattern 1: Add Structure

```
Before: "Analyze this text"
After: "Analyze this text for:\n1. Main topic\n2. Key arguments\n3. Conclusion"
```

### Pattern 2: Add Examples

```
Before: "Extract entities"
After: "Extract entities\n\nExample:\nText: Apple released iPhone\nEntities: {company: Apple, product: iPhone}"
```

### Pattern 3: Add Constraints

```
Before: "Summarize this"
After: "Summarize in exactly 3 bullet points, 15 words each"
```

### Pattern 4: Add Verification

```
Before: "Calculate..."
After: "Calculate... Then verify your calculation is correct before responding."
```
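
These patterns compose mechanically. A minimal sketch (the wrapper function and its parameters are illustrative, not part of any library):

```python
def apply_patterns(task, structure=None, constraints=None, verify=False):
    """Compose the structure, constraint, and verification patterns around a task."""
    prompt = task
    if structure:
        prompt += " for:\n" + "\n".join(
            f"{i}. {item}" for i, item in enumerate(structure, 1))
    if constraints:
        prompt += "\n" + constraints
    if verify:
        prompt += "\nVerify your answer is correct before responding."
    return prompt

prompt = apply_patterns(
    "Analyze this text",
    structure=["Main topic", "Key arguments", "Conclusion"],
    constraints="Respond in at most 100 words.",
    verify=True,
)
```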

## Tools and Utilities

- Prompt diff tools for version comparison
- Automated test runners
- Metric dashboards
- A/B testing frameworks
- Token counting utilities (see the sketch below)
- Latency profilers
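
As one example of a token counting utility, a minimal sketch using the `tiktoken` library (this assumes an OpenAI-style tokenizer; other providers ship their own):

```python
import tiktoken  # pip install tiktoken

def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
    """Count tokens under a specific encoding; counts vary across models."""
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(text))

print(count_tokens("Summarize in exactly 3 bullet points."))
```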