Production Context Engineering at Scale
Scale your context engineering systems to handle real-world complexity, performance demands, and cost constraints
60 min
Advanced
75% completion rate
50% popularity
production-systems
scalability
performance-optimization
cost-management
deployment
Prerequisites
Module 5: Production Context Engineering
Overview
Building a proof-of-concept is one thing. Scaling it to handle thousands of users, managing costs, and ensuring reliability is another challenge entirely. This module transforms your context engineering knowledge into production-ready systems that can handle real-world complexity at scale.
What You'll Learn
- Production architecture patterns for context-aware systems
- Performance optimization techniques
- Cost management and token economics
- Error handling and recovery strategies
- Monitoring and debugging at scale
Prerequisites
- Completed Modules 1-4
- Basic understanding of system architecture
- Experience with deployment (helpful but not required)
- About 60 minutes to work through the full implementation
Production Architecture Patterns
The Scalability Challenge
Your local prototype works great, but production demands:
- Handling 1000+ concurrent users
- Sub-second response times
- 99.9% uptime
- Costs that don't bankrupt you (see the back-of-envelope estimate after this list)
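A quick back-of-envelope estimate shows why these demands dominate the design. Every figure in the sketch below (request rate, tokens per request, price per 1K tokens, cache hit rate) is an assumption to replace with your own measurements, not a benchmark:

# Rough capacity/cost estimate; every number here is an assumed input.
requests_per_second = 50          # sustained load
tokens_per_request = 3_000        # prompt + completion
price_per_1k_tokens = 0.002       # USD, cheap-model pricing
cache_hit_rate = 0.6              # requests served without an LLM call

llm_calls_per_day = requests_per_second * 86_400 * (1 - cache_hit_rate)
daily_cost = llm_calls_per_day * tokens_per_request / 1_000 * price_per_1k_tokens
print(f"{llm_calls_per_day:,.0f} LLM calls/day ≈ ${daily_cost:,.0f}/day")
# Roughly 1.7M calls and $10K/day at these numbers: caching, routing to
# cheaper models, and token budgets are not optional at this scale.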
Reference Architecture
┌───────────────────────────────────────────────┐
│                 Load Balancer                 │
└───────────┬───────────────────────┬───────────┘
            │                       │
  ┌─────────▼─────────┐   ┌─────────▼─────────┐
  │    API Gateway    │   │    API Gateway    │
  └─────────┬─────────┘   └─────────┬─────────┘
            │                       │
  ┌─────────┴───────────────────────┴─────────┐
  │           Message Queue (Redis)           │
  └─────────┬─────────────────────────────────┘
            │
     ┌──────┴────┬───────────┬─────────────┐
     ▼           ▼           ▼             ▼
┌─────────┐ ┌─────────┐ ┌─────────┐   ┌─────────┐
│Worker 1 │ │Worker 2 │ │Worker 3 │...│Worker N │
└────┬────┘ └────┬────┘ └────┬────┘   └────┬────┘
     │           │           │             │
     └───────────┴──────┬────┴─────────────┘
                        │
          ┌─────────────┴──────────────┐
          │      Shared Resources      │
          ├────────────────────────────┤
          │ • Vector DB (Pinecone)     │
          │ • Cache (Redis)            │
          │ • Storage (S3)             │
          │ • LLM APIs                 │
          └────────────────────────────┘
Implementation Blueprint
# production_architecture.py
from dataclasses import dataclass, field
from typing import Any, Dict

import aiohttp
import redis
from celery import Celery


@dataclass
class ProductionConfig:
    vector_db_url: str = ""
    llm_api_keys: Dict[str, str] = field(default_factory=dict)
    redis_url: str = "redis://localhost:6379"
    max_workers: int = 10
    cache_ttl: int = 3600    # seconds
    rate_limit: int = 100    # requests per minute


# The Celery app must exist at module import time so the task below registers
celery_app = Celery('context_engine', broker="redis://localhost:6379")


class ContextEngineeringProduction:
    def __init__(self, config: ProductionConfig):
        self.config = config
        self.redis = redis.from_url(config.redis_url)
        self.init_connection_pools()

    def init_connection_pools(self):
        """Initialize connection pools for efficiency"""
        # Vector DB connection pool
        self.vector_pool = VectorDBPool(
            url=self.config.vector_db_url,
            max_connections=50
        )
        # One pooled HTTP session per LLM provider
        self.llm_sessions = {
            provider: aiohttp.ClientSession(
                connector=aiohttp.TCPConnector(limit=30)
            )
            for provider in self.config.llm_api_keys
        }


# A single engine instance shared by all tasks in this worker process
engine = ContextEngineeringProduction(ProductionConfig())


@celery_app.task(bind=True, max_retries=3)
def process_context_request(self, request_id: str,
                            user_input: str,
                            context_params: Dict[str, Any]):
    """Celery task for async processing (`self` is the bound task)"""
    try:
        # Check cache first
        cached = engine.get_cached_response(user_input)
        if cached:
            return cached

        # SCIC pipeline: Select -> Compress -> Isolate -> Compose
        selected = engine.distributed_select(context_params)
        compressed = engine.parallel_compress(selected)
        isolated = engine.isolate_contexts(compressed)
        response = engine.compose_response(isolated, user_input)

        # Cache successful response
        engine.cache_response(user_input, response)
        return response
    except Exception as e:
        # Retry with exponential backoff: 1s, 2s, 4s, ...
        raise self.retry(exc=e, countdown=2 ** self.request.retries)
Distributed SCIC Implementation
class DistributedSCIC:
    def __init__(self, config: ProductionConfig):
        self.config = config

    async def distributed_select(self, params: Dict[str, Any]) -> Dict:
        """Parallel selection across multiple sources"""
        tasks = []

        # Create selection tasks
        if params.get('search_code'):
            tasks.append(self.select_from_codebase(params))
        if params.get('search_docs'):
            tasks.append(self.select_from_docs(params))
        if params.get('search_history'):
            tasks.append(self.select_from_history(params))
        if params.get('search_vector'):
            tasks.append(self.select_from_vectors(params))

        # Execute in parallel
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Handle partial failures
        selected = {}
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                self.log_selection_error(tasks[i], result)
            else:
                selected.update(result)
        return selected

    async def parallel_compress(self, selected: Dict) -> Dict:
        """Compress different context types in parallel"""
        compression_tasks = {
            'code': self.compress_code_context(selected.get('code', '')),
            'docs': self.compress_documentation(selected.get('docs', '')),
            'history': self.compress_conversation(selected.get('history', [])),
            'vectors': self.compress_vector_results(selected.get('vectors', []))
        }

        # Run all compressors concurrently; one failure must not sink the rest
        results = await asyncio.gather(*compression_tasks.values(),
                                       return_exceptions=True)

        compressed = {}
        for key, result in zip(compression_tasks, results):
            if isinstance(result, Exception):
                self.log_compression_error(key, result)
                compressed[key] = ''  # Graceful degradation
            else:
                compressed[key] = result
        return compressed
Performance Optimization
Caching Strategies
import hashlib
import json


class IntelligentCache:
    def __init__(self, redis_client: redis.Redis):
        self.redis = redis_client
        self.stats = CacheStats()

    def get_cache_key(self, query: str, context_hash: str) -> str:
        """Generate cache key from query and context"""
        # Normalize query
        normalized = self.normalize_query(query)
        # Include context hash so stale contexts don't serve wrong answers
        digest = hashlib.md5(f'{normalized}:{context_hash}'.encode()).hexdigest()
        return f"response:{digest}"

    def should_cache(self, query: str, response: str) -> bool:
        """Determine if response is worth caching"""
        # Don't cache errors
        if "error" in response.lower():
            return False
        # Don't cache time-sensitive queries
        time_sensitive = ['current', 'now', 'today', 'latest']
        if any(word in query.lower() for word in time_sensitive):
            return False
        # Cache if response is substantial
        return len(response) > 100

    async def get_or_compute(self, key: str, compute_func, ttl: int = 3600):
        """Cache-aside pattern with computation"""
        # Try cache
        cached = self.redis.get(key)
        if cached:
            self.stats.hit()
            return json.loads(cached)

        # Compute if miss
        self.stats.miss()

        # Distributed lock to prevent a cache stampede
        lock_key = f"lock:{key}"
        if self.redis.set(lock_key, "1", nx=True, ex=30):
            try:
                result = await compute_func()
                # Cache result
                if self.should_cache(key, str(result)):
                    self.redis.setex(key, ttl, json.dumps(result))
                return result
            finally:
                self.redis.delete(lock_key)
        else:
            # Another worker is computing; poll the cache while waiting
            for _ in range(30):
                await asyncio.sleep(1)
                cached = self.redis.get(key)
                if cached:
                    return json.loads(cached)
            # Timeout, compute anyway
            return await compute_func()
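A sketch of how a worker might wrap an expensive LLM call with this cache. The `generate_answer` coroutine and the `CacheStats` helper referenced by the class are assumed to exist elsewhere:

# Hypothetical usage; generate_answer() stands in for the real LLM call.
cache = IntelligentCache(redis.from_url("redis://localhost:6379"))

async def answer(query: str, context_hash: str):
    key = cache.get_cache_key(query, context_hash)
    # Only one worker computes on a miss; the rest wait for the cached value
    return await cache.get_or_compute(
        key,
        compute_func=lambda: generate_answer(query),
        ttl=1800
    )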
Context Window Optimization
class ContextOptimizer:
    def __init__(self, max_tokens: int = 100000):
        self.max_tokens = max_tokens
        self.tokenizer = self.load_tokenizer()

    def optimize_context(self, contexts: Dict[str, Any]) -> Dict[str, Any]:
        """Optimize context to fit window while maximizing value"""
        # Calculate token usage
        token_counts = {
            key: self.count_tokens(str(value))
            for key, value in contexts.items()
        }
        total_tokens = sum(token_counts.values())
        if total_tokens <= self.max_tokens:
            return contexts  # Everything fits

        # Priority-based optimization
        priorities = {
            'current_task': 1.0,
            'recent_context': 0.8,
            'relevant_history': 0.6,
            'general_knowledge': 0.4,
            'examples': 0.2
        }

        # Allocate tokens by priority, highest first
        optimized = {}
        remaining_tokens = self.max_tokens
        for key in sorted(priorities.keys(),
                          key=lambda k: priorities.get(k, 0),
                          reverse=True):
            if key in contexts:
                allocated = int(remaining_tokens * priorities[key])
                optimized[key] = self.truncate_to_tokens(
                    contexts[key],
                    min(allocated, token_counts[key])
                )
                remaining_tokens -= self.count_tokens(optimized[key])
        return optimized

    def progressive_summarization(self, text: str, target_tokens: int) -> str:
        """Progressively summarize to fit token budget"""
        current_tokens = self.count_tokens(text)
        if current_tokens <= target_tokens:
            return text

        # Pick a compression level from the required ratio
        compression_ratio = target_tokens / current_tokens
        if compression_ratio > 0.7:
            # Light compression: remove redundancy
            return self.remove_redundancy(text)
        elif compression_ratio > 0.4:
            # Medium compression: extract key points
            return self.extract_key_points(text)
        else:
            # Heavy compression: high-level summary
            return self.generate_summary(text, target_tokens)
Cost Management
Token Economics
from collections import defaultdict


class CostManager:
    def __init__(self, budgets: Dict[str, float]):
        self.budgets = budgets            # Monthly budgets per model (USD)
        self.usage = defaultdict(float)   # Spend so far per model (USD)
        self.costs_per_1k = {             # USD per 1K tokens
            'gpt-4': 0.03,
            'gpt-3.5': 0.002,
            'claude-opus': 0.015,
            'claude-sonnet': 0.003
        }

    def select_model(self, task_complexity: float,
                     priority: str = 'balanced') -> str:
        """Select model based on task and budget"""
        # Calculate remaining budget
        remaining = {
            model: self.budgets[model] - self.usage[model]
            for model in self.budgets
        }

        # Map task complexity to the cheapest model that can handle it
        if task_complexity > 0.8 and remaining['claude-opus'] > 10:
            return 'claude-opus'
        elif task_complexity > 0.5 and remaining['gpt-4'] > 10:
            return 'gpt-4'
        elif remaining['claude-sonnet'] > 5:
            return 'claude-sonnet'
        else:
            return 'gpt-3.5'  # Fallback

    def optimize_prompt(self, prompt: str, model: str) -> str:
        """Optimize prompt for cost without losing effectiveness"""
        optimizations = {
            'remove_examples': self.should_remove_examples(model),
            'compress_context': self.should_compress_context(model),
            'use_references': self.should_use_references(model)
        }

        optimized = prompt
        if optimizations['remove_examples']:
            optimized = self.remove_verbose_examples(optimized)
        if optimizations['compress_context']:
            optimized = self.aggressive_compression(optimized)
        if optimizations['use_references']:
            optimized = self.replace_with_references(optimized)
        return optimized

    def track_usage(self, model: str, tokens: int):
        """Track usage and alert when approaching budget"""
        cost = (tokens / 1000) * self.costs_per_1k[model]
        self.usage[model] += cost

        # Alert thresholds
        usage_percent = self.usage[model] / self.budgets[model]
        if usage_percent > 0.9:
            self.send_budget_alert(model, 'critical')
        elif usage_percent > 0.75:
            self.send_budget_alert(model, 'warning')
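A minimal usage sketch for the cost manager; the budget figures, complexity score, and token count below are illustrative assumptions:

# Illustrative usage; budgets and the complexity score are made-up numbers.
cost_manager = CostManager(budgets={
    'gpt-4': 500.0, 'gpt-3.5': 100.0,
    'claude-opus': 500.0, 'claude-sonnet': 200.0
})

model = cost_manager.select_model(task_complexity=0.9)  # likely 'claude-opus'
# ... call the chosen model, then record what it actually consumed:
cost_manager.track_usage(model, tokens=4200)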
Hybrid Approach: Local + Cloud
from sentence_transformers import SentenceTransformer


class HybridContextEngine:
    def __init__(self):
        self.local_models = self.init_local_models()
        self.cloud_clients = self.init_cloud_clients()

    def init_local_models(self):
        """Initialize local models for cost-effective processing"""
        return {
            'embeddings': SentenceTransformer('all-MiniLM-L6-v2'),
            'classification': load_local_classifier(),
            'ner': load_local_ner()
        }

    async def process_with_routing(self, task: Dict) -> Any:
        """Route to local or cloud based on task"""
        # Classification is cheap - do it locally
        task_type = self.local_models['classification'].predict(
            task['input']
        )

        # Simple tasks handled locally
        if task_type in ['simple_query', 'classification', 'extraction']:
            return await self.process_locally(task)

        # Complex tasks need cloud
        if task_type in ['generation', 'complex_reasoning', 'coding']:
            return await self.process_cloud(task)

        # Hybrid approach for medium complexity
        local_context = await self.prepare_context_locally(task)
        return await self.process_cloud_with_context(task, local_context)
Error Handling and Recovery
Resilient Context Systems
class ResilientContextManager:
    def __init__(self):
        self.circuit_breaker = CircuitBreaker()
        self.fallback_strategies = self.init_fallbacks()

    async def get_context_with_fallback(self, query: str) -> Dict:
        """Get context with multiple fallback strategies"""
        strategies = [
            self.get_from_primary_source,
            self.get_from_cache,
            self.get_from_backup_source,
            self.get_minimal_context
        ]

        for strategy in strategies:
            try:
                if self.circuit_breaker.is_open(strategy.__name__):
                    continue
                context = await strategy(query)
                if context:
                    self.circuit_breaker.record_success(strategy.__name__)
                    return context
            except Exception as e:
                self.circuit_breaker.record_failure(strategy.__name__)
                self.log_error(f"Strategy {strategy.__name__} failed", e)
                continue

        # All strategies failed
        return self.get_emergency_context()

    def handle_partial_failure(self, results: Dict[str, Any]) -> Dict:
        """Handle partial context retrieval failures"""
        required_contexts = ['user_history', 'project_state']
        optional_contexts = ['similar_examples', 'documentation']

        # Check required contexts
        missing_required = [
            ctx for ctx in required_contexts
            if ctx not in results or results[ctx] is None
        ]
        if missing_required:
            # Try to reconstruct from cache or alternatives
            for ctx in missing_required:
                results[ctx] = self.reconstruct_context(ctx)

        # Handle optional contexts gracefully
        for ctx in optional_contexts:
            if ctx not in results or results[ctx] is None:
                results[ctx] = self.get_default_context(ctx)
        return results
Monitoring and Observability
import time
from contextlib import contextmanager

from prometheus_client import Counter, Histogram


class ContextEngineMonitor:
    def __init__(self):
        self.metrics = self.init_metrics()
        self.tracer = self.init_tracing()

    def init_metrics(self):
        """Initialize Prometheus metrics"""
        return {
            'request_duration': Histogram(
                'context_request_duration_seconds',
                'Time spent processing context requests',
                ['operation', 'status']
            ),
            'cache_hits': Counter(
                'context_cache_hits_total',
                'Number of cache hits',
                ['cache_type']
            ),
            'token_usage': Counter(
                'llm_tokens_used_total',
                'Total tokens used',
                ['model', 'operation']
            ),
            'errors': Counter(
                'context_errors_total',
                'Total errors',
                ['error_type', 'operation']
            )
        }

    @contextmanager
    def monitor_operation(self, operation: str):
        """Monitor any operation with automatic metrics"""
        start_time = time.time()
        status = 'success'
        try:
            yield
        except Exception as e:
            status = 'error'
            self.metrics['errors'].labels(
                error_type=type(e).__name__,
                operation=operation
            ).inc()
            raise
        finally:
            duration = time.time() - start_time
            self.metrics['request_duration'].labels(
                operation=operation,
                status=status
            ).observe(duration)

    def create_dashboard_config(self):
        """Grafana dashboard configuration"""
        return {
            'panels': [
                {
                    'title': 'Request Rate',
                    'query': 'rate(context_request_duration_seconds_count[5m])'
                },
                {
                    'title': 'Cache Hit Rate',
                    'query': 'rate(context_cache_hits_total[5m]) / rate(context_request_duration_seconds_count[5m])'
                },
                {
                    'title': 'Token Usage by Model',
                    'query': 'sum by (model) (rate(llm_tokens_used_total[5m]))'
                },
                {
                    'title': 'Error Rate',
                    'query': 'rate(context_errors_total[5m])'
                },
                {
                    'title': 'P95 Response Time',
                    'query': 'histogram_quantile(0.95, rate(context_request_duration_seconds_bucket[5m]))'
                }
            ]
        }
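A usage sketch for the monitor: each pipeline stage is wrapped in the context manager so it gets its own labelled timing and error metrics. The `run_selection` and `compose_response` calls are hypothetical stage functions, and the class's undefined helpers (such as `init_tracing`) are assumed to be filled in:

monitor = ContextEngineMonitor()

def handle_request(query: str):
    # Timing and error counts are recorded per operation label
    with monitor.monitor_operation('select'):
        selected = run_selection(query)       # hypothetical stage function
    with monitor.monitor_operation('compose'):
        return compose_response(selected)     # hypothetical stage function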
Debugging Context Systems
Context Debugging Tools
import os
import time
from datetime import datetime


class ContextDebugger:
    def __init__(self):
        self.enabled = os.getenv('CONTEXT_DEBUG', 'false') == 'true'

    def trace_context_flow(self, request_id: str):
        """Trace context through the system (generator: yields one record per stage)"""
        if not self.enabled:
            return

        trace = {
            'request_id': request_id,
            'timestamp': datetime.now().isoformat(),
            'stages': []
        }

        # Hook into each stage of the SCIC pipeline
        stages = ['select', 'compress', 'isolate', 'compose']
        for stage in stages:
            stage_data = {
                'name': stage,
                'start': time.time(),
                'input_size': 0,
                'output_size': 0,
                'errors': []
            }
            # Hand the record to the caller, which fills it in while the stage runs
            yield stage_data
            stage_data['duration'] = time.time() - stage_data['start']
            trace['stages'].append(stage_data)

        # Save trace for analysis
        self.save_trace(trace)

    def analyze_context_quality(self, context: Dict) -> Dict:
        """Analyze context quality metrics"""
        analysis = {
            'completeness': self.check_completeness(context),
            'relevance': self.calculate_relevance(context),
            'redundancy': self.detect_redundancy(context),
            'coherence': self.check_coherence(context),
            'size_efficiency': self.calculate_efficiency(context),
            'recommendations': []
        }

        # Generate recommendations
        if analysis['redundancy'] > 0.3:
            analysis['recommendations'].append(
                'High redundancy detected. Consider better compression.'
            )
        if analysis['relevance'] < 0.7:
            analysis['recommendations'].append(
                'Low relevance score. Review selection criteria.'
            )
        return analysis
Production Deployment Checklist
Pre-deployment Validation
class DeploymentValidator:
    def __init__(self):
        self.checks = []

    def validate_deployment(self) -> bool:
        """Run all deployment checks"""
        checks = [
            self.check_configuration,
            self.check_dependencies,
            self.check_performance,
            self.check_security,
            self.check_monitoring,
            self.check_rollback_plan
        ]

        results = {}
        all_passed = True
        for check in checks:
            name = check.__name__
            try:
                result = check()
                results[name] = {'status': 'passed', 'details': result}
            except Exception as e:
                results[name] = {'status': 'failed', 'error': str(e)}
                all_passed = False

        self.generate_report(results)
        return all_passed

    def check_performance(self):
        """Run performance benchmarks"""
        benchmarks = {
            'context_selection': self.benchmark_selection,
            'compression': self.benchmark_compression,
            'end_to_end': self.benchmark_full_pipeline
        }

        results = {}
        for name, benchmark in benchmarks.items():
            duration, throughput = benchmark()
            results[name] = {
                'duration_ms': duration,
                'throughput_rps': throughput
            }
            # Check against SLAs
            if name == 'end_to_end' and duration > 1000:
                raise ValueError(f"End-to-end latency {duration}ms exceeds 1s SLA")
        return results
Deployment Strategy
# kubernetes deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: context-engine
spec:
  replicas: 3
  selector:
    matchLabels:
      app: context-engine
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        app: context-engine
    spec:
      containers:
        - name: context-engine
          image: context-engine:latest
          resources:
            requests:
              memory: "2Gi"
              cpu: "1"
            limits:
              memory: "4Gi"
              cpu: "2"
          env:
            - name: WORKER_CONCURRENCY
              value: "4"
          readinessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 10
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            periodSeconds: 30
Real-World Production Examples
Example 1: E-commerce Assistant
class EcommerceContextEngine:
    """Production context engine for e-commerce support"""

    def __init__(self):
        self.config = ProductionConfig(
            max_workers=20,
            cache_ttl=1800,   # 30 min for product data
            rate_limit=200    # Higher limit for peak sales
        )

    async def handle_customer_query(self, query: str, customer_id: str):
        """Handle customer query with full context"""
        # Gather context from several sources in parallel
        context_tasks = [
            self.get_customer_history(customer_id),
            self.get_relevant_products(query),
            self.get_current_promotions(),
            self.get_similar_resolved_issues(query)
        ]
        contexts = await asyncio.gather(*context_tasks)

        # Smart routing based on query type
        if self.is_order_query(query):
            return await self.handle_order_query(query, contexts)
        elif self.is_product_query(query):
            return await self.handle_product_query(query, contexts)
        else:
            return await self.handle_general_query(query, contexts)
Example 2: Code Review System
class ProductionCodeReviewer:
    """Scalable code review system"""

    def __init__(self):
        self.review_queue = RedisQueue('code-reviews')
        self.workers = []

    async def review_pull_request(self, pr_data: Dict):
        """Review PR with distributed processing"""
        # Split PR into reviewable chunks
        chunks = self.split_pr_into_chunks(pr_data)

        # Distribute review tasks
        review_tasks = []
        for chunk in chunks:
            task_id = self.review_queue.enqueue({
                'type': 'review_chunk',
                'chunk': chunk,
                'pr_id': pr_data['id']
            })
            review_tasks.append(task_id)

        # Wait for all reviews with timeout
        reviews = await self.wait_for_reviews(review_tasks, timeout=300)

        # Aggregate results
        final_review = self.aggregate_reviews(reviews)
        return final_review
Checkpoint Task
Your Mission
Deploy a production-ready context-engineered agent that:
- Handles concurrent requests (minimum 100 RPS)
- Maintains sub-second latency (P95 < 1s)
- Implements cost controls (< $100/day)
- Includes monitoring (metrics and alerts)
- Handles failures gracefully (99.9% uptime)
Architecture Requirements
- Load balancing across multiple workers
- Redis for caching and queuing
- Vector database for semantic search
- Monitoring with Prometheus/Grafana
- Error tracking with Sentry
- Automated scaling based on load
Performance Targets
- ✅ 100+ requests per second
- ✅ P95 latency under 1 second
- ✅ Cache hit rate > 60%
- ✅ Error rate < 0.1%
- ✅ Automatic recovery from failures
Deliverables
- Architecture diagram showing all components
- Deployment configuration (Docker/K8s)
- Performance test results (see the load-test sketch after this list)
- Monitoring dashboard
- Cost analysis with projections
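For the performance-test deliverable, a minimal asyncio load-test sketch is shown below. It assumes your service exposes an HTTP endpoint (the `http://localhost:8080/query` URL, payload, and request count are assumptions) and reports throughput and P95 latency against the targets above:

# Minimal load-test sketch; URL, payload, and request count are assumptions.
import asyncio
import statistics
import time

import aiohttp

URL = "http://localhost:8080/query"   # hypothetical endpoint
CONCURRENCY = 100
REQUESTS = 2_000

async def one_request(session: aiohttp.ClientSession, latencies: list):
    start = time.perf_counter()
    async with session.post(URL, json={"query": "ping"}) as resp:
        await resp.read()
    latencies.append(time.perf_counter() - start)

async def main():
    latencies: list = []
    connector = aiohttp.TCPConnector(limit=CONCURRENCY)
    start = time.perf_counter()
    async with aiohttp.ClientSession(connector=connector) as session:
        await asyncio.gather(*(one_request(session, latencies)
                               for _ in range(REQUESTS)))
    elapsed = time.perf_counter() - start
    p95 = statistics.quantiles(latencies, n=20)[-1]   # 95th percentile
    print(f"throughput: {REQUESTS / elapsed:.0f} rps, p95: {p95 * 1000:.0f} ms")

asyncio.run(main())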
Production Best Practices
Do's ✅
- Start with observability - You can't fix what you can't see
- Design for failure - Everything will fail eventually
- Cache aggressively - But invalidate intelligently
- Monitor costs - They can spiral quickly
- Version your contexts - For rollback capability
Don'ts ❌
- Don't store sensitive data in context - PII/PHI considerations
- Don't ignore rate limits - Implement backoff strategies (see the sketch after this list)
- Don't skip load testing - Find limits before users do
- Don't forget cleanup - Old contexts consume resources
- Don't deploy without rollback - Always have an escape plan
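A minimal retry-with-backoff sketch for rate-limited LLM calls; the `call_llm` coroutine and `RateLimitError` exception are stand-ins for whichever client library you use:

import asyncio
import random

async def call_with_backoff(call_llm, *args, max_retries: int = 5):
    """Retry a rate-limited call with exponential backoff and jitter."""
    for attempt in range(max_retries):
        try:
            return await call_llm(*args)
        except RateLimitError:                    # stand-in exception type
            if attempt == max_retries - 1:
                raise
            # 1s, 2s, 4s, ... plus jitter so workers don't retry in lockstep
            delay = (2 ** attempt) + random.uniform(0, 1)
            await asyncio.sleep(delay)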
Next Steps
You've learned to build production context systems. Module 6 brings it all together with a real-world implementation project.
Preview of Module 6
- Choose your implementation project
- Apply all concepts learned
- Build a complete system
- Present your results
Ready for the final challenge? Module 6 awaits!