# Qwen-Image-Layered 클라우드 전환 (9/10): 비용 모니터링 대시보드 ## 비용 가시성의 중요성 클라우드 전환 후 가장 큰 우려: - "이번 달 비용이 얼마나 나올까?" - "어느 백엔드가 가장 많이 사용되고 있나?" - "예산을 초과하면 어떻게 알 수 있나?" **해결책**: 실시간 비용 모니터링 대시보드 ## 아키텍처 ``` [Application] → [Cost Tracker] → [Redis] → [Prometheus] → [Grafana] ↓ [Slack Alerts] ``` **컴포넌트**: 1. **Cost Tracker**: 모든 API 호출 비용 기록 2. **Redis**: 실시간 데이터 저장 3. **Prometheus**: 메트릭 수집 4. **Grafana**: 시각화 대시보드 5. **Slack**: 알림 ## 비용 추적 구현 ### 1. Cost Tracker 서비스 ```python # services/cost_tracker.py import redis from datetime import datetime from typing import Dict class CostTracker: # 백엔드별 시간당 비용 COST_PER_HOUR = { "VertexAI_T4": 0.45, "VertexAI_T4_Spot": 0.135, "HuggingFace_T4": 0.60, "Local": 0.0 } def __init__(self): self.redis = redis.Redis(decode_responses=True) def track_inference( self, backend: str, duration_seconds: float, job_id: str, spot_instance: bool = False ) -> float: """추론 비용 기록""" # 비용 계산 backend_key = f"{backend}_Spot" if spot_instance else backend cost_per_hour = self.COST_PER_HOUR.get(backend_key, 0) cost = (duration_seconds / 3600) * cost_per_hour # 메타데이터 now = datetime.now() date_key = now.strftime("%Y-%m-%d") hour_key = now.strftime("%Y-%m-%d:%H") # Redis에 저장 # 1. 일일 총 비용 self.redis.incrbyfloat(f"cost:daily:{date_key}", cost) self.redis.expire(f"cost:daily:{date_key}", 86400 * 90) # 90일 보관 # 2. 시간별 비용 self.redis.incrbyfloat(f"cost:hourly:{hour_key}", cost) self.redis.expire(f"cost:hourly:{hour_key}", 86400 * 7) # 7일 보관 # 3. 백엔드별 비용 self.redis.incrbyfloat(f"cost:backend:{backend}:{date_key}", cost) self.redis.expire(f"cost:backend:{backend}:{date_key}", 86400 * 90) # 4. 백엔드별 호출 횟수 self.redis.incr(f"count:backend:{backend}:{date_key}") self.redis.expire(f"count:backend:{backend}:{date_key}", 86400 * 90) # 5. 작업별 비용 self.redis.hset(f"job:{job_id}", "cost", cost) self.redis.hset(f"job:{job_id}", "backend", backend) return cost def get_daily_cost(self, date: str = None) -> float: """특정 날짜의 총 비용""" if not date: date = datetime.now().strftime("%Y-%m-%d") cost = self.redis.get(f"cost:daily:{date}") return float(cost) if cost else 0.0 def get_monthly_cost(self, year_month: str = None) -> float: """특정 월의 총 비용""" if not year_month: year_month = datetime.now().strftime("%Y-%m") # 해당 월의 모든 일자 합산 total = 0.0 for day in range(1, 32): date = f"{year_month}-{day:02d}" total += self.get_daily_cost(date) return total def get_backend_breakdown(self, date: str = None) -> Dict[str, float]: """백엔드별 비용 분석""" if not date: date = datetime.now().strftime("%Y-%m-%d") backends = ["VertexAI", "HuggingFace", "Local"] breakdown = {} for backend in backends: cost = self.redis.get(f"cost:backend:{backend}:{date}") breakdown[backend] = float(cost) if cost else 0.0 return breakdown ``` ### 2. Decomposer 통합 ```python # models/vertex_ai_decomposer.py (수정) from services.cost_tracker import CostTracker import time class VertexAIDecomposer(BaseDecomposer): def __init__(self): self.endpoint = aiplatform.Endpoint(ENDPOINT_NAME) self.cost_tracker = CostTracker() self._available = True async def decompose(self, image_path, num_layers, resolution, job_id=None): start_time = time.time() try: # ... 추론 로직 ... end_time = time.time() duration = end_time - start_time # 비용 기록 cost = self.cost_tracker.track_inference( backend="VertexAI", duration_seconds=duration, job_id=job_id, spot_instance=False # Spot 여부 ) logger.info(f"Inference cost: ${cost:.4f}") return layers except Exception as e: # ... ``` ## Prometheus 메트릭 Exporter ### 메트릭 정의 ```python # services/prometheus_exporter.py from prometheus_client import Gauge, Counter, Histogram, start_http_server from services.cost_tracker import CostTracker import time # Gauge (현재 값) daily_cost = Gauge('poster_decomposer_daily_cost_usd', 'Daily cost in USD') monthly_cost = Gauge('poster_decomposer_monthly_cost_usd', 'Monthly cost in USD') # Counter (누적) total_requests = Counter('poster_decomposer_requests_total', 'Total requests', ['backend']) total_cost = Counter('poster_decomposer_cost_total_usd', 'Total cost', ['backend']) # Histogram (분포) inference_duration = Histogram('poster_decomposer_inference_duration_seconds', 'Inference duration') inference_cost = Histogram('poster_decomposer_inference_cost_usd', 'Inference cost per request') class PrometheusExporter: def __init__(self): self.cost_tracker = CostTracker() def update_metrics(self): """메트릭 업데이트""" # 일일 비용 today_cost = self.cost_tracker.get_daily_cost() daily_cost.set(today_cost) # 월간 비용 month_cost = self.cost_tracker.get_monthly_cost() monthly_cost.set(month_cost) # 백엔드별 분석 breakdown = self.cost_tracker.get_backend_breakdown() for backend, cost in breakdown.items(): # Counter는 증가만 가능하므로 차이만 추가 # (실제로는 더 정교한 로직 필요) pass def start(self, port=8001): """Prometheus HTTP 서버 시작""" start_http_server(port) print(f"✅ Prometheus exporter started on port {port}") while True: self.update_metrics() time.sleep(60) # 1분마다 업데이트 if __name__ == "__main__": exporter = PrometheusExporter() exporter.start() ``` ### Prometheus 설정 ```yaml # prometheus.yml global: scrape_interval: 15s scrape_configs: - job_name: 'poster-decomposer' static_configs: - targets: ['localhost:8001'] ``` ## Grafana 대시보드 ### 대시보드 JSON ```json { "dashboard": { "title": "Poster Decomposer Cost Dashboard", "panels": [ { "title": "Daily Cost (USD)", "type": "graph", "targets": [ { "expr": "poster_decomposer_daily_cost_usd", "legendFormat": "Daily Cost" } ] }, { "title": "Monthly Cost Trend", "type": "graph", "targets": [ { "expr": "poster_decomposer_monthly_cost_usd", "legendFormat": "Monthly Cost" } ] }, { "title": "Backend Usage (%)", "type": "piechart", "targets": [ { "expr": "sum by (backend) (poster_decomposer_requests_total)" } ] }, { "title": "Cost per Request", "type": "histogram", "targets": [ { "expr": "histogram_quantile(0.95, poster_decomposer_inference_cost_usd)" } ] } ] } } ``` ### 스크린샷 예시 ``` ┌─────────────────────────────────────────────┐ │ Daily Cost (USD) │ │ ┌─────────────────────────────────────────┐ │ │ │ ╱──╲ │ │ │ │ ╱──╲ ╱ ╲ │ │ │ │ ╱──╲ ╱ ╲ ──╱ ╲── │ │ │ │ ──╱ ╲╱ │ │ │ └─────────────────────────────────────────┘ │ │ $12.34 today │ └─────────────────────────────────────────────┘ ┌────────────────────────────────────────────┐ │ Backend Usage │ │ VertexAI: 85% ████████████░░░░ │ │ HuggingFace: 10% ██░░░░░░░░░░░░░░░░░░ │ │ Local: 5% █░░░░░░░░░░░░░░░░░░░░░░░░░░ │ └────────────────────────────────────────────┘ ``` ## 비용 알림 설정 ### 1. Slack Webhook 설정 ```python # services/alert_service.py import requests from datetime import datetime class AlertService: def __init__(self, webhook_url): self.webhook_url = webhook_url def send_alert(self, title: str, message: str, color: str = "warning"): """Slack 알림 전송""" payload = { "attachments": [{ "title": title, "text": message, "color": color, "footer": "Poster Decomposer", "ts": int(datetime.now().timestamp()) }] } response = requests.post(self.webhook_url, json=payload) response.raise_for_status() def alert_daily_cost(self, cost: float, budget: float): """일일 비용 알림""" percentage = (cost / budget) * 100 if percentage > 100: color = "danger" title = "🚨 Budget Exceeded!" elif percentage > 80: color = "warning" title = "⚠️ Budget Warning" else: return # 알림 불필요 message = f"Today's cost: ${cost:.2f} ({percentage:.1f}% of daily budget ${budget})" self.send_alert(title, message, color) ``` ### 2. 자동 알림 스크립트 ```python # scripts/daily_cost_monitor.py from services.cost_tracker import CostTracker from services.alert_service import AlertService import schedule import time DAILY_BUDGET = 15.0 # $15/day SLACK_WEBHOOK = os.getenv("SLACK_WEBHOOK_URL") tracker = CostTracker() alerter = AlertService(SLACK_WEBHOOK) def check_daily_cost(): """일일 비용 체크""" today_cost = tracker.get_daily_cost() print(f"Today's cost: ${today_cost:.2f}") # 알림 전송 (필요시) alerter.alert_daily_cost(today_cost, DAILY_BUDGET) # 매일 18시에 체크 schedule.every().day.at("18:00").do(check_daily_cost) while True: schedule.run_pending() time.sleep(60) ``` ## 비용 최적화 제안 시스템 ### 자동 분석 ```python # services/cost_optimizer.py from services.cost_tracker import CostTracker from typing import List, Dict class CostOptimizer: def __init__(self): self.tracker = CostTracker() def analyze_and_suggest(self) -> List[Dict]: """비용 분석 및 최적화 제안""" suggestions = [] # 1. 백엔드 사용 패턴 분석 breakdown = self.tracker.get_backend_breakdown() hf_cost = breakdown.get("HuggingFace", 0) vertex_cost = breakdown.get("VertexAI", 0) # Hugging Face 사용량이 많으면 경고 if hf_cost > vertex_cost: suggestions.append({ "severity": "high", "title": "High Hugging Face cost detected", "message": f"Hugging Face cost (${hf_cost:.2f}) is higher than Vertex AI (${vertex_cost:.2f})", "action": "Consider migrating more workload to Vertex AI (15x cheaper)" }) # 2. Spot Instances 사용 분석 # (Spot 사용량 추적 구현 필요) # 3. 캐싱 효과 분석 cache_hits = int(self.tracker.redis.get("cache_hits:today") or 0) total_requests = int(self.tracker.redis.get("requests:today") or 1) cache_hit_rate = cache_hits / total_requests if cache_hit_rate < 0.1: suggestions.append({ "severity": "medium", "title": "Low cache hit rate", "message": f"Cache hit rate is only {cache_hit_rate*100:.1f}%", "action": "Consider increasing cache TTL or improving cache key strategy" }) return suggestions def generate_report(self) -> str: """최적화 리포트 생성""" suggestions = self.analyze_and_suggest() if not suggestions: return "✅ No optimization suggestions. Cost efficiency is optimal." report = "=== Cost Optimization Suggestions ===\n\n" for i, suggestion in enumerate(suggestions, 1): report += f"{i}. [{suggestion['severity'].upper()}] {suggestion['title']}\n" report += f" {suggestion['message']}\n" report += f" 💡 {suggestion['action']}\n\n" return report if __name__ == "__main__": optimizer = CostOptimizer() print(optimizer.generate_report()) ``` ## API 엔드포인트 ### 비용 조회 API ```python # api/cost.py from fastapi import APIRouter from services.cost_tracker import CostTracker from services.cost_optimizer import CostOptimizer router = APIRouter() @router.get("/cost/today") async def get_today_cost(): """오늘 비용 조회""" tracker = CostTracker() return { "date": datetime.now().strftime("%Y-%m-%d"), "cost": tracker.get_daily_cost() } @router.get("/cost/month") async def get_month_cost(): """이번 달 비용 조회""" tracker = CostTracker() return { "month": datetime.now().strftime("%Y-%m"), "cost": tracker.get_monthly_cost() } @router.get("/cost/breakdown") async def get_cost_breakdown(): """백엔드별 비용 분석""" tracker = CostTracker() return tracker.get_backend_breakdown() @router.get("/cost/suggestions") async def get_cost_suggestions(): """비용 최적화 제안""" optimizer = CostOptimizer() suggestions = optimizer.analyze_and_suggest() return {"suggestions": suggestions} ``` ## 실전 활용 ### 주간 리포트 자동 생성 ```python # scripts/weekly_cost_report.py from services.cost_tracker import CostTracker from services.cost_optimizer import CostOptimizer from datetime import datetime, timedelta def generate_weekly_report(): """주간 비용 리포트""" tracker = CostTracker() optimizer = CostOptimizer() # 지난 7일 비용 weekly_cost = 0.0 for i in range(7): date = (datetime.now() - timedelta(days=i)).strftime("%Y-%m-%d") daily_cost = tracker.get_daily_cost(date) weekly_cost += daily_cost # 백엔드 분석 breakdown = tracker.get_backend_breakdown() # 최적화 제안 suggestions = optimizer.analyze_and_suggest() # 리포트 생성 report = f""" === Weekly Cost Report === Period: {(datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')} to {datetime.now().strftime('%Y-%m-%d')} Total Cost: ${weekly_cost:.2f} Average Daily: ${weekly_cost/7:.2f} Backend Breakdown: - Vertex AI: ${breakdown.get('VertexAI', 0):.2f} - Hugging Face: ${breakdown.get('HuggingFace', 0):.2f} - Local: ${breakdown.get('Local', 0):.2f} {optimizer.generate_report()} """ return report if __name__ == "__main__": print(generate_weekly_report()) ``` ## 다음 단계 v10에서는 **운영 가이드 및 결론**을 다룬다: 1. 프로덕션 배포 체크리스트 2. 일반적인 문제 해결 방법 3. 프로젝트 회고 4. 향후 개선 방향 이제 모니터링 시스템이 완성되었고, 마지막으로 운영 지식을 정리할 차례다. --- **이전 글**: [하이브리드 아키텍처 (8/10)](./update-qwen-image-layered-project-v8.md) **다음 글**: [운영 가이드 및 결론 (10/10)](./update-qwen-image-layered-project-v10.md) **참고 자료**: - [Prometheus Best Practices](https://prometheus.io/docs/practices/) - [Grafana Dashboard Design](https://grafana.com/docs/grafana/latest/dashboards/) - [Slack Incoming Webhooks](https://api.slack.com/messaging/webhooks)