监控与维护
确保自动化系统长期稳定运行的关键。
📊 监控体系
三层监控模型
┌─────────────────────────────────┐
│ 业务层监控 │
│ (自动化达成了目标吗?) │
└─────────────────────────────────┘
↓
┌─────────────────────────────────┐
│ 应用层监控 │
│ (脚本正常执行吗?) │
└─────────────────────────────────┘
↓
┌─────────────────────────────────┐
│ 基础层监控 │
│ (系统资源充足吗?) │
└─────────────────────────────────┘
1. 执行监控
#!/bin/bash
# execution-monitor.sh — track automation script runs and alert on
# repeated failures. State (log + per-script failure counters) lives
# under ~/.automation.
MONITOR_LOG="$HOME/.automation/monitor.log"
ALERT_THRESHOLD=3 # alert after 3 consecutive failures
# fix: create the state directory up front so the first append to the
# monitor log or a counter file cannot fail with "No such file or directory"
mkdir -p "$HOME/.automation"
# Run one automation script under a 5-minute timeout and record the outcome.
# Success: log duration and clear the failure counter.
# Failure: log the exit code and possibly raise an alert.
function monitor_execution() {
  local target=$1
  local began ended rc
  began=$(date +%s)
  if timeout 300 "$target"; then
    ended=$(date +%s)
    log_success "$target" "$((ended - began))"
    reset_failure_count "$target"
  else
    rc=$?  # capture the failing exit status before any other command runs
    log_failure "$target" "$rc"
    check_alert "$target"
  fi
}
# Append a success record (epoch,success,script,duration) to the monitor log.
function log_success() {
  local name=$1 elapsed=$2
  printf '%s,success,%s,%s\n' "$(date +%s)" "$name" "$elapsed" >> "$MONITOR_LOG"
}
# Append a failure record (epoch,failure,script,exit_code) to the monitor log
# and bump the per-script consecutive-failure counter read by check_alert.
function log_failure() {
  local script=$1
  local exit_code=$2
  echo "$(date +%s),failure,$script,$exit_code" >> "$MONITOR_LOG"
  # Track consecutive failures in a per-script counter file.
  local count_dir="$HOME/.automation"
  mkdir -p "$count_dir"  # fix: counter dir may not exist on first failure
  # fix: quote the basename argument so script paths with spaces work
  local count_file="$count_dir/failure_count_$(basename "$script")"
  local count
  count=$(cat "$count_file" 2>/dev/null || echo 0)
  echo $((count + 1)) > "$count_file"
}
# Zero the consecutive-failure counter for one script (called after a success).
function reset_failure_count() {
  local counter="$HOME/.automation/failure_count_$(basename "$1")"
  echo 0 > "$counter"
}
# Fire an alert when a script's consecutive-failure count reaches the threshold.
function check_alert() {
  local target=$1
  local counter="$HOME/.automation/failure_count_$(basename "$target")"
  local failures
  failures=$(cat "$counter" 2>/dev/null || echo 0)
  if [ "$failures" -ge "$ALERT_THRESHOLD" ]; then
    send_alert "$target" "$failures"
  fi
}
# Fan an alert out through every configured channel.
# $1 = failing script path, $2 = consecutive-failure count.
# NOTE(review): osascript is macOS-only and 'mail' assumes a configured MTA —
# on other hosts those channels silently fail; confirm per deployment.
function send_alert() {
local script=$1
local count=$2
# macOS desktop notification
osascript -e "display notification \"$script 连续失败 $count 次\" with title \"🚨 自动化告警\""
# Slack notification (only when a webhook URL is configured in the env)
if [ -n "$SLACK_WEBHOOK" ]; then
curl -X POST "$SLACK_WEBHOOK" \
-H 'Content-Type: application/json' \
-d "{\"text\": \"⚠️ $script 连续失败 $count 次\"}"
fi
# Email notification
echo "$script 连续失败 $count 次,请检查" | mail -s "自动化告警" admin@company.com
}
2. 性能监控
#!/usr/bin/env python3
# performance-monitor.py
import time
import psutil
import sqlite3
from datetime import datetime
try:
    # psutil is optional: fall back to zeroed CPU/memory samples when it is
    # not installed, so timings and exit codes are still recorded.
    import psutil
except ImportError:
    psutil = None


class PerformanceMonitor:
    """Record per-script execution metrics (duration, CPU, memory, exit
    code) in a SQLite database and print simple aggregate reports."""

    def __init__(self, db_path='~/.automation/performance.db'):
        # fix: sqlite3.connect() does NOT expand '~' — the original literally
        # tried to open a file under a directory named '~'. Expand the path
        # and make sure its parent directory exists before connecting.
        db_path = os.path.expanduser(db_path)
        parent = os.path.dirname(db_path)
        if db_path != ':memory:' and parent:
            os.makedirs(parent, exist_ok=True)
        self.db = sqlite3.connect(db_path)
        self.setup_db()

    def setup_db(self):
        """Create the metrics table on first use (idempotent)."""
        self.db.execute('''
            CREATE TABLE IF NOT EXISTS metrics (
                timestamp INTEGER,
                script TEXT,
                duration_ms INTEGER,
                cpu_percent REAL,
                memory_mb REAL,
                exit_code INTEGER
            )
        ''')

    def _sample(self):
        """Return (cpu_percent, memory_mb) for this process.

        Best-effort: returns (0.0, 0.0) when psutil is unavailable so the
        monitor never becomes a hard dependency on it.
        """
        if psutil is None:
            return 0.0, 0.0
        proc = psutil.Process()
        return proc.cpu_percent(), proc.memory_info().rss / 1024 / 1024

    def monitor(self, script_name, func):
        """Run func(), record its metrics under script_name, return its result.

        Exceptions from func() are swallowed (result None, exit_code 1) —
        the monitor must never break the automation it observes.
        """
        start_time = time.time()
        start_cpu, start_mem = self._sample()
        try:
            result = func()
            exit_code = 0
        except Exception:
            exit_code = 1
            result = None
        duration_ms = int((time.time() - start_time) * 1000)
        end_cpu, end_mem = self._sample()
        self.db.execute(
            'INSERT INTO metrics VALUES (?, ?, ?, ?, ?, ?)',
            (
                int(time.time()),
                script_name,
                duration_ms,
                end_cpu - start_cpu,
                end_mem - start_mem,
                exit_code,
            ),
        )
        self.db.commit()
        return result

    def report(self, script_name=None, days=7):
        """Print an aggregate execution report for the last `days` days,
        optionally restricted to one script."""
        cutoff = int(time.time()) - (days * 86400)
        query = '''
            SELECT
                script,
                COUNT(*) as executions,
                AVG(duration_ms) as avg_duration,
                MAX(duration_ms) as max_duration,
                AVG(cpu_percent) as avg_cpu,
                AVG(memory_mb) as avg_memory,
                SUM(CASE WHEN exit_code != 0 THEN 1 ELSE 0 END) as failures
            FROM metrics
            WHERE timestamp > ?
        '''
        if script_name:
            query += ' AND script = ?'
            params = (cutoff, script_name)
        else:
            query += ' GROUP BY script'
            params = (cutoff,)
        cursor = self.db.execute(query, params)
        print(f"📊 性能报告 (最近{days}天)")
        print("=" * 80)
        print(f"{'脚本':<30} {'执行次数':<10} {'平均耗时':<12} {'失败次数':<10}")
        print("-" * 80)
        for row in cursor:
            script, executions, avg_duration, max_duration, avg_cpu, avg_memory, failures = row
            # fix: with script_name and no matching rows, the aggregate
            # query yields one all-NULL row; skip it instead of crashing
            # on avg_duration / 1000.
            if not executions:
                continue
            print(f"{script:<30} {executions:<10} {avg_duration/1000:.2f}s {failures:<10}")
# Example usage: wrap an automation entry point with the monitor.
# fix: restored the indentation lost in the original (`pass` was not
# indented under `def`, which is a SyntaxError).
monitor = PerformanceMonitor()

def my_automation():
    # The actual automation logic goes here.
    pass

monitor.monitor('my-automation', my_automation)
monitor.report(days=7)
3. 健康检查
#!/bin/bash
# health-check.sh
# Overall system health check: critical scripts, CLI dependencies, API
# reachability, disk usage, log footprint, and recent failure rate.
# Returns 0 when healthy, 1 when any hard check fails.
function health_check() {
  local status=0
  echo "🏥 健康检查"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━"

  # 1. Critical scripts must exist and be executable.
  echo ""
  echo "📋 脚本检查"
  # fix: tilde does not expand inside double quotes, so the original paths
  # literally started with "~/" and every check failed; use $HOME instead.
  CRITICAL_SCRIPTS=(
    "$HOME/.automation/scripts/backup.sh"
    "$HOME/.automation/scripts/deploy.sh"
    "$HOME/.automation/scripts/monitor.sh"
  )
  local script
  for script in "${CRITICAL_SCRIPTS[@]}"; do
    if [ -f "$script" ] && [ -x "$script" ]; then
      echo "✅ $script"
    else
      echo "❌ $script 缺失或无执行权限"
      status=1
    fi
  done

  # 2. Required command-line tools.
  echo ""
  echo "🔧 依赖检查"
  DEPENDENCIES=(
    "git"
    "jq"
    "curl"
    "claude-cli"
  )
  local cmd
  for cmd in "${DEPENDENCIES[@]}"; do
    if command -v "$cmd" &> /dev/null; then
      echo "✅ $cmd"
    else
      echo "❌ $cmd 未安装"
      status=1
    fi
  done

  # 3. API reachability (warning only — does not fail the check).
  echo ""
  echo "🌐 API检查"
  if curl -s --max-time 5 https://api.anthropic.com/health &> /dev/null; then
    echo "✅ Claude API"
  else
    echo "⚠️ Claude API 无响应"
  fi

  # 4. Disk usage thresholds: <80% ok, <90% warn, otherwise fail.
  echo ""
  echo "💾 磁盘空间"
  DISK_USAGE=$(df -h ~ | awk 'NR==2 {print $5}' | sed 's/%//')
  if [ "$DISK_USAGE" -lt 80 ]; then
    echo "✅ 磁盘使用: ${DISK_USAGE}%"
  elif [ "$DISK_USAGE" -lt 90 ]; then
    echo "⚠️ 磁盘使用: ${DISK_USAGE}% (接近上限)"
  else
    echo "❌ 磁盘使用: ${DISK_USAGE}% (需要清理)"
    status=1
  fi

  # 5. Log footprint (informational only).
  echo ""
  echo "📝 日志文件"
  LOG_SIZE=$(du -sh ~/.automation/logs 2>/dev/null | cut -f1)
  echo "日志总大小: $LOG_SIZE"

  # 6. Recent failure rate from the execution monitor log.
  echo ""
  echo "📊 最近执行"
  # fix: tolerate a missing monitor log instead of spilling a tail error
  RECENT_FAILURES=$(tail -100 ~/.automation/monitor.log 2>/dev/null | grep -c "failure")
  echo "最近100次执行中失败: $RECENT_FAILURES 次"
  if [ "$RECENT_FAILURES" -gt 10 ]; then
    echo "⚠️ 失败率过高"
    status=1
  fi

  echo ""
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━"
  if [ "$status" -eq 0 ]; then
    echo "✅ 系统健康"
  else
    echo "❌ 发现问题,需要处理"
  fi
  return $status
}
# Run the health check on a schedule (crontab: 0 */6 * * * — every 6 hours)
health_check
🔧 维护策略
1. 日志管理
#!/bin/bash
# log-management.sh
# Rotation/retention settings shared by the functions below.
LOG_DIR="$HOME/.automation/logs"
MAX_LOG_SIZE_MB=100 # rotate any log larger than this
MAX_LOG_AGE_DAYS=30 # delete compressed rotated logs older than this
# Rotate any *.log over MAX_LOG_SIZE_MB: compress it to <name>.<date>.gz
# and start a fresh empty log in its place.
function rotate_logs() {
  echo "🔄 日志轮转"
  local log_file size size_mb
  # fix: NUL-delimited find + read -r -d '' so paths with spaces or
  # backslashes survive (the original unquoted while-read loop broke on them)
  while IFS= read -r -d '' log_file; do
    # fix: 'stat -f%z' is BSD/macOS-only and fails on GNU/Linux;
    # 'wc -c <file' reports the byte size portably
    size=$(wc -c < "$log_file")
    size_mb=$((size / 1024 / 1024))
    if [ "$size_mb" -gt "$MAX_LOG_SIZE_MB" ]; then
      echo "轮转: $log_file (${size_mb}MB)"
      # Compress the old log, stamp it with today's date…
      gzip "$log_file"
      mv "${log_file}.gz" "${log_file}.$(date +%Y%m%d).gz"
      # …and start a fresh empty log
      touch "$log_file"
    fi
  done < <(find "$LOG_DIR" -name "*.log" -type f -print0)
}
# Remove compressed rotated logs older than MAX_LOG_AGE_DAYS days.
function cleanup_old_logs() {
  echo "🧹 清理旧日志"
  # Rotated logs carry a ".log.<date>.gz" suffix; age is judged by mtime.
  find "$LOG_DIR" -name "*.log.*.gz" -mtime "+${MAX_LOG_AGE_DAYS}" -delete
  echo "清理完成"
}
# Summarize the logs: the ten most frequent 4th-column tokens on ERROR
# lines, then the same for WARN lines.
function analyze_logs() {
  # Helper: top-10 most common 4th-column tokens for lines matching $1.
  _top_matches() {
    grep -h "$1" "$LOG_DIR"/*.log 2>/dev/null \
      | awk '{print $4}' | sort | uniq -c | sort -rn | head -10
  }
  echo "📊 日志分析"
  echo "错误统计:"
  _top_matches "ERROR"
  echo ""
  echo "警告统计:"
  _top_matches "WARN"
}
# Run periodically (e.g. from cron): rotate, purge, then summarize logs.
rotate_logs
cleanup_old_logs
analyze_logs
2. 依赖更新
#!/bin/bash
# update-dependencies.sh
# Refresh Homebrew metadata and, when anything is outdated, list it and
# upgrade after interactive confirmation.
function update_homebrew() {
  echo "🍺 更新Homebrew包"
  # Pull the latest package metadata first
  brew update
  local outdated_pkgs
  outdated_pkgs=$(brew outdated)
  if [ -z "$outdated_pkgs" ]; then
    echo "✅ 所有包都是最新的"
    return
  fi
  echo "可更新的包:"
  echo "$outdated_pkgs"
  read -p "是否更新? (y/N): " -n 1 -r
  echo
  if [[ $REPLY =~ ^[Yy]$ ]]; then
    brew upgrade
  fi
}
# Show outdated global npm packages, then upgrade them on confirmation.
function update_npm_global() {
  echo "📦 更新全局npm包"
  npm outdated -g
  read -p "是否更新? (y/N): " -n 1 -r
  echo
  if [[ $REPLY =~ ^[Yy]$ ]]; then
    npm update -g
  fi
}
# Run the automation test suite, when one is present, to verify scripts
# still work after dependency updates.
function check_script_compatibility() {
  echo "🔍 检查脚本兼容性"
  local suite="$HOME/.automation/tests/run-all.sh"
  if [ -f "$suite" ]; then
    "$suite"
  fi
}
# Monthly update workflow: refresh packages first, then verify that the
# automation scripts still work against the new versions.
echo "🔄 月度依赖更新"
update_homebrew
update_npm_global
check_script_compatibility
3. 备份与恢复
#!/bin/bash
# backup-automation.sh
# BACKUP_DIR: where archives land; SOURCE_DIR: what gets archived.
BACKUP_DIR="$HOME/.automation-backups"
SOURCE_DIR="$HOME/.automation"
# Create a timestamped tar.gz of ~/.automation (logs and cache excluded)
# in BACKUP_DIR, then mirror it to the cloud when rclone is available.
# Returns 1 if the archive could not be created.
function create_backup() {
  local timestamp backup_file
  timestamp=$(date +%Y%m%d_%H%M%S)
  backup_file="$BACKUP_DIR/automation_${timestamp}.tar.gz"
  echo "💾 创建备份: $backup_file"
  mkdir -p "$BACKUP_DIR"
  # Exclude logs and cache — they are regenerated and bloat the archive.
  # fix: check tar's exit status instead of unconditionally printing success
  if ! tar czf "$backup_file" \
    --exclude="logs" \
    --exclude="cache" \
    --exclude="*.log" \
    -C ~ .automation; then
    echo "❌ 备份失败" >&2
    return 1
  fi
  # fix: quote $backup_file so paths with spaces do not break du
  echo "✅ 备份完成: $(du -h "$backup_file" | cut -f1)"
  # Optional off-site copy.
  if command -v rclone &> /dev/null; then
    rclone copy "$backup_file" remote:automation-backups/
    echo "☁️ 已上传到云端"
  fi
}
# Restore ~/.automation from a backup archive after explicit confirmation.
# The current configuration is moved aside (".old.<epoch>"), not deleted.
# Returns 1 when the archive does not exist; 0 on restore or cancellation.
function restore_backup() {
  local archive=$1
  local confirm
  if [ ! -f "$archive" ]; then
    echo "❌ 备份文件不存在: $archive"
    return 1
  fi
  echo "⚠️ 即将恢复备份,当前配置将被覆盖"
  read -p "确认? (yes/NO): " confirm
  if [ "$confirm" != "yes" ]; then
    echo "取消恢复"
    return 0
  fi
  # Keep the current config under a timestamped ".old" name as a safety net.
  if [ -d "$SOURCE_DIR" ]; then
    mv "$SOURCE_DIR" "${SOURCE_DIR}.old.$(date +%s)"
  fi
  tar xzf "$archive" -C ~
  echo "✅ 恢复完成"
}
# List available backup archives as "name (size)".
function list_backups() {
  echo "📋 可用备份:"
  # Skip ls's "total" header line, then print name and human-readable size.
  ls -lh "$BACKUP_DIR" | awk 'NR > 1 {print $9, "(" $5 ")"}'
}
# Retention: keep only the 10 newest automation_*.tar.gz archives.
function cleanup_old_backups() {
  local backups i
  # fix: iterate an array instead of 'ls | tail | xargs rm' so paths with
  # spaces are safe and an empty backup dir does not spill an ls error
  mapfile -t backups < <(ls -t "$BACKUP_DIR"/automation_*.tar.gz 2>/dev/null)
  for ((i = 10; i < ${#backups[@]}; i++)); do
    rm -f -- "${backups[i]}"
  done
  echo "✅ 清理旧备份完成"
}
# Entry point — subcommands: backup (default) | restore | list
case ${1:-backup} in
backup)
create_backup
cleanup_old_backups
;;
restore)
list_backups
read -p "输入备份文件名: " filename
restore_backup "$BACKUP_DIR/$filename"
;;
list)
list_backups
;;
esac
📅 维护计划
日常维护(每天)
#!/bin/bash
# daily-maintenance.sh
# Runs automatically — no human intervention required.
# NOTE(review): these functions are defined in the other maintenance
# scripts; this file must source them before running — confirm wiring.
health_check
monitor_execution
check_disk_space
周维护(每周日)
#!/bin/bash
# weekly-maintenance.sh
# Semi-automatic — the output should be reviewed by a human.
echo "📅 周维护任务"
# 1. Generate the weekly report
generate_weekly_report
# 2. Clean up logs
rotate_logs
cleanup_old_logs
# 3. Back up the configuration
create_backup
# 4. Review failed tasks
review_failures
echo "✅ 周维护完成"
月维护(每月1号)
#!/bin/bash
# monthly-maintenance.sh
echo "📅 月度维护"
# 1. Update dependencies
update_dependencies
# 2. Performance review
performance_review
# 3. Security audit
security_audit
# 4. Identify script optimization opportunities
identify_optimization_opportunities
# 5. Refresh documentation
update_documentation
echo "✅ 月度维护完成"
季度维护(每季度)
# 季度深度审查
## 目标
全面审查和优化自动化系统
## 检查清单
- [ ] 审查所有自动化脚本使用情况
- [ ] 识别未使用的脚本(考虑删除)
- [ ] 评估ROI
- [ ] 重构复杂脚本
- [ ] 更新最佳实践文档
- [ ] 团队培训(如适用)
- [ ] 规划下季度改进
## 时间投入
4-6小时
🚨 故障处理
故障响应手册
# 自动化故障响应手册
## 严重程度分级
### P0 - 紧急 (< 15分钟响应)
- 生产部署流程中断
- 数据丢失风险
- 安全漏洞
### P1 - 高 (< 1小时响应)
- 关键自动化失败
- 影响多人工作
- 性能严重下降
### P2 - 中 (< 4小时响应)
- 单个脚本失败
- 影响个人效率
- 非关键功能异常
### P3 - 低 (< 1天响应)
- 优化建议
- 文档更新
- 功能请求
## 响应流程
1. **识别**: 通过告警或手动发现
2. **评估**: 判断严重程度
3. **隔离**: 防止影响扩大
4. **修复**: 解决问题
5. **验证**: 确认修复有效
6. **总结**: 记录事后分析
## 常见问题快速修复
### API配额超限
```bash
# 1. 检查使用量
cat ~/.automation/api-usage.log | tail -100
# 2. 清理缓存,启用缓存
enable_api_cache
# 3. 等待配额重置
```
### 脚本权限错误
```bash
# 修复权限
chmod +x ~/.automation/scripts/*.sh
```
### 依赖缺失
```bash
# 重新安装依赖
./setup.sh
```
事后分析模板
# 事件事后分析
**日期**: 2025-01-04
**严重程度**: P1
**影响时长**: 2小时
## 事件概要
简要描述发生了什么
## 时间线
- 14:00 - 首次发现问题
- 14:15 - 定位到根本原因
- 15:30 - 实施修复
- 16:00 - 验证修复有效
## 根本原因
详细分析为什么会发生
## 影响范围
- 影响用户: 5人
- 影响任务: 部署流程
- 业务损失: 2小时生产力
## 解决方案
实施的临时和永久修复
## 预防措施
- [ ] 添加监控告警
- [ ] 改进错误处理
- [ ] 更新文档
- [ ] 增加测试覆盖
## 经验教训
从这次事件中学到了什么
记住: 好的监控和维护让自动化系统长期可靠运行。投资于维护,避免技术债务!