
# Monitoring and Maintenance

Monitoring and maintenance are what keep an automation system running reliably over the long term.

## 📊 Monitoring System

### The Three-Layer Monitoring Model

```
┌────────────────────────────────────────┐
│ Business-level monitoring              │
│ (Did the automation achieve its goal?) │
└────────────────────────────────────────┘

┌────────────────────────────────────────┐
│ Application-level monitoring           │
│ (Are the scripts running correctly?)   │
└────────────────────────────────────────┘

┌────────────────────────────────────────┐
│ Infrastructure-level monitoring        │
│ (Are system resources sufficient?)     │
└────────────────────────────────────────┘
```
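
The scripts in the rest of this page roughly map onto these layers. As a minimal sketch of how a single dispatcher could run one check per layer and aggregate the results (all three script paths, and `business-kpi-check.sh` in particular, are assumptions for illustration; substitute whatever checks you actually have):

```bash
#!/bin/bash
# layered-check.sh - run one check per monitoring layer and aggregate the result.
# The three script paths below are assumptions; adjust them to your setup.

overall=0

run_layer() {
    local layer=$1
    local check=$2

    if "$check"; then
        echo "✅ $layer layer OK"
    else
        echo "❌ $layer layer check failed"
        overall=1
    fi
}

# Infrastructure layer: are system resources sufficient?
run_layer "Infrastructure" "$HOME/.automation/scripts/health-check.sh"
# Application layer: are the scripts running correctly?
run_layer "Application" "$HOME/.automation/scripts/execution-monitor.sh"
# Business layer: did the automation achieve its goal? (e.g. a KPI or report check)
run_layer "Business" "$HOME/.automation/scripts/business-kpi-check.sh"

exit "$overall"
```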

### 1. Execution Monitoring

```bash
#!/bin/bash
# execution-monitor.sh

MONITOR_LOG="$HOME/.automation/monitor.log"
ALERT_THRESHOLD=3   # alert after 3 consecutive failures

# Make sure the state directory exists
mkdir -p "$HOME/.automation"

function monitor_execution() {
    local script=$1
    local start_time=$(date +%s)

    # Run the script with a 5-minute timeout (on macOS, `timeout` comes from GNU coreutils)
    if timeout 300 "$script"; then
        local end_time=$(date +%s)
        local duration=$((end_time - start_time))

        log_success "$script" "$duration"
        reset_failure_count "$script"
    else
        local exit_code=$?
        log_failure "$script" "$exit_code"
        check_alert "$script"
    fi
}

function log_success() {
    local script=$1
    local duration=$2

    echo "$(date +%s),success,$script,$duration" >> "$MONITOR_LOG"
}

function log_failure() {
    local script=$1
    local exit_code=$2

    echo "$(date +%s),failure,$script,$exit_code" >> "$MONITOR_LOG"

    # Increment the failure counter
    local count_file="$HOME/.automation/failure_count_$(basename "$script")"
    local count=$(cat "$count_file" 2>/dev/null || echo 0)
    echo $((count + 1)) > "$count_file"
}

function reset_failure_count() {
    local script=$1
    local count_file="$HOME/.automation/failure_count_$(basename "$script")"
    echo 0 > "$count_file"
}

function check_alert() {
    local script=$1
    local count_file="$HOME/.automation/failure_count_$(basename "$script")"
    local count=$(cat "$count_file" 2>/dev/null || echo 0)

    if [ "$count" -ge "$ALERT_THRESHOLD" ]; then
        send_alert "$script" "$count"
    fi
}

function send_alert() {
    local script=$1
    local count=$2

    # Desktop notification (macOS)
    osascript -e "display notification \"$script failed $count times in a row\" with title \"🚨 Automation alert\""

    # Slack notification (if configured)
    if [ -n "$SLACK_WEBHOOK" ]; then
        curl -X POST "$SLACK_WEBHOOK" \
            -H 'Content-Type: application/json' \
            -d "{\"text\": \"⚠️ $script failed $count times in a row\"}"
    fi

    # Email notification
    echo "$script has failed $count consecutive times, please investigate" | mail -s "Automation alert" admin@company.com
}
```
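
The script above only defines functions. One way to use it is to source it from a thin wrapper and point cron at that wrapper; a sketch, assuming your scripts live under `~/.automation/scripts` (the wrapper name `run-monitored.sh` is illustrative):

```bash
#!/bin/bash
# run-monitored.sh - execute any script under monitor_execution.
# Usage:   run-monitored.sh /path/to/script.sh
# Crontab: 0 2 * * * $HOME/.automation/scripts/run-monitored.sh $HOME/.automation/scripts/backup.sh

source "$HOME/.automation/scripts/execution-monitor.sh"

if [ $# -lt 1 ]; then
    echo "Usage: $0 <script>" >&2
    exit 1
fi

monitor_execution "$1"
```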

### 2. Performance Monitoring

```python
#!/usr/bin/env python3
# performance-monitor.py

import os
import time
import psutil
import sqlite3


class PerformanceMonitor:
    def __init__(self):
        # sqlite3 does not expand "~", so resolve the path explicitly
        db_path = os.path.expanduser('~/.automation/performance.db')
        os.makedirs(os.path.dirname(db_path), exist_ok=True)
        self.db = sqlite3.connect(db_path)
        self.setup_db()

    def setup_db(self):
        self.db.execute('''
            CREATE TABLE IF NOT EXISTS metrics (
                timestamp INTEGER,
                script TEXT,
                duration_ms INTEGER,
                cpu_percent REAL,
                memory_mb REAL,
                exit_code INTEGER
            )
        ''')

    def monitor(self, script_name, func):
        """Run func and record its duration, CPU, and memory usage."""
        process = psutil.Process()

        # Start measuring: calling cpu_percent() here resets psutil's counter
        start_time = time.time()
        process.cpu_percent()
        start_mem = process.memory_info().rss / 1024 / 1024

        try:
            result = func()
            exit_code = 0
        except Exception:
            exit_code = 1
            result = None

        # Stop measuring
        end_time = time.time()
        duration_ms = int((end_time - start_time) * 1000)
        cpu_used = process.cpu_percent()   # average CPU% since the reset above
        end_mem = process.memory_info().rss / 1024 / 1024

        # Record the metrics
        self.db.execute('''
            INSERT INTO metrics VALUES (?, ?, ?, ?, ?, ?)
        ''', (
            int(time.time()),
            script_name,
            duration_ms,
            cpu_used,
            end_mem - start_mem,
            exit_code
        ))
        self.db.commit()

        return result

    def report(self, script_name=None, days=7):
        """Print a performance report for the last `days` days."""
        cutoff = int(time.time()) - (days * 86400)

        query = '''
            SELECT
                script,
                COUNT(*) as executions,
                AVG(duration_ms) as avg_duration,
                MAX(duration_ms) as max_duration,
                AVG(cpu_percent) as avg_cpu,
                AVG(memory_mb) as avg_memory,
                SUM(CASE WHEN exit_code != 0 THEN 1 ELSE 0 END) as failures
            FROM metrics
            WHERE timestamp > ?
        '''

        if script_name:
            query += ' AND script = ?'
            params = (cutoff, script_name)
        else:
            query += ' GROUP BY script'
            params = (cutoff,)

        cursor = self.db.execute(query, params)

        print(f"📊 Performance report (last {days} days)")
        print("=" * 80)
        print(f"{'Script':<30} {'Runs':<10} {'Avg time':<12} {'Failures':<10}")
        print("-" * 80)

        for row in cursor:
            script, executions, avg_duration, max_duration, avg_cpu, avg_memory, failures = row
            print(f"{script:<30} {executions:<10} {avg_duration / 1000:<11.2f}s {failures:<10}")


# Usage
monitor = PerformanceMonitor()

def my_automation():
    # Automation logic goes here
    pass

monitor.monitor('my-automation', my_automation)
monitor.report(days=7)
```
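
Because the metrics land in a plain SQLite file, they can also be inspected ad hoc from the shell. A sketch using the sqlite3 CLI (assuming it is installed) to list the slowest scripts over the past week:

```bash
# Slowest scripts over the last 7 days, straight from the metrics table.
sqlite3 -header -column "$HOME/.automation/performance.db" "
  SELECT script,
         COUNT(*)                            AS runs,
         ROUND(AVG(duration_ms) / 1000.0, 2) AS avg_s,
         ROUND(MAX(duration_ms) / 1000.0, 2) AS max_s
  FROM metrics
  WHERE timestamp > strftime('%s', 'now') - 7 * 86400
  GROUP BY script
  ORDER BY avg_s DESC
  LIMIT 10;
"
```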

### 3. Health Checks

```bash
#!/bin/bash
# health-check.sh

function health_check() {
    local status=0

    echo "🏥 Health check"
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━"

    # 1. Check the critical scripts
    echo ""
    echo "📋 Script check"
    CRITICAL_SCRIPTS=(
        "$HOME/.automation/scripts/backup.sh"
        "$HOME/.automation/scripts/deploy.sh"
        "$HOME/.automation/scripts/monitor.sh"
    )

    for script in "${CRITICAL_SCRIPTS[@]}"; do
        if [ -f "$script" ] && [ -x "$script" ]; then
            echo "✅ $script"
        else
            echo "❌ $script is missing or not executable"
            status=1
        fi
    done

    # 2. Check dependencies
    echo ""
    echo "🔧 Dependency check"
    DEPENDENCIES=(
        "git"
        "jq"
        "curl"
        "claude-cli"
    )

    for cmd in "${DEPENDENCIES[@]}"; do
        if command -v "$cmd" &> /dev/null; then
            echo "✅ $cmd"
        else
            echo "❌ $cmd is not installed"
            status=1
        fi
    done

    # 3. Check API availability
    echo ""
    echo "🌐 API check"

    if curl -s --max-time 5 https://api.anthropic.com/health &> /dev/null; then
        echo "✅ Claude API"
    else
        echo "⚠️ Claude API not responding"
    fi

    # 4. Check disk space
    echo ""
    echo "💾 Disk space"
    DISK_USAGE=$(df -h ~ | awk 'NR==2 {print $5}' | sed 's/%//')

    if [ "$DISK_USAGE" -lt 80 ]; then
        echo "✅ Disk usage: ${DISK_USAGE}%"
    elif [ "$DISK_USAGE" -lt 90 ]; then
        echo "⚠️ Disk usage: ${DISK_USAGE}% (approaching the limit)"
    else
        echo "❌ Disk usage: ${DISK_USAGE}% (cleanup needed)"
        status=1
    fi

    # 5. Check log file size
    echo ""
    echo "📝 Log files"
    LOG_SIZE=$(du -sh ~/.automation/logs 2>/dev/null | cut -f1)
    echo "Total log size: $LOG_SIZE"

    # 6. Check recent failures
    echo ""
    echo "📊 Recent executions"
    RECENT_FAILURES=$(tail -100 ~/.automation/monitor.log 2>/dev/null | grep -c "failure")
    echo "Failures in the last 100 executions: $RECENT_FAILURES"

    if [ "$RECENT_FAILURES" -gt 10 ]; then
        echo "⚠️ Failure rate is too high"
        status=1
    fi

    echo ""
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━"

    if [ "$status" -eq 0 ]; then
        echo "✅ System healthy"
    else
        echo "❌ Issues found, action required"
    fi

    return $status
}

# Run periodically via cron (crontab: 0 */6 * * *)
health_check
```
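
To schedule the check every six hours as the comment suggests, one option is to append an entry to your crontab; a sketch (the log path is an assumption):

```bash
# Run the health check every 6 hours and keep the output for later review.
( crontab -l 2>/dev/null
  echo '0 */6 * * * $HOME/.automation/scripts/health-check.sh >> $HOME/.automation/logs/health-check.log 2>&1'
) | crontab -
```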

## 🔧 Maintenance Strategy

### 1. Log Management

```bash
#!/bin/bash
# log-management.sh

LOG_DIR="$HOME/.automation/logs"
MAX_LOG_SIZE_MB=100
MAX_LOG_AGE_DAYS=30

function rotate_logs() {
    echo "🔄 Rotating logs"

    find "$LOG_DIR" -name "*.log" -type f | while IFS= read -r log_file; do
        # Check the file size (BSD/macOS stat; GNU stat uses -c%s)
        size=$(stat -f%z "$log_file")
        size_mb=$((size / 1024 / 1024))

        if [ "$size_mb" -gt "$MAX_LOG_SIZE_MB" ]; then
            echo "Rotating: $log_file (${size_mb}MB)"

            # Compress the old log
            gzip "$log_file"
            mv "${log_file}.gz" "${log_file}.$(date +%Y%m%d).gz"

            # Start a fresh log
            touch "$log_file"
        fi
    done
}

function cleanup_old_logs() {
    echo "🧹 Cleaning up old logs"

    # Delete rotated logs older than 30 days
    find "$LOG_DIR" -name "*.log.*.gz" -mtime +$MAX_LOG_AGE_DAYS -delete

    echo "Cleanup complete"
}

function analyze_logs() {
    echo "📊 Log analysis"

    # Error statistics
    echo "Top errors:"
    grep -h "ERROR" "$LOG_DIR"/*.log 2>/dev/null | \
        awk '{print $4}' | sort | uniq -c | sort -rn | head -10

    # Warning statistics
    echo ""
    echo "Top warnings:"
    grep -h "WARN" "$LOG_DIR"/*.log 2>/dev/null | \
        awk '{print $4}' | sort | uniq -c | sort -rn | head -10
}

# Run periodically
rotate_logs
cleanup_old_logs
analyze_logs
```

### 2. Dependency Updates

```bash
#!/bin/bash
# update-dependencies.sh

function update_homebrew() {
    echo "🍺 Updating Homebrew packages"

    # Refresh Homebrew itself
    brew update

    # List packages with available updates
    OUTDATED=$(brew outdated)

    if [ -n "$OUTDATED" ]; then
        echo "Packages with updates available:"
        echo "$OUTDATED"

        read -p "Upgrade now? (y/N): " -n 1 -r
        echo
        if [[ $REPLY =~ ^[Yy]$ ]]; then
            brew upgrade
        fi
    else
        echo "✅ All packages are up to date"
    fi
}

function update_npm_global() {
    echo "📦 Updating global npm packages"

    npm outdated -g

    read -p "Update now? (y/N): " -n 1 -r
    echo
    if [[ $REPLY =~ ^[Yy]$ ]]; then
        npm update -g
    fi
}

function check_script_compatibility() {
    echo "🔍 Checking script compatibility"

    # Run the test suite if one exists
    if [ -f "$HOME/.automation/tests/run-all.sh" ]; then
        "$HOME/.automation/tests/run-all.sh"
    fi
}

# Monthly update flow
echo "🔄 Monthly dependency update"
update_homebrew
update_npm_global
check_script_compatibility
```

### 3. Backup and Restore

```bash
#!/bin/bash
# backup-automation.sh

BACKUP_DIR="$HOME/.automation-backups"
SOURCE_DIR="$HOME/.automation"

function create_backup() {
    local timestamp=$(date +%Y%m%d_%H%M%S)
    local backup_file="$BACKUP_DIR/automation_${timestamp}.tar.gz"

    echo "💾 Creating backup: $backup_file"

    mkdir -p "$BACKUP_DIR"

    # Exclude logs and caches
    tar czf "$backup_file" \
        --exclude="logs" \
        --exclude="cache" \
        --exclude="*.log" \
        -C ~ .automation

    echo "✅ Backup complete: $(du -h "$backup_file" | cut -f1)"

    # Upload to cloud storage
    if command -v rclone &> /dev/null; then
        rclone copy "$backup_file" remote:automation-backups/
        echo "☁️ Uploaded to cloud storage"
    fi
}

function restore_backup() {
    local backup_file=$1

    if [ ! -f "$backup_file" ]; then
        echo "❌ Backup file not found: $backup_file"
        return 1
    fi

    echo "⚠️ About to restore a backup; the current configuration will be overwritten"
    read -p "Confirm? (yes/NO): " confirm

    if [ "$confirm" != "yes" ]; then
        echo "Restore cancelled"
        return 0
    fi

    # Set the current configuration aside
    if [ -d "$SOURCE_DIR" ]; then
        mv "$SOURCE_DIR" "${SOURCE_DIR}.old.$(date +%s)"
    fi

    # Restore
    tar xzf "$backup_file" -C ~

    echo "✅ Restore complete"
}

function list_backups() {
    echo "📋 Available backups:"
    ls -lh "$BACKUP_DIR" | tail -n +2 | awk '{print $9, "(" $5 ")"}'
}

function cleanup_old_backups() {
    # Keep only the 10 most recent backups
    ls -t "$BACKUP_DIR"/automation_*.tar.gz 2>/dev/null | tail -n +11 | xargs rm -f
    echo "✅ Old backups cleaned up"
}

# Usage
case ${1:-backup} in
    backup)
        create_backup
        cleanup_old_backups
        ;;
    restore)
        list_backups
        read -p "Enter the backup file name: " filename
        restore_backup "$BACKUP_DIR/$filename"
        ;;
    list)
        list_backups
        ;;
esac
```
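
Backups are only useful if they can actually be restored. A small sketch that sanity-checks an archive before you rely on it (the list of expected paths is an assumption about your layout):

```bash
#!/bin/bash
# verify-backup.sh - sanity-check a backup archive before trusting it.
# Usage: verify-backup.sh "$HOME/.automation-backups/automation_20250101_020000.tar.gz"

backup_file=$1

# 1. The archive must be readable and unpack cleanly.
if ! tar tzf "$backup_file" > /dev/null; then
    echo "❌ Archive is corrupt or unreadable: $backup_file"
    exit 1
fi

# 2. It should contain the directories we care about (assumed layout).
for path in ".automation/scripts" ".automation/config"; do
    if ! tar tzf "$backup_file" | grep -q "^${path}/"; then
        echo "⚠️ Missing from backup: $path"
    fi
done

echo "✅ Backup looks restorable: $backup_file"
```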

## 📅 Maintenance Schedule

### Daily Maintenance (every day)

```bash
#!/bin/bash
# daily-maintenance.sh

# Runs fully automatically; no human intervention required.
# (These functions are assumed to be sourced from the scripts earlier on this page.)
health_check
monitor_execution
check_disk_space
```

### Weekly Maintenance (every Sunday)

```bash
#!/bin/bash
# weekly-maintenance.sh

# Semi-automatic: the results need human review
echo "📅 Weekly maintenance tasks"

# 1. Generate the weekly report
generate_weekly_report

# 2. Clean up logs
rotate_logs
cleanup_old_logs

# 3. Back up the configuration
create_backup

# 4. Review failed tasks
review_failures

echo "✅ Weekly maintenance complete"
```

### Monthly Maintenance (1st of the month)

```bash
#!/bin/bash
# monthly-maintenance.sh

echo "📅 Monthly maintenance"

# 1. Update dependencies
update_dependencies

# 2. Performance review
performance_review

# 3. Security audit
security_audit

# 4. Script optimization
identify_optimization_opportunities

# 5. Documentation updates
update_documentation

echo "✅ Monthly maintenance complete"
```

### Quarterly Maintenance (once per quarter)

```markdown
# Quarterly Deep Review

## Goal
Comprehensively review and optimize the automation system.

## Checklist
- [ ] Review how every automation script is being used
- [ ] Identify unused scripts (candidates for removal)
- [ ] Evaluate ROI
- [ ] Refactor overly complex scripts
- [ ] Update the best-practices documentation
- [ ] Team training (if applicable)
- [ ] Plan next quarter's improvements

## Time investment
4-6 hours
```
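
The "identify unused scripts" item in the checklist above can be answered from data you already collect: monitor.log records every monitored run. A sketch, assuming scripts are logged under the same paths they live at and using the CSV format written by execution-monitor.sh:

```bash
# List scripts that have not been executed (via the monitor) in the last 90 days.
CUTOFF=$(( $(date +%s) - 90 * 86400 ))

for script in "$HOME"/.automation/scripts/*.sh; do
    # monitor.log lines look like: <epoch>,<status>,<script>,<duration-or-exit-code>
    last_run=$(grep -F ",$script," "$HOME/.automation/monitor.log" 2>/dev/null | tail -1 | cut -d, -f1)

    if [ -z "$last_run" ]; then
        echo "🗑 Never executed via the monitor: $script"
    elif [ "$last_run" -lt "$CUTOFF" ]; then
        echo "🗑 Last executed $(( ($(date +%s) - last_run) / 86400 )) days ago: $script"
    fi
done
```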

## 🚨 Incident Handling

### Incident Response Runbook

````markdown
# Automation Incident Response Runbook

## Severity Levels

### P0 - Critical (respond within 15 minutes)
- Production deployment pipeline broken
- Risk of data loss
- Security vulnerability

### P1 - High (respond within 1 hour)
- Critical automation failing
- Multiple people's work affected
- Severe performance degradation

### P2 - Medium (respond within 4 hours)
- A single script failing
- Individual productivity affected
- Non-critical feature misbehaving

### P3 - Low (respond within 1 day)
- Optimization suggestions
- Documentation updates
- Feature requests

## Response Flow

1. **Detect**: via an alert or manual discovery
2. **Assess**: determine the severity level
3. **Contain**: prevent the impact from spreading
4. **Fix**: resolve the problem
5. **Verify**: confirm the fix is effective
6. **Review**: write up a postmortem

## Quick Fixes for Common Issues

### API quota exceeded
```bash
# 1. Check usage
cat ~/.automation/api-usage.log | tail -100

# 2. Clear the cache and enable caching
enable_api_cache

# 3. Wait for the quota to reset
```

### Script permission errors
```bash
# Fix permissions
chmod +x ~/.automation/scripts/*.sh
```

### Missing dependencies
```bash
# Reinstall dependencies
./setup.sh
```
````

### Postmortem Template

```markdown
# Incident Postmortem

**Date**: 2025-01-04
**Severity**: P1
**Duration of impact**: 2 hours

## Summary
Briefly describe what happened

## Timeline
- 14:00 - Issue first detected
- 14:15 - Root cause identified
- 15:30 - Fix implemented
- 16:00 - Fix verified

## Root Cause
Analyze in detail why it happened

## Impact
- Users affected: 5
- Tasks affected: deployment pipeline
- Business cost: 2 hours of lost productivity

## Resolution
The temporary and permanent fixes that were applied

## Prevention
- [ ] Add monitoring alerts
- [ ] Improve error handling
- [ ] Update documentation
- [ ] Increase test coverage

## Lessons Learned
What we learned from this incident
```

Remember: good monitoring and maintenance are what keep an automation system reliable over the long run. Invest in maintenance and avoid accumulating technical debt!