# automated_task_monitor/monitor/tasks.py
import psutil
import time
from .models import HighCPUProcess, HighGPUProcess, HighMemoryProcess
import GPUtil # 需要安装 gputil
from django.utils import timezone
import os
import logging
from datetime import datetime
from django.db import models
from django.conf import settings
def setup_logger(pid, resource_type):
    """Create (or reuse) a dedicated file logger for one monitored process.

    Args:
        pid: Process ID being monitored; used in the logger name and log
            file name.
        resource_type: Resource category ('cpu', 'gpu' or 'memory'); becomes
            a sub-directory under logs/process_monitor.

    Returns:
        (logger, log_file): the configured ``logging.Logger`` and the
        absolute path of the log file it writes to.
    """
    # `settings` and `os` come from the module-level imports; the previous
    # version re-imported both locally for no reason.
    # One file per process per day, grouped by resource type.
    log_dir = os.path.join(settings.BASE_DIR, 'logs', 'process_monitor', resource_type)
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.abspath(
        os.path.join(log_dir, f'process_{pid}_{datetime.now().strftime("%Y%m%d")}.log')
    )

    logger = logging.getLogger(f'{resource_type}_process_{pid}')
    logger.setLevel(logging.INFO)
    # getLogger returns a singleton per name; attach the handler only once so
    # repeated calls for the same pid do not duplicate every log line.
    if not logger.handlers:
        # utf-8 so the CJK log text is written correctly on any platform.
        handler = logging.FileHandler(log_file, encoding='utf-8')
        handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logger.addHandler(handler)
    return logger, log_file
def get_process_gpu_usage(pid):
    """Query nvidia-smi for one process's GPU utilisation and memory use.

    Args:
        pid: Process ID to look up.

    Returns:
        (gpu_util, gpu_memory_mb, status): the highest GPU utilisation
        percentage across all GPUs (only meaningful while the process holds
        GPU memory), the total GPU memory the process uses in MB, and a short
        Chinese status string ("正常", "未使用GPU", "驱动未安装", ...) that
        callers map to display text. Both numbers are 0 on any failure.
    """
    try:
        import subprocess
        import shutil
        import logging
        logger = logging.getLogger('gpu_check')

        # --- locate the NVIDIA tooling -----------------------------------
        # shutil.which is portable (the old `which` subprocess probe does not
        # exist on Windows and costs an extra fork on Linux).
        nvidia_smi = shutil.which('nvidia-smi')
        if nvidia_smi is None:
            logger.error("nvidia-smi 命令错误: nvidia-smi not found on PATH")
            return 0, 0, "驱动未安装"
        logger.info(f"找到 nvidia-smi: {nvidia_smi}")

        try:
            gpu_list = subprocess.check_output(['nvidia-smi', '-L']).decode('utf-8').strip()
            logger.info(f"检测到的GPU:\n{gpu_list}")
            if not gpu_list:
                logger.warning("未检测到NVIDIA GPU")
                return 0, 0, "无GPU"
        except (subprocess.SubprocessError, FileNotFoundError) as e:
            logger.error(f"nvidia-smi 命令错误: {str(e)}")
            return 0, 0, "驱动未安装"

        try:
            # Per-GPU figures: "index, utilisation %, memory used".
            cmd = ['nvidia-smi', '--format=csv,noheader,nounits', '--query-gpu=index,utilization.gpu,memory.used']
            gpu_output = subprocess.check_output(cmd, timeout=5).decode('utf-8').strip()
            logger.info(f"GPU基本信息:\n{gpu_output}")

            # Per-process figures: "gpu_uuid, pid, used_memory".
            cmd_process = ['nvidia-smi', '--format=csv,noheader,nounits', '--query-compute-apps=gpu_uuid,pid,used_memory']
            process_output = subprocess.check_output(cmd_process, timeout=5).decode('utf-8').strip()
            logger.info(f"GPU进程信息:\n{process_output}")

            # No compute process at all is using any GPU.
            if not process_output:
                logger.info(f"进程 {pid} 未使用GPU")
                return 0, 0, "未使用GPU"

            total_memory = 0
            gpu_util = 0
            for line in process_output.split('\n'):
                try:
                    _, line_pid, memory = line.split(',')
                    # Compare the PID field exactly. The previous substring
                    # test (`str(pid) in line`) also matched e.g. pid 123
                    # against pid 1234 or against memory values.
                    if int(line_pid.strip()) != int(pid):
                        continue
                    total_memory += float(memory.strip())
                except (ValueError, IndexError) as e:
                    logger.error(f"解析GPU进程数据错误: {str(e)}, 数据: {line}")
                    continue

            if total_memory > 0:
                # Report the busiest GPU's utilisation (per-process GPU
                # utilisation is not available from this query).
                for line in gpu_output.split('\n'):
                    try:
                        _, util, _ = line.split(',')
                        gpu_util = max(gpu_util, float(util.strip()))
                    except (ValueError, IndexError) as e:
                        logger.error(f"解析GPU使用率错误: {str(e)}, 数据: {line}")
                        continue
                logger.info(
                    f"进程 {pid} GPU使用情况:\n"
                    f"├─ GPU使用率: {gpu_util:.1f}%\n"
                    f"└─ 显存使用: {total_memory:.1f}MB"
                )
                return gpu_util, total_memory, "正常"
            else:
                logger.info(f"进程 {pid} 未使用GPU")
                return 0, 0, "未使用GPU"
        except subprocess.TimeoutExpired as e:
            logger.error(f"GPU命令超时: {str(e)}")
            return 0, 0, "获取超时"
        except subprocess.CalledProcessError as e:
            logger.error(f"GPU命令执行错误: {str(e)}, 输出: {e.output.decode('utf-8') if e.output else 'None'}")
            return 0, 0, "命令错误"
        except Exception as e:
            logger.error(f"获取GPU信息时发生错误: {str(e)}")
            return 0, 0, "获取错误"
    except Exception as e:
        logger.error(f"GPU检测失败: {str(e)}")
        return 0, 0, "检测失败"
def monitor_process(pid, resource_type):
    """Monitor one process's resource usage until it exits or is stopped.

    Creates a monitor record for the model matching ``resource_type``, then
    loops: honours manual stops (record deactivated or deleted), checks the
    target is still alive, samples CPU / memory / GPU / IO / network data via
    psutil and nvidia-smi, persists the headline figures on the record and
    writes a detailed snapshot to a per-process log file, then sleeps
    MONITOR_INTERVAL seconds. On every exit path the record is marked
    inactive.

    Args:
        pid: PID of the process to monitor.
        resource_type: 'cpu', 'gpu' or 'memory'; selects the Django model.
    """
    # Sampling interval (seconds) from Django settings; defaults to 60.
    # `settings` is already imported at module level.
    MONITOR_INTERVAL = getattr(settings, 'MONITOR_INTERVAL', 60)

    logger, log_file = setup_logger(pid, resource_type)
    monitor = None
    try:
        process = psutil.Process(pid)
        logger.info(
            f"开始监控进程:\n"
            f"├─ 进程名称: {process.name()}\n"
            f"├─ 进程ID: {pid}\n"
            f"├─ 监控类型: {resource_type}\n"
            f"├─ 监控间隔: {MONITOR_INTERVAL}\n"
            f"└─ 开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        )

        # Pick the model class that stores this kind of monitor record.
        ModelClass = {
            'cpu': HighCPUProcess,
            'gpu': HighGPUProcess,
            'memory': HighMemoryProcess
        }.get(resource_type)
        if not ModelClass:
            logger.error(f"未知的资源类型: {resource_type}")
            return

        monitor = ModelClass.objects.create(
            pid=pid,
            process_name=process.name(),
            log_path=log_file,
            status=1,
            is_active=True
        )
        logger.info("创建新的监控记录")

        # Name the worker thread after its target so it is easy to find;
        # `do_run` doubles as a cooperative stop flag.
        import threading
        current_thread = threading.current_thread()
        current_thread.name = f'monitor_{pid}_{resource_type}'
        current_thread.do_run = True

        # psutil status -> record status (1 = alive/normal, 0 = gone).
        # Hoisted out of the loop: these tables never change per iteration.
        status_code_map = {
            psutil.STATUS_RUNNING: 1,
            psutil.STATUS_SLEEPING: 1,
            psutil.STATUS_DISK_SLEEP: 1,
            psutil.STATUS_STOPPED: 0,
            psutil.STATUS_TRACING_STOP: 0,
            psutil.STATUS_ZOMBIE: 0,
            psutil.STATUS_DEAD: 0,
            psutil.STATUS_WAKING: 1,
            psutil.STATUS_IDLE: 1,
        }
        # Short status codes returned by get_process_gpu_usage -> display text.
        gpu_status_texts = {
            "无GPU": "未检测到GPU",
            "驱动未安装": "GPU驱动未安装",
            "未使用GPU": "进程未使用GPU",
            "获取超时": "GPU信息获取超时",
            "命令错误": "GPU命令执行错误",
            "获取错误": "GPU信息获取错误",
            "检测失败": "GPU检测失败",
            "正常": "正常"
        }

        while current_thread.do_run:
            try:
                # A manual stop flips is_active on the record (or deletes it).
                try:
                    monitor.refresh_from_db()
                    if not monitor.is_active:
                        logger.info("监控被手动停止")
                        break
                except ModelClass.DoesNotExist:
                    logger.error("监控记录已被删除")
                    break

                if not process.is_running():
                    logger.warning(f"进程 {pid} 已终止")
                    monitor.status = 0
                    monitor.is_active = False
                    monitor.save()
                    break

                process_status = process.status()
                status_map = status_code_map.get(process_status, 1)  # unknown -> treat as alive
                monitor.status = status_map

                logger.info(
                    f"进程状态:\n"
                    f"├─ 状态码: {status_map}\n"
                    f"├─ 状态描述: {process_status}\n"
                    f"├─ 监控状态: {'活跃' if monitor.is_active else '已停止'}\n"
                    f"└─ 运行时长: {datetime.now() - datetime.fromtimestamp(process.create_time())}"
                )

                # oneshot() caches process info so the burst of reads is cheap.
                with process.oneshot():
                    # CPU
                    cpu_percent = process.cpu_percent()
                    cpu_times = process.cpu_times()
                    cpu_num = psutil.cpu_count()
                    cpu_freq = psutil.cpu_freq()
                    # Memory
                    memory_info = process.memory_info()
                    memory_percent = process.memory_percent()
                    memory_maps = len(process.memory_maps())
                    virtual_memory = psutil.virtual_memory()
                    swap_memory = psutil.swap_memory()
                    # GPU (best effort; helper already degrades gracefully)
                    try:
                        gpu_usage, gpu_memory, gpu_status = get_process_gpu_usage(pid)
                        gpu_status_text = gpu_status_texts.get(gpu_status, "未知状态")
                    except Exception as e:
                        logger.error(f"获取GPU信息失败: {str(e)}")
                        gpu_usage, gpu_memory, gpu_status_text = 0, 0, "异常"

                    logger.info(
                        f"GPU信息\n"
                        f"├─ 状态: {gpu_status_text}\n"
                        f"├─ 使用率: {gpu_usage:.1f}%\n"
                        f"└─ 显存使用: {gpu_memory:.1f}MB"
                    )

                    # IO (may be denied on other users' processes)
                    try:
                        io_counters = process.io_counters()
                        disk_io = psutil.disk_io_counters()  # collected but currently unused
                    except (psutil.AccessDenied, AttributeError):
                        io_counters = None
                        disk_io = None
                    # Network
                    try:
                        net_connections = len(process.connections())
                        net_io = psutil.net_io_counters()
                    except (psutil.AccessDenied, AttributeError):
                        net_connections = 0
                        net_io = None
                    # Misc (num_fds is POSIX-only)
                    num_threads = process.num_threads()
                    num_fds = process.num_fds() if hasattr(process, 'num_fds') else 0
                    ctx_switches = process.num_ctx_switches()

                # Persist the headline figures.
                monitor.cpu_usage = cpu_percent
                monitor.memory_usage = memory_info.rss / (1024 * 1024 * 1024)  # GB
                monitor.virtual_memory = memory_info.vms / (1024 * 1024 * 1024)  # GB
                monitor.gpu_usage = gpu_usage
                monitor.gpu_memory = gpu_memory

                # Build the snapshot line by line. The previous version mixed
                # implicit f-string concatenation with conditional expressions
                # ( f"..." if io_counters else "..." ); the ternary captured
                # the whole concatenated prefix, silently dropping most of the
                # message whenever IO or network data was unavailable.
                report = [
                    f"资源使用情况 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}:",
                    "├─ CPU信息",
                    f"│ ├─ 使用率: {cpu_percent:.1f}%",
                    f"│ ├─ 用户态时间: {cpu_times.user:.1f}s",
                    f"│ ├─ 内核态时间: {cpu_times.system:.1f}s",
                    f"│ ├─ CPU核心数: {cpu_num}",
                    f"│ ├─ CPU频率: {cpu_freq.current:.1f}MHz",
                    f"│ └─ 上下文切换: {ctx_switches.voluntary}/{ctx_switches.involuntary}",
                    "├─ 内存信息",
                    f"│ ├─ 物理内存: {memory_info.rss/1024/1024:.1f}MB ({memory_percent:.1f}%)",
                    f"│ ├─ 虚拟内存: {memory_info.vms/1024/1024:.1f}MB",
                    f"│ ├─ 内存映射: {memory_maps}",
                    f"│ ├─ 系统内存使用: {virtual_memory.percent:.1f}%",
                    f"│ └─ 交换空间使用: {swap_memory.percent:.1f}%",
                    "├─ GPU信息",
                    f"│ ├─ 状态: {gpu_status_text}",
                    f"│ ├─ 使用率: {gpu_usage:.1f}%",
                    f"│ └─ 显存使用: {gpu_memory:.1f}MB",
                    "├─ IO信息",
                ]
                if io_counters:
                    report.append(f"│ ├─ 读取: {io_counters.read_bytes/1024/1024:.1f}MB ({io_counters.read_count}次)")
                    report.append(f"│ └─ 写入: {io_counters.write_bytes/1024/1024:.1f}MB ({io_counters.write_count}次)")
                else:
                    report.append("│ ├─ 读取: 无法获取")
                    report.append("│ └─ 写入: 无法获取")
                report.append("├─ 网络信息")
                report.append(f"│ ├─ 连接数: {net_connections}")
                if net_io:
                    report.append(f"│ ├─ 发送: {net_io.bytes_sent/1024/1024:.1f}MB")
                    report.append(f"│ └─ 接收: {net_io.bytes_recv/1024/1024:.1f}MB")
                else:
                    report.append("│ ├─ 发送: 无法获取")
                    report.append("│ └─ 接收: 无法获取")
                report.append("└─ 其他信息")
                report.append(f" ├─ 线程数: {num_threads}")
                report.append(f" ├─ 文件描述符: {num_fds}")
                report.append(f" └─ 子进程数: {len(process.children())}")
                logger.info("\n".join(report))

                # Process became dead/zombie/stopped: finish monitoring.
                if status_map == 0:
                    monitor.is_active = False
                    monitor.save()
                    logger.info(f"进程状态变为 {process_status},停止监控")
                    break

                monitor.save()
                time.sleep(MONITOR_INTERVAL)
            except Exception as e:
                # Keep the monitor alive across transient sampling errors.
                logger.error(f"监控出错: {str(e)}")
                logger.exception("详细错误信息:")
                time.sleep(5)
                continue
    except Exception as e:
        logger.error(f"监控初始化失败: {str(e)}")
        logger.exception("详细错误信息:")
    finally:
        # Best-effort cleanup: mark the record inactive whatever ended the loop.
        if monitor:
            try:
                monitor.refresh_from_db()
                monitor.is_active = False
                if not process.is_running():
                    monitor.status = 0
                monitor.save()
            except (ModelClass.DoesNotExist, psutil.NoSuchProcess):
                pass
            logger.info(
                f"监控结束:\n"
                f"├─ 进程名称: {monitor.process_name}\n"
                f"├─ 进程ID: {monitor.pid}\n"
                f"├─ 监控类型: {resource_type}\n"
                f"├─ 进程状态: {'运行中' if monitor.status == 1 else '已终止'}\n"
                f"├─ 监控状态: 已停止\n"
                f"├─ 开始时间: {monitor.created_at}\n"
                f"└─ 结束时间: {monitor.updated_at}"
            )
def get_high_resource_processes():
    """Scan all processes and collect the heavy resource consumers.

    Thresholds: CPU above 200% (more than two full cores), resident memory
    above 20 GB, GPU utilisation above 50%.

    Returns:
        dict with keys 'cpu', 'gpu', 'memory'; each value is a list of
        per-process info dicts (pid, name plus the relevant usage figures).

    Note: cpu_percent(interval=1.0) blocks for one second per process, so a
    full scan of a busy host is slow.
    """
    high_resource_procs = {
        'cpu': [],
        'gpu': [],
        'memory': []
    }
    for proc in psutil.process_iter(['pid', 'name']):
        try:
            process = psutil.Process(proc.info['pid'])

            # CPU: flag anything using more than two full cores.
            cpu_percent = process.cpu_percent(interval=1.0)
            if cpu_percent > 200:
                high_resource_procs['cpu'].append({
                    'pid': proc.info['pid'],
                    'name': proc.info['name'],
                    'cpu_usage': cpu_percent,
                    'cpu_cores': cpu_percent / 100
                })

            # Memory: flag anything holding more than 20 GB of RSS.
            memory_gb = process.memory_info().rss / (1024 * 1024 * 1024)
            if memory_gb > 20:
                high_resource_procs['memory'].append({
                    'pid': proc.info['pid'],
                    'name': proc.info['name'],
                    'memory_usage': memory_gb
                })

            # GPU: get_process_gpu_usage returns (util %, memory MB, status).
            # The previous version unpacked these in the wrong order AND
            # referenced an undefined `gpu_memory`, so any process over the
            # GPU threshold raised an uncaught NameError.
            gpu_usage, gpu_memory, _gpu_status = get_process_gpu_usage(proc.info['pid'])
            if gpu_usage > 50:
                high_resource_procs['gpu'].append({
                    'pid': proc.info['pid'],
                    'name': proc.info['name'],
                    'gpu_usage': gpu_usage,
                    'gpu_memory': gpu_memory,
                    # The helper does not report which GPU the process is on;
                    # key kept for backward compatibility with consumers.
                    'gpu_index': None
                })
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            continue
    return high_resource_procs