automated_task_monitor/monitor/tasks.py
2025-02-18 19:40:58 +08:00

407 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import psutil
import time
from .models import HighCPUProcess, HighGPUProcess, HighMemoryProcess
import GPUtil # 需要安装 gputil
from django.utils import timezone
import os
import logging
from datetime import datetime
from django.db import models
from django.conf import settings
def setup_logger(pid, resource_type):
    """Create a dedicated file logger for one monitored process.

    Args:
        pid: OS process id being monitored; embedded in the logger name and
            the log file name so each process gets its own log stream.
        resource_type: resource category ('cpu', 'gpu' or 'memory'); used as
            a sub-directory under logs/process_monitor.

    Returns:
        tuple: (logger, log_file) — the configured ``logging.Logger`` and
        the absolute path of the file it writes to.
    """
    # Build the full log path: <BASE_DIR>/logs/process_monitor/<type>/
    # (settings and os are already imported at module level)
    log_dir = os.path.join(settings.BASE_DIR, 'logs', 'process_monitor', resource_type)
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.abspath(
        os.path.join(log_dir, f'process_{pid}_{datetime.now().strftime("%Y%m%d")}.log')
    )

    logger = logging.getLogger(f'{resource_type}_process_{pid}')
    logger.setLevel(logging.INFO)
    # Attach the handler only once: repeated calls for the same pid reuse
    # the cached logger, and a second handler would duplicate every line.
    if not logger.handlers:
        # Explicit utf-8 so non-ASCII (Chinese) log messages are written intact.
        handler = logging.FileHandler(log_file, encoding='utf-8')
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger, log_file
def get_process_gpu_usage(pid):
    """Query nvidia-smi for one process's GPU usage.

    Args:
        pid: OS process id to look up.

    Returns:
        tuple: (gpu_util_percent, gpu_memory_mb, status) where status is a
        short Chinese label — "正常" on success, otherwise a reason such as
        "无GPU", "驱动未安装", "未使用GPU", "获取超时", "命令错误",
        "获取错误" or "检测失败". On any failure the numbers are (0, 0).
    """
    import subprocess
    # Acquired before the outer try so the except handlers can always log.
    logger = logging.getLogger('gpu_check')
    try:
        # --- Verify the NVIDIA driver / nvidia-smi is available ---
        try:
            gpu_info = subprocess.check_output(['which', 'nvidia-smi']).decode('utf-8').strip()
            logger.info(f"找到 nvidia-smi: {gpu_info}")
            gpu_list = subprocess.check_output(['nvidia-smi', '-L']).decode('utf-8').strip()
            logger.info(f"检测到的GPU:\n{gpu_list}")
            if not gpu_list:
                logger.warning("未检测到NVIDIA GPU")
                return 0, 0, "无GPU"
        except (subprocess.SubprocessError, FileNotFoundError) as e:
            logger.error(f"nvidia-smi 命令错误: {str(e)}")
            return 0, 0, "驱动未安装"

        try:
            # Per-GPU utilisation and memory (CSV, one GPU per line).
            cmd = ['nvidia-smi', '--format=csv,noheader,nounits', '--query-gpu=index,utilization.gpu,memory.used']
            gpu_output = subprocess.check_output(cmd, timeout=5).decode('utf-8').strip()
            logger.info(f"GPU基本信息:\n{gpu_output}")

            # Per-process GPU memory (CSV: gpu_uuid, pid, used_memory).
            cmd_process = ['nvidia-smi', '--format=csv,noheader,nounits', '--query-compute-apps=gpu_uuid,pid,used_memory']
            process_output = subprocess.check_output(cmd_process, timeout=5).decode('utf-8').strip()
            logger.info(f"GPU进程信息:\n{process_output}")

            # No compute process at all is using the GPU.
            if not process_output:
                logger.info(f"进程 {pid} 未使用GPU")
                return 0, 0, "未使用GPU"

            total_memory = 0
            gpu_util = 0
            # Sum the memory of every entry belonging to this pid.
            for line in process_output.split('\n'):
                try:
                    _, line_pid, memory = line.split(',')
                    # BUGFIX: compare the pid field exactly. The old
                    # `str(pid) in line` substring test also matched
                    # unrelated pids (123 inside 1234) and memory values.
                    if int(line_pid.strip()) != pid:
                        continue
                    total_memory += float(memory.strip())
                except (ValueError, IndexError) as e:
                    logger.error(f"解析GPU进程数据错误: {str(e)}, 数据: {line}")
                    continue

            if total_memory > 0:
                # Report the busiest GPU's utilisation (per-process GPU
                # utilisation is not exposed by this nvidia-smi query).
                for line in gpu_output.split('\n'):
                    try:
                        _, util, _ = line.split(',')
                        gpu_util = max(gpu_util, float(util.strip()))
                    except (ValueError, IndexError) as e:
                        logger.error(f"解析GPU使用率错误: {str(e)}, 数据: {line}")
                        continue
                logger.info(
                    f"进程 {pid} GPU使用情况:\n"
                    f"├─ GPU使用率: {gpu_util:.1f}%\n"
                    f"└─ 显存使用: {total_memory:.1f}MB"
                )
                return gpu_util, total_memory, "正常"
            else:
                logger.info(f"进程 {pid} 未使用GPU")
                return 0, 0, "未使用GPU"
        except subprocess.TimeoutExpired as e:
            logger.error(f"GPU命令超时: {str(e)}")
            return 0, 0, "获取超时"
        except subprocess.CalledProcessError as e:
            logger.error(f"GPU命令执行错误: {str(e)}, 输出: {e.output.decode('utf-8') if e.output else 'None'}")
            return 0, 0, "命令错误"
        except Exception as e:
            logger.error(f"获取GPU信息时发生错误: {str(e)}")
            return 0, 0, "获取错误"
    except Exception as e:
        logger.error(f"GPU检测失败: {str(e)}")
        return 0, 0, "检测失败"
def monitor_process(pid, resource_type):
    """Continuously monitor one process's CPU/memory/GPU usage.

    Intended to run in its own thread: polls the process every
    MONITOR_INTERVAL seconds, writes detailed usage to a per-process log
    file and keeps a High*Process DB record up to date. Stops when the
    process exits, the DB record is deactivated or deleted, or the
    thread's ``do_run`` flag is cleared.

    Args:
        pid: OS process id to watch.
        resource_type: 'cpu', 'gpu' or 'memory' — selects the model class
            used to persist the monitoring record.
    """
    # Poll interval from Django settings, defaulting to 60 seconds.
    from django.conf import settings
    MONITOR_INTERVAL = getattr(settings, 'MONITOR_INTERVAL', 60)  # seconds

    logger, log_file = setup_logger(pid, resource_type)
    monitor = None
    try:
        process = psutil.Process(pid)
        logger.info(
            f"开始监控进程:\n"
            f"├─ 进程名称: {process.name()}\n"
            f"├─ 进程ID: {pid}\n"
            f"├─ 监控类型: {resource_type}\n"
            f"├─ 监控间隔: {MONITOR_INTERVAL}\n"
            f"└─ 开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        )

        # Pick the model matching the monitored resource type.
        ModelClass = {
            'cpu': HighCPUProcess,
            'gpu': HighGPUProcess,
            'memory': HighMemoryProcess
        }.get(resource_type)
        if not ModelClass:
            logger.error(f"未知的资源类型: {resource_type}")
            return

        # Create the DB record the UI uses to display/stop this monitor.
        monitor = ModelClass.objects.create(
            pid=pid,
            process_name=process.name(),
            log_path=log_file,
            status=1,
            is_active=True
        )
        logger.info("创建新的监控记录")

        # Name the worker thread and give it a cooperative stop flag.
        import threading
        current_thread = threading.current_thread()
        current_thread.name = f'monitor_{pid}_{resource_type}'
        current_thread.do_run = True

        while current_thread.do_run:
            try:
                # Re-read the record: the monitor may be stopped manually.
                try:
                    monitor.refresh_from_db()
                    if not monitor.is_active:
                        logger.info("监控被手动停止")
                        break
                except ModelClass.DoesNotExist:
                    logger.error("监控记录已被删除")
                    break

                # Stop once the target process has exited.
                if not process.is_running():
                    logger.warning(f"进程 {pid} 已终止")
                    monitor.status = 0
                    monitor.is_active = False
                    monitor.save()
                    break

                process_status = process.status()
                # Map psutil states onto the model's status field:
                # 1 = alive/normal, 0 = stopped/zombie/dead.
                status_map = {
                    psutil.STATUS_RUNNING: 1,        # running
                    psutil.STATUS_SLEEPING: 1,       # sleeping (normal)
                    psutil.STATUS_DISK_SLEEP: 1,     # disk sleep (normal)
                    psutil.STATUS_STOPPED: 0,        # stopped
                    psutil.STATUS_TRACING_STOP: 0,   # tracing stop
                    psutil.STATUS_ZOMBIE: 0,         # zombie
                    psutil.STATUS_DEAD: 0,           # dead
                    psutil.STATUS_WAKING: 1,         # waking
                    psutil.STATUS_IDLE: 1,           # idle (normal)
                }.get(process_status, 1)  # unknown states count as running
                monitor.status = status_map

                logger.info(
                    f"进程状态:\n"
                    f"├─ 状态码: {status_map}\n"
                    f"├─ 状态描述: {process_status}\n"
                    f"├─ 监控状态: {'活跃' if monitor.is_active else '已停止'}\n"
                    f"└─ 运行时长: {datetime.now() - datetime.fromtimestamp(process.create_time())}"
                )

                # oneshot() caches process info so the reads below are cheap.
                with process.oneshot():
                    # CPU
                    cpu_percent = process.cpu_percent()
                    cpu_times = process.cpu_times()
                    cpu_num = psutil.cpu_count()
                    cpu_freq = psutil.cpu_freq()
                    # Memory
                    memory_info = process.memory_info()
                    memory_percent = process.memory_percent()
                    memory_maps = len(process.memory_maps())
                    virtual_memory = psutil.virtual_memory()
                    swap_memory = psutil.swap_memory()
                    # GPU
                    try:
                        gpu_usage, gpu_memory, gpu_status = get_process_gpu_usage(pid)
                        gpu_status_map = {
                            "无GPU": "未检测到GPU",
                            "驱动未安装": "GPU驱动未安装",
                            "未使用GPU": "进程未使用GPU",
                            "获取超时": "GPU信息获取超时",
                            "命令错误": "GPU命令执行错误",
                            "获取错误": "GPU信息获取错误",
                            "检测失败": "GPU检测失败",
                            "正常": "正常"
                        }
                        gpu_status_text = gpu_status_map.get(gpu_status, "未知状态")
                    except Exception as e:
                        logger.error(f"获取GPU信息失败: {str(e)}")
                        gpu_usage, gpu_memory, gpu_status_text = 0, 0, "异常"

                    logger.info(
                        f"GPU信息\n"
                        f"├─ 状态: {gpu_status_text}\n"
                        f"├─ 使用率: {gpu_usage:.1f}%\n"
                        f"└─ 显存使用: {gpu_memory:.1f}MB"
                    )

                    # IO counters may be unavailable without privileges.
                    try:
                        io_counters = process.io_counters()
                        disk_io = psutil.disk_io_counters()
                    except (psutil.AccessDenied, AttributeError):
                        io_counters = None
                        disk_io = None

                    # Network
                    try:
                        net_connections = len(process.connections())
                        net_io = psutil.net_io_counters()
                    except (psutil.AccessDenied, AttributeError):
                        net_connections = 0
                        net_io = None

                    # Misc: num_fds is POSIX-only, hence the hasattr guard.
                    num_threads = process.num_threads()
                    num_fds = process.num_fds() if hasattr(process, 'num_fds') else 0
                    ctx_switches = process.num_ctx_switches()

                    # Persist the headline numbers on the DB record.
                    monitor.cpu_usage = cpu_percent
                    monitor.memory_usage = memory_info.rss / (1024 * 1024 * 1024)  # GB
                    monitor.virtual_memory = memory_info.vms / (1024 * 1024 * 1024)  # GB
                    monitor.gpu_usage = gpu_usage
                    monitor.gpu_memory = gpu_memory

                    # BUGFIX: the original built this message with inline
                    # `f"..." if cond else "..."` fragments between adjacent
                    # string literals; implicit concatenation binds tighter
                    # than the conditional expression, so the whole message
                    # collapsed into nested conditionals and logged garbage.
                    # Pre-compute the conditional lines instead.
                    if io_counters:
                        io_read_line = f"│ ├─ 读取: {io_counters.read_bytes/1024/1024:.1f}MB ({io_counters.read_count}次)\n"
                        io_write_line = f"│ └─ 写入: {io_counters.write_bytes/1024/1024:.1f}MB ({io_counters.write_count}次)\n"
                    else:
                        io_read_line = "│ ├─ 读取: 无法获取\n"
                        io_write_line = "│ └─ 写入: 无法获取\n"
                    if net_io:
                        net_sent_line = f"│ ├─ 发送: {net_io.bytes_sent/1024/1024:.1f}MB\n"
                        net_recv_line = f"│ └─ 接收: {net_io.bytes_recv/1024/1024:.1f}MB\n"
                    else:
                        net_sent_line = "│ ├─ 发送: 无法获取\n"
                        net_recv_line = "│ └─ 接收: 无法获取\n"

                    logger.info(
                        f"资源使用情况 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}:\n"
                        f"├─ CPU信息\n"
                        f"│ ├─ 使用率: {cpu_percent:.1f}%\n"
                        f"│ ├─ 用户态时间: {cpu_times.user:.1f}s\n"
                        f"│ ├─ 内核态时间: {cpu_times.system:.1f}s\n"
                        f"│ ├─ CPU核心数: {cpu_num}\n"
                        f"│ ├─ CPU频率: {cpu_freq.current:.1f}MHz\n"
                        f"│ └─ 上下文切换: {ctx_switches.voluntary}/{ctx_switches.involuntary}\n"
                        f"├─ 内存信息\n"
                        f"│ ├─ 物理内存: {memory_info.rss/1024/1024:.1f}MB ({memory_percent:.1f}%)\n"
                        f"│ ├─ 虚拟内存: {memory_info.vms/1024/1024:.1f}MB\n"
                        f"│ ├─ 内存映射: {memory_maps}\n"
                        f"│ ├─ 系统内存使用: {virtual_memory.percent:.1f}%\n"
                        f"│ └─ 交换空间使用: {swap_memory.percent:.1f}%\n"
                        f"├─ GPU信息\n"
                        f"│ ├─ 状态: {gpu_status_text}\n"
                        f"│ ├─ 使用率: {gpu_usage:.1f}%\n"
                        f"│ └─ 显存使用: {gpu_memory:.1f}MB\n"
                        f"├─ IO信息\n"
                        f"{io_read_line}"
                        f"{io_write_line}"
                        f"├─ 网络信息\n"
                        f"│ ├─ 连接数: {net_connections}\n"
                        f"{net_sent_line}"
                        f"{net_recv_line}"
                        f"└─ 其他信息\n"
                        f" ├─ 线程数: {num_threads}\n"
                        f" ├─ 文件描述符: {num_fds}\n"
                        f" └─ 子进程数: {len(process.children())}"
                    )

                # Exit once the process transitioned to a dead state.
                if status_map == 0:
                    monitor.is_active = False
                    monitor.save()
                    logger.info(f"进程状态变为 {process_status},停止监控")
                    break

                monitor.save()
                time.sleep(MONITOR_INTERVAL)  # configured poll interval
            except Exception as e:
                logger.error(f"监控出错: {str(e)}")
                logger.exception("详细错误信息:")
                time.sleep(5)  # brief back-off after an error
                continue
    except Exception as e:
        logger.error(f"监控初始化失败: {str(e)}")
        logger.exception("详细错误信息:")
    finally:
        # Guarded so a failed init (monitor is None) can't raise here.
        if monitor:
            try:
                monitor.refresh_from_db()
                monitor.is_active = False
                if not process.is_running():
                    monitor.status = 0
                monitor.save()
            except (ModelClass.DoesNotExist, psutil.NoSuchProcess):
                pass
            logger.info(
                f"监控结束:\n"
                f"├─ 进程名称: {monitor.process_name}\n"
                f"├─ 进程ID: {monitor.pid}\n"
                f"├─ 监控类型: {resource_type}\n"
                f"├─ 进程状态: {'运行中' if monitor.status == 1 else '已终止'}\n"
                f"├─ 监控状态: 已停止\n"
                f"├─ 开始时间: {monitor.created_at}\n"
                f"└─ 结束时间: {monitor.updated_at}"
            )
def get_high_resource_processes():
    """Scan all processes and collect those with high resource usage.

    Thresholds: CPU > 200% (more than two full cores), resident memory
    > 20 GB, GPU utilisation > 50%.

    Returns:
        dict: keys 'cpu', 'gpu', 'memory', each a list of per-process
        dicts with 'pid', 'name' and the relevant usage figures.

    NOTE(review): ``cpu_percent(interval=1.0)`` blocks one second per
    process, so a full scan is slow on busy hosts — confirm this is
    acceptable for the caller before tightening.
    """
    high_resource_procs = {
        'cpu': [],
        'gpu': [],
        'memory': []
    }
    for proc in psutil.process_iter(['pid', 'name']):
        try:
            process = psutil.Process(proc.info['pid'])

            # CPU: sampled over a 1-second window.
            cpu_percent = process.cpu_percent(interval=1.0)
            if cpu_percent > 200:  # using more than 2 cores
                high_resource_procs['cpu'].append({
                    'pid': proc.info['pid'],
                    'name': proc.info['name'],
                    'cpu_usage': cpu_percent,
                    'cpu_cores': cpu_percent / 100
                })

            # Memory: resident set size in GB.
            memory_gb = process.memory_info().rss / (1024 * 1024 * 1024)
            if memory_gb > 20:  # using more than 20 GB
                high_resource_procs['memory'].append({
                    'pid': proc.info['pid'],
                    'name': proc.info['name'],
                    'memory_usage': memory_gb
                })

            # BUGFIX: get_process_gpu_usage returns
            # (gpu_util, gpu_memory_mb, status); the old code unpacked it
            # as (gpu_index, gpu_usage, gpu_status) — mislabelled fields —
            # and then referenced an undefined `gpu_memory`, raising
            # NameError for any process over the GPU threshold.
            gpu_usage, gpu_memory, gpu_status = get_process_gpu_usage(proc.info['pid'])
            if gpu_usage > 50:  # GPU utilisation above 50%
                high_resource_procs['gpu'].append({
                    'pid': proc.info['pid'],
                    'name': proc.info['name'],
                    'gpu_usage': gpu_usage,
                    'gpu_memory': gpu_memory,
                    # Key kept for backward compatibility; the GPU index is
                    # not reported by get_process_gpu_usage.
                    'gpu_index': None
                })
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            continue
    return high_resource_procs