407 lines
18 KiB
Python
407 lines
18 KiB
Python
import psutil
|
||
import time
|
||
from .models import HighCPUProcess, HighGPUProcess, HighMemoryProcess
|
||
import GPUtil # 需要安装 gputil
|
||
from django.utils import timezone
|
||
import os
|
||
import logging
|
||
from datetime import datetime
|
||
from django.db import models
|
||
from django.conf import settings
|
||
|
||
def setup_logger(pid, resource_type):
    """Create (or reuse) the dedicated file logger for one monitored process.

    Log files live under ``<BASE_DIR>/logs/process_monitor/<resource_type>/``
    and are named ``process_<pid>_<YYYYMMDD>.log`` using today's date.

    Args:
        pid: ID of the process being monitored; part of the logger name and
            of the log file name.
        resource_type: Monitoring category; used as the sub-directory name
            and as the logger name prefix.

    Returns:
        A ``(logger, log_file)`` tuple: the ``logging.Logger`` instance and
        the absolute path computed for today's log file.
    """
    from django.conf import settings
    import os

    # Resolve the target directory and make sure it exists.
    target_dir = os.path.join(
        settings.BASE_DIR, 'logs', 'process_monitor', resource_type
    )
    os.makedirs(target_dir, exist_ok=True)

    date_stamp = datetime.now().strftime("%Y%m%d")
    log_file = os.path.abspath(
        os.path.join(target_dir, f'process_{pid}_{date_stamp}.log')
    )

    process_logger = logging.getLogger(f'{resource_type}_process_{pid}')
    process_logger.setLevel(logging.INFO)

    # A logger that already has a handler is reused as-is, so repeated calls
    # do not produce duplicated log lines.
    if process_logger.handlers:
        return process_logger, log_file

    # Write the file as UTF-8 explicitly.
    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )
    process_logger.addHandler(file_handler)

    return process_logger, log_file
|
||
|
||
def get_process_gpu_usage(pid):
    """Report the NVIDIA GPU usage attributed to one process.

    All data is read from the ``nvidia-smi`` command line tool. Memory is the
    sum of the ``used_memory`` values of every compute-app row whose pid field
    equals ``pid`` exactly. Utilisation is the highest ``utilization.gpu``
    value among the GPUs that process appears on (``nvidia-smi`` only reports
    utilisation per GPU, not per process); if none of those GPUs can be
    matched, the highest value across all GPUs is used instead.

    Args:
        pid: ID of the process to look up.

    Returns:
        A ``(gpu_util, gpu_memory, status)`` tuple. ``gpu_util`` and
        ``gpu_memory`` are the raw numbers printed by ``nvidia-smi`` with the
        ``nounits`` format option. ``status`` is one of ``"正常"``,
        ``"无GPU"``, ``"驱动未安装"``, ``"未使用GPU"``, ``"获取超时"``,
        ``"命令错误"``, ``"获取错误"`` or ``"检测失败"``; for every status other
        than ``"正常"`` both numbers are ``0``. The function never raises.
    """
    import logging
    import shutil
    import subprocess

    logger = logging.getLogger('gpu_check')

    try:
        # Step 1: make sure the NVIDIA tooling exists and sees a GPU.
        try:
            # shutil.which is portable, unlike spawning the `which` binary.
            smi_path = shutil.which('nvidia-smi')
            if smi_path is None:
                raise FileNotFoundError("nvidia-smi not found on PATH")
            logger.info(f"找到 nvidia-smi: {smi_path}")

            # List the GPUs; bounded so a wedged driver cannot hang the caller.
            gpu_list = subprocess.check_output(
                ['nvidia-smi', '-L'], timeout=5
            ).decode('utf-8').strip()
            logger.info(f"检测到的GPU:\n{gpu_list}")

            if not gpu_list:
                logger.warning("未检测到NVIDIA GPU")
                return 0, 0, "无GPU"

        except subprocess.TimeoutExpired as e:
            logger.error(f"GPU命令超时: {str(e)}")
            return 0, 0, "获取超时"
        except (subprocess.SubprocessError, FileNotFoundError) as e:
            logger.error(f"nvidia-smi 命令错误: {str(e)}")
            return 0, 0, "驱动未安装"

        # Step 2: query per-GPU and per-process figures.
        try:
            # Per-GPU data; the uuid lets us relate GPUs to compute-app rows.
            cmd = [
                'nvidia-smi',
                '--format=csv,noheader,nounits',
                '--query-gpu=uuid,utilization.gpu,memory.used',
            ]
            gpu_output = subprocess.check_output(cmd, timeout=5).decode('utf-8').strip()
            logger.info(f"GPU基本信息:\n{gpu_output}")

            # Per-process data.
            cmd_process = [
                'nvidia-smi',
                '--format=csv,noheader,nounits',
                '--query-compute-apps=gpu_uuid,pid,used_memory',
            ]
            process_output = subprocess.check_output(cmd_process, timeout=5).decode('utf-8').strip()
            logger.info(f"GPU进程信息:\n{process_output}")

            # No process at all is using a GPU.
            if not process_output:
                logger.info(f"进程 {pid} 未使用GPU")
                return 0, 0, "未使用GPU"

            # Sum the memory of rows belonging to this pid. The pid column is
            # compared exactly: a substring test on the whole row would also
            # match other pids, memory values or UUID digits.
            target_pid = str(pid)
            total_memory = 0
            process_gpus = set()
            for line in process_output.split('\n'):
                fields = [field.strip() for field in line.split(',')]
                if len(fields) != 3:
                    if line.strip():
                        logger.error(
                            f"解析GPU进程数据错误: expected 3 fields, "
                            f"got {len(fields)}, 数据: {line}"
                        )
                    continue

                gpu_uuid, row_pid, memory = fields
                if row_pid != target_pid:
                    continue

                try:
                    total_memory += float(memory)
                except ValueError as e:
                    # e.g. "[N/A]" when the driver cannot attribute memory.
                    logger.error(f"解析GPU进程数据错误: {str(e)}, 数据: {line}")
                    continue
                process_gpus.add(gpu_uuid)

            if total_memory <= 0:
                logger.info(f"进程 {pid} 未使用GPU")
                return 0, 0, "未使用GPU"

            # Pick the highest utilisation among the GPUs the process is on.
            matched_utils = []
            all_utils = []
            for line in gpu_output.split('\n'):
                try:
                    gpu_uuid, util, _ = (field.strip() for field in line.split(','))
                    util_value = float(util)
                except ValueError as e:
                    logger.error(f"解析GPU使用率错误: {str(e)}, 数据: {line}")
                    continue
                all_utils.append(util_value)
                if gpu_uuid in process_gpus:
                    matched_utils.append(util_value)

            # Fall back to every GPU when the uuids could not be matched.
            gpu_util = max(matched_utils or all_utils, default=0)

            logger.info(
                f"进程 {pid} GPU使用情况:\n"
                f"├─ GPU使用率: {gpu_util:.1f}%\n"
                f"└─ 显存使用: {total_memory:.1f}MB"
            )
            return gpu_util, total_memory, "正常"

        except subprocess.TimeoutExpired as e:
            logger.error(f"GPU命令超时: {str(e)}")
            return 0, 0, "获取超时"
        except subprocess.CalledProcessError as e:
            logger.error(f"GPU命令执行错误: {str(e)}, 输出: {e.output.decode('utf-8') if e.output else 'None'}")
            return 0, 0, "命令错误"
        except Exception as e:
            logger.error(f"获取GPU信息时发生错误: {str(e)}")
            return 0, 0, "获取错误"

    except Exception as e:
        # Last-resort boundary: GPU probing must never break the caller.
        logger.error(f"GPU检测失败: {str(e)}")
        return 0, 0, "检测失败"
|
||
|
||
def monitor_process(pid, resource_type):
    """Poll one process and persist/log its resource usage until it stops.

    A monitoring record of the model matching ``resource_type`` is created,
    then refreshed, updated and saved once per interval. Each iteration also
    writes a status entry, a GPU entry and a detailed resource report to the
    per-process log returned by ``setup_logger``.

    The loop ends when any of the following happens:

    * the current thread's ``do_run`` attribute becomes falsy,
    * the record's ``is_active`` flag is found cleared after a refresh,
    * the record no longer exists,
    * the process is no longer running or reaches a stopped/zombie/dead state.

    On exit the record (if one was created) is marked inactive and a closing
    summary is logged.

    Args:
        pid: ID of the process to monitor.
        resource_type: ``'cpu'``, ``'gpu'`` or ``'memory'``; selects the
            model class. Any other value is logged as an error and the
            function returns without monitoring.

    Returns:
        None. Errors are logged, not raised.
    """
    # Polling interval comes from the Django settings; default 60 seconds.
    from django.conf import settings
    MONITOR_INTERVAL = getattr(settings, 'MONITOR_INTERVAL', 60)  # seconds

    logger, log_file = setup_logger(pid, resource_type)
    monitor = None

    try:
        process = psutil.Process(pid)
        logger.info(
            f"开始监控进程:\n"
            f"├─ 进程名称: {process.name()}\n"
            f"├─ 进程ID: {pid}\n"
            f"├─ 监控类型: {resource_type}\n"
            f"├─ 监控间隔: {MONITOR_INTERVAL}秒\n"
            f"└─ 开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        )

        # Pick the model class for this resource type.
        ModelClass = {
            'cpu': HighCPUProcess,
            'gpu': HighGPUProcess,
            'memory': HighMemoryProcess
        }.get(resource_type)

        if not ModelClass:
            logger.error(f"未知的资源类型: {resource_type}")
            return

        # Create and store the monitoring record.
        monitor = ModelClass.objects.create(
            pid=pid,
            process_name=process.name(),
            log_path=log_file,
            status=1,
            is_active=True
        )
        logger.info("创建新的监控记录")

        # Name the thread and give it a flag other code can clear to stop us.
        import threading
        current_thread = threading.current_thread()
        current_thread.name = f'monitor_{pid}_{resource_type}'
        current_thread.do_run = True

        # psutil status -> status code stored on the record
        # (1 = treated as running, 0 = treated as not running).
        status_codes = {
            psutil.STATUS_RUNNING: 1,
            psutil.STATUS_SLEEPING: 1,
            psutil.STATUS_DISK_SLEEP: 1,
            psutil.STATUS_STOPPED: 0,
            psutil.STATUS_TRACING_STOP: 0,
            psutil.STATUS_ZOMBIE: 0,
            psutil.STATUS_DEAD: 0,
            psutil.STATUS_WAKING: 1,
            psutil.STATUS_IDLE: 1,
        }

        # Status returned by get_process_gpu_usage -> text shown in the log.
        gpu_status_map = {
            "无GPU": "未检测到GPU",
            "驱动未安装": "GPU驱动未安装",
            "未使用GPU": "进程未使用GPU",
            "获取超时": "GPU信息获取超时",
            "命令错误": "GPU命令执行错误",
            "获取错误": "GPU信息获取错误",
            "检测失败": "GPU检测失败",
            "正常": "正常"
        }

        while current_thread.do_run:
            try:
                # Stop if the record was deactivated or deleted externally.
                try:
                    monitor.refresh_from_db()
                    if not monitor.is_active:
                        logger.info("监控被手动停止")
                        break
                except ModelClass.DoesNotExist:
                    logger.error("监控记录已被删除")
                    break

                # Stop if the process is gone.
                if not process.is_running():
                    logger.warning(f"进程 {pid} 已终止")
                    monitor.status = 0
                    monitor.is_active = False
                    monitor.save()
                    break

                process_status = process.status()
                # Unknown statuses default to 1 (running).
                status_code = status_codes.get(process_status, 1)
                monitor.status = status_code

                logger.info(
                    f"进程状态:\n"
                    f"├─ 状态码: {status_code}\n"
                    f"├─ 状态描述: {process_status}\n"
                    f"├─ 监控状态: {'活跃' if monitor.is_active else '已停止'}\n"
                    f"└─ 运行时长: {datetime.now() - datetime.fromtimestamp(process.create_time())}"
                )

                # Collect resource usage.
                with process.oneshot():
                    # CPU
                    cpu_percent = process.cpu_percent()
                    cpu_times = process.cpu_times()
                    cpu_num = psutil.cpu_count()
                    cpu_freq = psutil.cpu_freq()

                    # Memory
                    memory_info = process.memory_info()
                    memory_percent = process.memory_percent()
                    # memory_maps needs extra privileges / is missing on some
                    # platforms; treat it as optional like the IO counters so
                    # one unavailable metric does not abort the whole sample.
                    try:
                        memory_maps = len(process.memory_maps())
                    except (psutil.AccessDenied, AttributeError):
                        memory_maps = None
                    virtual_memory = psutil.virtual_memory()
                    swap_memory = psutil.swap_memory()

                    # GPU
                    try:
                        gpu_usage, gpu_memory, gpu_status = get_process_gpu_usage(pid)
                        gpu_status_text = gpu_status_map.get(gpu_status, "未知状态")
                    except Exception as e:
                        logger.error(f"获取GPU信息失败: {str(e)}")
                        gpu_usage, gpu_memory, gpu_status_text = 0, 0, "异常"

                    logger.info(
                        f"GPU信息\n"
                        f"├─ 状态: {gpu_status_text}\n"
                        f"├─ 使用率: {gpu_usage:.1f}%\n"
                        f"└─ 显存使用: {gpu_memory:.1f}MB"
                    )

                    # IO
                    try:
                        io_counters = process.io_counters()
                        disk_io = psutil.disk_io_counters()
                    except (psutil.AccessDenied, AttributeError):
                        io_counters = None
                        disk_io = None

                    # Network
                    try:
                        net_connections = len(process.connections())
                        net_io = psutil.net_io_counters()
                    except (psutil.AccessDenied, AttributeError):
                        net_connections = 0
                        net_io = None

                    # Misc
                    num_threads = process.num_threads()
                    num_fds = process.num_fds() if hasattr(process, 'num_fds') else 0
                    ctx_switches = process.num_ctx_switches()
                    num_children = len(process.children())

                # Update the record (memory figures stored in GB).
                monitor.cpu_usage = cpu_percent
                monitor.memory_usage = memory_info.rss / (1024 * 1024 * 1024)
                monitor.virtual_memory = memory_info.vms / (1024 * 1024 * 1024)
                monitor.gpu_usage = gpu_usage
                monitor.gpu_memory = gpu_memory

                # Resolve the optional values BEFORE building the report.
                # Writing `f"..." if x else "..."` between implicitly
                # concatenated literals turns the whole message into a single
                # conditional and silently drops every later line.
                unavailable = "无法获取"
                # cpu_freq() returns None where the frequency is unknown.
                cpu_freq_text = f"{cpu_freq.current:.1f}MHz" if cpu_freq else unavailable
                memory_maps_text = f"{memory_maps}个" if memory_maps is not None else unavailable
                if io_counters:
                    io_read_text = f"{io_counters.read_bytes/1024/1024:.1f}MB ({io_counters.read_count}次)"
                    io_write_text = f"{io_counters.write_bytes/1024/1024:.1f}MB ({io_counters.write_count}次)"
                else:
                    io_read_text = io_write_text = unavailable
                if net_io:
                    net_sent_text = f"{net_io.bytes_sent/1024/1024:.1f}MB"
                    net_recv_text = f"{net_io.bytes_recv/1024/1024:.1f}MB"
                else:
                    net_sent_text = net_recv_text = unavailable

                # Detailed resource report.
                logger.info(
                    f"资源使用情况 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}:\n"
                    f"├─ CPU信息\n"
                    f"│ ├─ 使用率: {cpu_percent:.1f}%\n"
                    f"│ ├─ 用户态时间: {cpu_times.user:.1f}s\n"
                    f"│ ├─ 内核态时间: {cpu_times.system:.1f}s\n"
                    f"│ ├─ CPU核心数: {cpu_num}\n"
                    f"│ ├─ CPU频率: {cpu_freq_text}\n"
                    f"│ └─ 上下文切换: {ctx_switches.voluntary}/{ctx_switches.involuntary}\n"
                    f"├─ 内存信息\n"
                    f"│ ├─ 物理内存: {memory_info.rss/1024/1024:.1f}MB ({memory_percent:.1f}%)\n"
                    f"│ ├─ 虚拟内存: {memory_info.vms/1024/1024:.1f}MB\n"
                    f"│ ├─ 内存映射: {memory_maps_text}\n"
                    f"│ ├─ 系统内存使用: {virtual_memory.percent:.1f}%\n"
                    f"│ └─ 交换空间使用: {swap_memory.percent:.1f}%\n"
                    f"├─ GPU信息\n"
                    f"│ ├─ 状态: {gpu_status_text}\n"
                    f"│ ├─ 使用率: {gpu_usage:.1f}%\n"
                    f"│ └─ 显存使用: {gpu_memory:.1f}MB\n"
                    f"├─ IO信息\n"
                    f"│ ├─ 读取: {io_read_text}\n"
                    f"│ └─ 写入: {io_write_text}\n"
                    f"├─ 网络信息\n"
                    f"│ ├─ 连接数: {net_connections}\n"
                    f"│ ├─ 发送: {net_sent_text}\n"
                    f"│ └─ 接收: {net_recv_text}\n"
                    f"└─ 其他信息\n"
                    f" ├─ 线程数: {num_threads}\n"
                    f" ├─ 文件描述符: {num_fds}\n"
                    f" └─ 子进程数: {num_children}"
                )

                # A process that is no longer alive ends the monitoring.
                if status_code == 0:
                    monitor.is_active = False
                    monitor.save()
                    logger.info(f"进程状态变为 {process_status},停止监控")
                    break

                monitor.save()
                time.sleep(MONITOR_INTERVAL)  # configured interval

            except Exception as e:
                # Keep the monitor alive across transient failures.
                logger.error(f"监控出错: {str(e)}")
                logger.exception("详细错误信息:")
                time.sleep(5)  # short back-off before retrying
                continue

    except Exception as e:
        logger.error(f"监控初始化失败: {str(e)}")
        logger.exception("详细错误信息:")
    finally:
        # `monitor` is only set after `process` and `ModelClass` exist, so
        # both names are safe to use inside this guard.
        if monitor:
            try:
                monitor.refresh_from_db()
                monitor.is_active = False
                if not process.is_running():
                    monitor.status = 0
                monitor.save()
            except (ModelClass.DoesNotExist, psutil.NoSuchProcess):
                pass

            logger.info(
                f"监控结束:\n"
                f"├─ 进程名称: {monitor.process_name}\n"
                f"├─ 进程ID: {monitor.pid}\n"
                f"├─ 监控类型: {resource_type}\n"
                f"├─ 进程状态: {'运行中' if monitor.status == 1 else '已终止'}\n"
                f"├─ 监控状态: 已停止\n"
                f"├─ 开始时间: {monitor.created_at}\n"
                f"└─ 结束时间: {monitor.updated_at}"
            )
|
||
|
||
def get_high_resource_processes():
    """Scan all processes and collect those with heavy resource usage.

    Thresholds:

    * CPU: ``cpu_percent`` above 200 (more than two cores' worth).
    * Memory: resident set size above 20 GB.
    * GPU: utilisation reported by ``get_process_gpu_usage`` above 50.

    CPU usage is sampled for every process over one shared ~1 second window
    (counters are primed, then read after a single sleep) instead of blocking
    one second per process.

    Returns:
        A dict with the keys ``'cpu'``, ``'gpu'`` and ``'memory'``, each
        mapping to a list of dicts describing the matching processes:

        * cpu: ``pid``, ``name``, ``cpu_usage``, ``cpu_cores``
        * memory: ``pid``, ``name``, ``memory_usage`` (GB)
        * gpu: ``pid``, ``name``, ``gpu_usage``, ``gpu_memory``, ``gpu_index``

        Processes that vanish, are zombies or deny access are skipped.
    """
    high_resource_procs = {
        'cpu': [],
        'gpu': [],
        'memory': []
    }
    skippable = (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess)

    # Pass 1: prime the per-process CPU counters. The first non-blocking
    # cpu_percent() call only records a baseline and returns a meaningless 0.
    candidates = []
    for proc in psutil.process_iter(['pid', 'name']):
        try:
            proc.cpu_percent(interval=None)
        except skippable:
            continue
        candidates.append(proc)

    # One shared measurement window for all processes.
    time.sleep(1.0)

    # Pass 2: read the figures and apply the thresholds.
    for proc in candidates:
        try:
            pid = proc.info['pid']
            name = proc.info['name']

            # CPU usage since the baseline taken in pass 1.
            cpu_percent = proc.cpu_percent(interval=None)
            if cpu_percent > 200:  # more than 2 cores
                high_resource_procs['cpu'].append({
                    'pid': pid,
                    'name': name,
                    'cpu_usage': cpu_percent,
                    'cpu_cores': cpu_percent / 100
                })

            # Resident memory in GB.
            memory_gb = proc.memory_info().rss / (1024 * 1024 * 1024)
            if memory_gb > 20:  # more than 20 GB
                high_resource_procs['memory'].append({
                    'pid': pid,
                    'name': name,
                    'memory_usage': memory_gb
                })

            # get_process_gpu_usage returns (utilisation, memory, status).
            gpu_usage, gpu_memory, _gpu_status = get_process_gpu_usage(pid)
            if gpu_usage > 50:  # GPU utilisation above 50%
                high_resource_procs['gpu'].append({
                    'pid': pid,
                    'name': name,
                    'gpu_usage': gpu_usage,
                    'gpu_memory': gpu_memory,
                    # Key kept for callers; the GPU helper reports no device
                    # index, so there is nothing meaningful to store here.
                    'gpu_index': None
                })

        except skippable:
            continue

    return high_resource_procs
|