import os
import time
import logging
import threading
import subprocess
from datetime import datetime

import psutil
from django.conf import settings

from .models import HighCPUProcess, HighGPUProcess, HighMemoryProcess


def setup_logger(pid, resource_type):
    """Create a dedicated file logger for a monitored process."""
    # Build the full log path: <BASE_DIR>/logs/process_monitor/<resource_type>/
    log_dir = os.path.join(settings.BASE_DIR, 'logs', 'process_monitor', resource_type)
    os.makedirs(log_dir, exist_ok=True)

    log_file = os.path.abspath(os.path.join(
        log_dir, f'process_{pid}_{datetime.now().strftime("%Y%m%d")}.log'))

    logger = logging.getLogger(f'{resource_type}_process_{pid}')
    logger.setLevel(logging.INFO)

    if not logger.handlers:
        # Use UTF-8 explicitly so non-ASCII log messages are written correctly
        handler = logging.FileHandler(log_file, encoding='utf-8')
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger, log_file


def get_process_gpu_usage(pid):
    """Return (gpu_utilization %, gpu_memory MB, status) for a process."""
    try:
        logger = logging.getLogger('gpu_check')

        # Check that the NVIDIA driver / nvidia-smi is available
        try:
            gpu_info = subprocess.check_output(['which', 'nvidia-smi']).decode('utf-8').strip()
            logger.info(f"Found nvidia-smi: {gpu_info}")

            # List the detected GPUs
            gpu_list = subprocess.check_output(['nvidia-smi', '-L']).decode('utf-8').strip()
            logger.info(f"Detected GPUs:\n{gpu_list}")

            if not gpu_list:
                logger.warning("No NVIDIA GPU detected")
                return 0, 0, "No GPU"
        except (subprocess.SubprocessError, FileNotFoundError) as e:
            logger.error(f"nvidia-smi command error: {str(e)}")
            return 0, 0, "Driver not installed"

        try:
            # Overall utilization / memory for every GPU
            cmd = ['nvidia-smi', '--format=csv,noheader,nounits',
                   '--query-gpu=index,utilization.gpu,memory.used']
            gpu_output = subprocess.check_output(cmd, timeout=5).decode('utf-8').strip()
            logger.info(f"GPU summary:\n{gpu_output}")

            # Per-process GPU usage
            cmd_process = ['nvidia-smi', '--format=csv,noheader,nounits',
                           '--query-compute-apps=gpu_uuid,pid,used_memory']
            process_output = subprocess.check_output(cmd_process, timeout=5).decode('utf-8').strip()
            logger.info(f"GPU process info:\n{process_output}")

            # No process is using the GPU at all
            if not process_output:
                logger.info(f"Process {pid} is not using the GPU")
                return 0, 0, "Not using GPU"

            # Sum the GPU memory used by the given PID
            total_memory = 0
            gpu_util = 0

            for line in process_output.split('\n'):
                if str(pid) in line:
                    try:
                        _, _, memory = line.split(',')
                        total_memory += float(memory.strip())
                    except (ValueError, IndexError) as e:
                        logger.error(f"Error parsing GPU process data: {str(e)}, data: {line}")
                        continue

            # The process is using the GPU
            if total_memory > 0:
                # Report the highest utilization among all GPUs
                for line in gpu_output.split('\n'):
                    try:
                        _, util, _ = line.split(',')
                        gpu_util = max(gpu_util, float(util.strip()))
                    except (ValueError, IndexError) as e:
                        logger.error(f"Error parsing GPU utilization: {str(e)}, data: {line}")
                        continue

                logger.info(
                    f"GPU usage for process {pid}:\n"
                    f"├─ Utilization: {gpu_util:.1f}%\n"
                    f"└─ GPU memory: {total_memory:.1f}MB"
                )
                return gpu_util, total_memory, "OK"
            else:
                logger.info(f"Process {pid} is not using the GPU")
                return 0, 0, "Not using GPU"

        except subprocess.TimeoutExpired as e:
            logger.error(f"GPU command timed out: {str(e)}")
            return 0, 0, "Query timeout"
        except subprocess.CalledProcessError as e:
            logger.error(f"GPU command failed: {str(e)}, output: "
                         f"{e.output.decode('utf-8') if e.output else 'None'}")
            return 0, 0, "Command error"
        except Exception as e:
            logger.error(f"Error while querying GPU info: {str(e)}")
            return 0, 0, "Query error"

    except Exception as e:
        logger.error(f"GPU detection failed: {str(e)}")
        return 0, 0, "Detection failed"
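# Illustrative example (not produced by this module) of the CSV lines the parser
# above expects from nvidia-smi; the PID and values are made up:
#
#   --query-gpu=index,utilization.gpu,memory.used        ->  "0, 37, 2048"
#   --query-compute-apps=gpu_uuid,pid,used_memory        ->  "GPU-uuid, 12345, 512"
#
# Note that the PID check is a substring test (`str(pid) in line`), so a PID that
# also appears inside another field could in principle cause a false match.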
def monitor_process(pid, resource_type):
    """Monitor the resource usage of a single process."""
    # Polling interval comes from Django settings, defaulting to 60 seconds
    MONITOR_INTERVAL = getattr(settings, 'MONITOR_INTERVAL', 60)  # seconds

    logger, log_file = setup_logger(pid, resource_type)
    monitor = None

    try:
        process = psutil.Process(pid)
        logger.info(
            f"Starting to monitor process:\n"
            f"├─ Name: {process.name()}\n"
            f"├─ PID: {pid}\n"
            f"├─ Resource type: {resource_type}\n"
            f"├─ Interval: {MONITOR_INTERVAL}s\n"
            f"└─ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        )

        # Pick the model that matches the monitored resource
        ModelClass = {
            'cpu': HighCPUProcess,
            'gpu': HighGPUProcess,
            'memory': HighMemoryProcess
        }.get(resource_type)

        if not ModelClass:
            logger.error(f"Unknown resource type: {resource_type}")
            return

        # Create and save the monitoring record
        monitor = ModelClass.objects.create(
            pid=pid,
            process_name=process.name(),
            log_path=log_file,
            status=1,
            is_active=True
        )
        logger.info("Created new monitoring record")

        # Name the monitoring thread so it can be located and stopped
        current_thread = threading.current_thread()
        current_thread.name = f'monitor_{pid}_{resource_type}'
        current_thread.do_run = True

        while current_thread.do_run:
            try:
                # Stop if the record was deactivated manually
                try:
                    monitor.refresh_from_db()
                    if not monitor.is_active:
                        logger.info("Monitoring stopped manually")
                        break
                except ModelClass.DoesNotExist:
                    logger.error("Monitoring record has been deleted")
                    break

                # Stop if the process has exited
                if not process.is_running():
                    logger.warning(f"Process {pid} has terminated")
                    monitor.status = 0
                    monitor.is_active = False
                    monitor.save()
                    break

                process_status = process.status()
                status_map = {
                    psutil.STATUS_RUNNING: 1,        # running
                    psutil.STATUS_SLEEPING: 1,       # sleeping (normal)
                    psutil.STATUS_DISK_SLEEP: 1,     # disk sleep (normal)
                    psutil.STATUS_STOPPED: 0,        # stopped
                    psutil.STATUS_TRACING_STOP: 0,   # tracing stop
                    psutil.STATUS_ZOMBIE: 0,         # zombie
                    psutil.STATUS_DEAD: 0,           # dead
                    psutil.STATUS_WAKING: 1,         # waking
                    psutil.STATUS_IDLE: 1,           # idle (normal)
                }.get(process_status, 1)             # default to 1 (running)

                # Update the monitoring record
                monitor.status = status_map

                # Log the process status
                logger.info(
                    f"Process status:\n"
                    f"├─ Status code: {status_map}\n"
                    f"├─ Status: {process_status}\n"
                    f"├─ Monitoring: {'active' if monitor.is_active else 'stopped'}\n"
                    f"└─ Uptime: {datetime.now() - datetime.fromtimestamp(process.create_time())}"
                )

                # Collect resource usage
                with process.oneshot():
                    # CPU
                    cpu_percent = process.cpu_percent()
                    cpu_times = process.cpu_times()
                    cpu_num = psutil.cpu_count()
                    cpu_freq = psutil.cpu_freq()

                    # Memory
                    memory_info = process.memory_info()
                    memory_percent = process.memory_percent()
                    memory_maps = len(process.memory_maps())
                    virtual_memory = psutil.virtual_memory()
                    swap_memory = psutil.swap_memory()

                    # GPU
                    try:
                        gpu_usage, gpu_memory, gpu_status = get_process_gpu_usage(pid)
                        gpu_status_map = {
                            "No GPU": "No GPU detected",
                            "Driver not installed": "GPU driver not installed",
                            "Not using GPU": "Process is not using the GPU",
                            "Query timeout": "GPU query timed out",
                            "Command error": "GPU command failed",
                            "Query error": "Error querying GPU info",
                            "Detection failed": "GPU detection failed",
                            "OK": "OK"
                        }
                        gpu_status_text = gpu_status_map.get(gpu_status, "Unknown")
                    except Exception as e:
                        logger.error(f"Failed to get GPU info: {str(e)}")
                        gpu_usage, gpu_memory, gpu_status_text = 0, 0, "Error"

                    # Log the GPU status
                    logger.info(
                        f"GPU info\n"
                        f"├─ Status: {gpu_status_text}\n"
                        f"├─ Utilization: {gpu_usage:.1f}%\n"
                        f"└─ GPU memory: {gpu_memory:.1f}MB"
                    )

                    # IO
                    try:
                        io_counters = process.io_counters()
                        disk_io = psutil.disk_io_counters()
                    except (psutil.AccessDenied, AttributeError):
                        io_counters = None
                        disk_io = None

                    # Network
                    try:
                        net_connections = len(process.connections())
                        net_io = psutil.net_io_counters()
                    except (psutil.AccessDenied, AttributeError):
                        net_connections = 0
                        net_io = None

                    # Other system info
                    num_threads = process.num_threads()
                    num_fds = process.num_fds() if hasattr(process, 'num_fds') else 0
                    ctx_switches = process.num_ctx_switches()

                # Update the monitoring record
                monitor.cpu_usage = cpu_percent
                monitor.memory_usage = memory_info.rss / (1024 * 1024 * 1024)    # GB
                monitor.virtual_memory = memory_info.vms / (1024 * 1024 * 1024)  # GB
                monitor.gpu_usage = gpu_usage
                monitor.gpu_memory = gpu_memory

                # Pre-build the optional IO / network lines so the log call below
                # is a plain string concatenation
                if io_counters:
                    io_lines = (
                        f"│ ├─ Read: {io_counters.read_bytes/1024/1024:.1f}MB ({io_counters.read_count} ops)\n"
                        f"│ └─ Write: {io_counters.write_bytes/1024/1024:.1f}MB ({io_counters.write_count} ops)\n"
                    )
                else:
                    io_lines = (
                        "│ ├─ Read: unavailable\n"
                        "│ └─ Write: unavailable\n"
                    )
                if net_io:
                    net_lines = (
                        f"│ ├─ Sent: {net_io.bytes_sent/1024/1024:.1f}MB\n"
                        f"│ └─ Received: {net_io.bytes_recv/1024/1024:.1f}MB\n"
                    )
                else:
                    net_lines = (
                        "│ ├─ Sent: unavailable\n"
                        "│ └─ Received: unavailable\n"
                    )

                # Log detailed resource usage
                logger.info(
                    f"Resource usage - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}:\n"
                    f"├─ CPU\n"
                    f"│ ├─ Usage: {cpu_percent:.1f}%\n"
                    f"│ ├─ User time: {cpu_times.user:.1f}s\n"
                    f"│ ├─ System time: {cpu_times.system:.1f}s\n"
                    f"│ ├─ Cores: {cpu_num}\n"
                    f"│ ├─ Frequency: {cpu_freq.current:.1f}MHz\n"
                    f"│ └─ Context switches: {ctx_switches.voluntary}/{ctx_switches.involuntary}\n"
                    f"├─ Memory\n"
                    f"│ ├─ RSS: {memory_info.rss/1024/1024:.1f}MB ({memory_percent:.1f}%)\n"
                    f"│ ├─ Virtual: {memory_info.vms/1024/1024:.1f}MB\n"
                    f"│ ├─ Memory maps: {memory_maps}\n"
                    f"│ ├─ System memory used: {virtual_memory.percent:.1f}%\n"
                    f"│ └─ Swap used: {swap_memory.percent:.1f}%\n"
                    f"├─ GPU\n"
                    f"│ ├─ Status: {gpu_status_text}\n"
                    f"│ ├─ Utilization: {gpu_usage:.1f}%\n"
                    f"│ └─ GPU memory: {gpu_memory:.1f}MB\n"
                    f"├─ IO\n"
                    + io_lines +
                    f"├─ Network\n"
                    f"│ ├─ Connections: {net_connections}\n"
                    + net_lines +
                    f"└─ Other\n"
                    f" ├─ Threads: {num_threads}\n"
                    f" ├─ File descriptors: {num_fds}\n"
                    f" └─ Child processes: {len(process.children())}"
                )

                # If the process is no longer active, persist the state and exit
                if status_map == 0:
                    monitor.is_active = False
                    monitor.save()
                    logger.info(f"Process status changed to {process_status}, stopping monitoring")
                    break

                monitor.save()
                time.sleep(MONITOR_INTERVAL)  # wait for the configured interval

            except Exception as e:
                logger.error(f"Monitoring error: {str(e)}")
                logger.exception("Details:")
                time.sleep(5)  # brief pause after an error
                continue

    except Exception as e:
        logger.error(f"Failed to initialise monitoring: {str(e)}")
        logger.exception("Details:")

    finally:
        if monitor:
            try:
                monitor.refresh_from_db()
                monitor.is_active = False
                if not process.is_running():
                    monitor.status = 0
                monitor.save()
            except (ModelClass.DoesNotExist, psutil.NoSuchProcess):
                pass

            logger.info(
                f"Monitoring finished:\n"
                f"├─ Name: {monitor.process_name}\n"
                f"├─ PID: {monitor.pid}\n"
                f"├─ Resource type: {resource_type}\n"
                f"├─ Process status: {'running' if monitor.status == 1 else 'terminated'}\n"
                f"├─ Monitoring: stopped\n"
                f"├─ Started at: {monitor.created_at}\n"
                f"└─ Finished at: {monitor.updated_at}"
            )
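# The thresholds used by get_high_resource_processes() below are hard-coded:
# CPU above 200% (more than two full cores), resident memory above 20 GB, and
# GPU utilization above 50%. If they need to vary per deployment, they could be
# read from Django settings the same way MONITOR_INTERVAL is (a suggestion, not
# something the current code does).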
def get_high_resource_processes():
    """Scan all processes and return those with high resource usage."""
    high_resource_procs = {
        'cpu': [],
        'gpu': [],
        'memory': []
    }

    for proc in psutil.process_iter(['pid', 'name']):
        try:
            process = psutil.Process(proc.info['pid'])

            # CPU usage
            cpu_percent = process.cpu_percent(interval=1.0)
            if cpu_percent > 200:  # more than two full cores
                high_resource_procs['cpu'].append({
                    'pid': proc.info['pid'],
                    'name': proc.info['name'],
                    'cpu_usage': cpu_percent,
                    'cpu_cores': cpu_percent / 100
                })

            # Memory usage
            memory_gb = process.memory_info().rss / (1024 * 1024 * 1024)
            if memory_gb > 20:  # more than 20 GB of RSS
                high_resource_procs['memory'].append({
                    'pid': proc.info['pid'],
                    'name': proc.info['name'],
                    'memory_usage': memory_gb
                })

            # GPU usage; get_process_gpu_usage returns (utilization %, memory MB, status)
            gpu_usage, gpu_memory, gpu_status = get_process_gpu_usage(proc.info['pid'])
            if gpu_usage > 50:  # utilization above 50%
                high_resource_procs['gpu'].append({
                    'pid': proc.info['pid'],
                    'name': proc.info['name'],
                    'gpu_usage': gpu_usage,
                    'gpu_memory': gpu_memory
                })

        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            continue

    return high_resource_procs
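# Minimal usage sketch (assumes Django is already configured, e.g. inside a
# management command or background task; the PID below is purely illustrative):
#
#     import threading
#     t = threading.Thread(
#         target=monitor_process,
#         args=(12345, 'cpu'),
#         name='monitor_12345_cpu',
#         daemon=True,
#     )
#     t.start()
#
#     # One-off scan for processes currently using a lot of CPU / memory / GPU
#     print(get_high_resource_processes())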