automated_task_monitor/monitor/views.py

from django.http import JsonResponse
from .tasks import monitor_process, get_process_gpu_usage
import threading
import psutil
from .models import HighCPUProcess, HighGPUProcess, HighMemoryProcess
import logging
import os
from datetime import datetime
import time
import nvidia_smi
from django.utils import timezone
from django.views.decorators.http import require_http_methods
from django.views.decorators.csrf import csrf_exempt

# 配置日志
LOG_DIR = 'logs/process_monitor'
os.makedirs(LOG_DIR, exist_ok=True)

def setup_logger(pid):
    """为每个进程设置独立的日志记录器"""
    log_file = os.path.join(LOG_DIR, f'process_{pid}_{datetime.now().strftime("%Y%m%d")}.log')
    logger = logging.getLogger(f'process_{pid}')
    logger.setLevel(logging.INFO)
    
    handler = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    
    return logger, log_file

def get_process_by_name(process_name):
    """根据进程名称获取进程PID"""
    pids = []
    for proc in psutil.process_iter(['pid', 'name']):
        try:
            if process_name.lower() in proc.info['name'].lower():
                pids.append(proc.info['pid'])
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    return pids

def get_process_gpu_usage(pid):
    """获取进程的GPU使用情况"""
    try:
        nvidia_smi.nvmlInit()
        deviceCount = nvidia_smi.nvmlDeviceGetCount()
        gpu_usage = 0
        gpu_memory = 0
        
        for i in range(deviceCount):
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
            processes = nvidia_smi.nvmlDeviceGetComputeRunningProcesses(handle)
            for process in processes:
                if process.pid == pid:
                    gpu_memory = process.usedGpuMemory / 1024 / 1024  # 转换为MB
                    gpu_usage = nvidia_smi.nvmlDeviceGetUtilizationRates(handle).gpu
                    return gpu_usage, gpu_memory
        
        return 0, 0
    except:
        return 0, 0
    finally:
        try:
            nvidia_smi.nvmlShutdown()
        except:
            pass

def get_high_resource_processes():
    """获取高资源占用的进程"""
    high_resource_pids = []
    for proc in psutil.process_iter(['pid', 'name']):
        try:
            # 获取进程信息
            process = psutil.Process(proc.info['pid'])
            memory_gb = process.memory_info().rss / (1024 * 1024 * 1024)  # 转换为GB
            
            # 获取GPU使用情况
            gpu_usage, gpu_memory = get_process_gpu_usage(proc.info['pid'])
            
            # 检查是否满足条件（GPU使用率>50%）
            if gpu_usage > 50:
                high_resource_pids.append({
                    'pid': proc.info['pid'],
                    'name': proc.info['name'],
                    'memory_gb': round(memory_gb, 2),
                    'gpu_usage': gpu_usage,
                    'gpu_memory': round(gpu_memory, 2)  # GPU显存使用量(MB)
                })
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            continue
    return high_resource_pids

def auto_detect_high_resource_processes():
    """定期自动检测新的高资源进程"""
    while True:
        try:
            existing_pids = set(ProcessMonitor.objects.filter(is_active=True).values_list('pid', flat=True))
            high_resource_procs = get_high_resource_processes()
            
            for proc in high_resource_procs:
                if proc['pid'] not in existing_pids:
                    logger, log_file = setup_logger(proc['pid'])
                    
                    # 记录到数据库
                    monitor = ProcessMonitor.objects.create(
                        pid=proc['pid'],
                        process_name=proc['name'],
                        cpu_usage=0,
                        memory_usage=proc['memory_gb'],
                        network_usage=0,
                        log_path=log_file
                    )
                    
                    # 启动监控线程
                    threading.Thread(
                        target=monitor_process,
                        args=(proc['pid'], logger)
                    ).start()
                    
                    print(f"发现新的高资源进程: {proc['name']} (PID: {proc['pid']})")
            
            # 每5分钟检测一次
            time.sleep(300)
            
        except Exception as e:
            print(f"自动检测出错: {str(e)}")
            time.sleep(60)  # 出错后等待1分钟再试

def start_monitor(request):
    """开始监控进程"""
    pid = request.GET.get('pid')
    resource_type = request.GET.get('type', 'all')  # cpu, gpu, memory, all
    
    try:
        if pid:
            pid = int(pid)
            process = psutil.Process(pid)
            
            # 检查进程是否已经在监控中
            monitors = {
                'cpu': HighCPUProcess.objects.filter(pid=pid, is_active=True).exists(),
                'gpu': HighGPUProcess.objects.filter(pid=pid, is_active=True).exists(),
                'memory': HighMemoryProcess.objects.filter(pid=pid, is_active=True).exists()
            }
            
            # 根据资源类型启动监控
            results = []
            if resource_type == 'all':
                for rtype, is_monitored in monitors.items():
                    if not is_monitored:
                        thread = threading.Thread(
                            target=monitor_process,
                            args=(pid, rtype),
                            daemon=True
                        )
                        thread.start()
                        results.append(f"已启动{rtype}监控")
                    else:
                        results.append(f"{rtype}监控已在运行")
            else:
                if not monitors.get(resource_type):
                    thread = threading.Thread(
                        target=monitor_process,
                        args=(pid, resource_type),
                        daemon=True
                    )
                    thread.start()
                    results.append(f"已启动{resource_type}监控")
                else:
                    return JsonResponse({"error": f"进程 {pid} 已在{resource_type}监控中"}, status=400)
            
            return JsonResponse({
                "message": f"开始监控进程 {process.name()} (PID: {pid})",
                "results": results
            })
            
        # 自动检测高资源进程
        high_resource_procs = {
            'cpu': [],
            'gpu': [],
            'memory': []
        }
        
        for proc in psutil.process_iter(['pid', 'name']):
            try:
                process = psutil.Process(proc.info['pid'])
                
                # 检查CPU使用率 (>200% 表示使用超过2个核心)
                cpu_percent = process.cpu_percent(interval=1.0)
                if cpu_percent > 200:
                    high_resource_procs['cpu'].append(process)
                
                # 检查内存使用量 (>20GB)
                memory_gb = process.memory_info().rss / (1024 * 1024 * 1024)
                if memory_gb > 20:
                    high_resource_procs['memory'].append(process)
                
                # 检查GPU使用率 (>50%)
                gpu_usage, gpu_memory = get_process_gpu_usage(process.pid)
                if gpu_usage > 50:
                    high_resource_procs['gpu'].append(process)
                    
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue
        
        # 启动监控
        results = {
            'cpu': [],
            'gpu': [],
            'memory': []
        }
        
        for resource_type, processes in high_resource_procs.items():
            for proc in processes:
                if not any([
                    HighCPUProcess.objects.filter(pid=proc.pid, is_active=True).exists(),
                    HighGPUProcess.objects.filter(pid=proc.pid, is_active=True).exists(),
                    HighMemoryProcess.objects.filter(pid=proc.pid, is_active=True).exists()
                ]):
                    thread = threading.Thread(
                        target=monitor_process,
                        args=(proc.pid, resource_type),
                        daemon=True
                    )
                    thread.start()
                    results[resource_type].append({
                        'pid': proc.pid,
                        'name': proc.name()
                    })
        
        return JsonResponse({
            "message": "开始监控高资源进程",
            "processes": results
        })
        
    except Exception as e:
        return JsonResponse({"error": str(e)}, status=500)

@csrf_exempt
@require_http_methods(["POST"])
def stop_monitor(request, pid):
    """停止监控指定进程"""
    resource_type = request.GET.get('type', 'all')  # 从查询参数获取资源类型
    
    try:
        # 根据资源类型选择要停止的监控
        monitors = []
        if resource_type == 'all':
            monitors.extend(HighCPUProcess.objects.filter(pid=pid, is_active=True))
            monitors.extend(HighGPUProcess.objects.filter(pid=pid, is_active=True))
            monitors.extend(HighMemoryProcess.objects.filter(pid=pid, is_active=True))
        elif resource_type == 'cpu':
            monitors.extend(HighCPUProcess.objects.filter(pid=pid, is_active=True))
        elif resource_type == 'gpu':
            monitors.extend(HighGPUProcess.objects.filter(pid=pid, is_active=True))
        elif resource_type == 'memory':
            monitors.extend(HighMemoryProcess.objects.filter(pid=pid, is_active=True))
        else:
            return JsonResponse({
                "error": f"不支持的资源类型: {resource_type}"
            }, status=400)
        
        if not monitors:
            return JsonResponse({
                "error": f"未找到进程 {pid} 的{resource_type}监控记录"
            }, status=404)
        
        # 更新所有监控记录的状态
        for monitor in monitors:
            # 只更新监控状态，不改变进程状态
            monitor.is_active = False
            monitor.save()
            
            # 记录停止操作
            logger = logging.getLogger(f'{monitor.__class__.__name__.lower()}_{pid}')
            logger.info(
                f"手动停止监控:\n"
                f"├─ 进程ID: {pid}\n"
                f"├─ 监控类型: {monitor.__class__.__name__}\n"
                f"├─ 进程状态: {'运行中' if monitor.status == 1 else '已终止'}\n"
                f"├─ 开始时间: {monitor.created_at}\n"
                f"└─ 停止时间: {timezone.now()}"
            )
        
        # 尝试终止相关的监控线程
        import threading
        current_threads = threading.enumerate()
        monitor_threads = [t for t in current_threads if t.name.startswith(f'monitor_{pid}')]
        for thread in monitor_threads:
            try:
                thread.do_run = False
            except:
                pass
        
        return JsonResponse({
            "message": f"已停止对进程 {pid} 的监控",
            "stopped_monitors": len(monitors),
            "process_status": "运行中" if monitors[0].status == 1 else "已终止"
        })
        
    except Exception as e:
        return JsonResponse({
            "error": f"停止监控失败: {str(e)}"
        }, status=500)

def get_process_metrics(request, pid):
    """获取进程监控数据"""
    resource_type = request.GET.get('type', 'all')
    try:
        results = {}
        monitors = {
            'cpu': HighCPUProcess,
            'gpu': HighGPUProcess,
            'memory': HighMemoryProcess
        }
        
        if resource_type == 'all':
            for rtype, model in monitors.items():
                try:
                    monitor = model.objects.get(pid=pid)
                    results[rtype] = {
                        'status': monitor.status,
                        'cpu_usage': monitor.cpu_usage,
                        'memory_usage': monitor.memory_usage,
                        'gpu_usage': monitor.gpu_usage,
                        'gpu_memory': monitor.gpu_memory,
                        'virtual_memory': monitor.virtual_memory
                    }
                    
                    # 添加特定资源类型的指标
                    if rtype == 'cpu':
                        results[rtype]['cpu_cores'] = monitor.cpu_cores
                    elif rtype == 'gpu':
                        results[rtype]['gpu_index'] = monitor.gpu_index
                    elif rtype == 'memory':
                        results[rtype]['swap_usage'] = monitor.swap_usage
                        
                except model.DoesNotExist:
                    continue
        else:
            model = monitors.get(resource_type)
            if model:
                try:
                    monitor = model.objects.get(pid=pid)
                    results[resource_type] = {
                        'status': monitor.status,
                        'cpu_usage': monitor.cpu_usage,
                        'memory_usage': monitor.memory_usage,
                        'gpu_usage': monitor.gpu_usage,
                        'gpu_memory': monitor.gpu_memory,
                        'virtual_memory': monitor.virtual_memory
                    }
                    
                    # 添加特定资源类型的指标
                    if resource_type == 'cpu':
                        results[resource_type]['cpu_cores'] = monitor.cpu_cores
                    elif resource_type == 'gpu':
                        results[resource_type]['gpu_index'] = monitor.gpu_index
                    elif resource_type == 'memory':
                        results[resource_type]['swap_usage'] = monitor.swap_usage
                        
                except model.DoesNotExist:
                    pass
        
        if not results:
            return JsonResponse({"error": f"未找到PID为{pid}的监控记录"}, status=404)
            
        return JsonResponse({
            "pid": pid,
            "metrics": results
        })
        
    except Exception as e:
        return JsonResponse({"error": str(e)}, status=500)

def auto_detect_monitor(request):
    """自动检测并监控高资源进程"""
    try:
        # 清理已停止的监控
        HighCPUProcess.objects.filter(is_active=True, status=0).update(is_active=False)
        HighGPUProcess.objects.filter(is_active=True, status=0).update(is_active=False)
        HighMemoryProcess.objects.filter(is_active=True, status=0).update(is_active=False)
        
        results = {
            'cpu': [],
            'gpu': [],
            'memory': []
        }
        
        # 首先收集所有进程的CPU使用率
        processes = {}
        for proc in psutil.process_iter(['pid', 'name', 'cpu_percent']):
            try:
                processes[proc.info['pid']] = proc.info
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue
        
        # 等待一秒获取CPU使用率变化
        time.sleep(1)
        
        # 检测高资源进程
        for proc in psutil.process_iter(['pid', 'name', 'cpu_percent']):
            try:
                pid = proc.info['pid']
                if pid not in processes:
                    continue
                
                process = psutil.Process(pid)
                cpu_percent = proc.info['cpu_percent']
                
                # 检查CPU使用率 (>200% 表示使用超过2个核心)
                if cpu_percent > 200:
                    if not HighCPUProcess.objects.filter(pid=pid, is_active=True).exists():
                        thread = threading.Thread(
                            target=monitor_process,
                            args=(pid, 'cpu'),
                            daemon=True
                        )
                        thread.start()
                        results['cpu'].append({
                            'pid': pid,
                            'name': process.name(),
                            'cpu_usage': cpu_percent
                        })
                
                # 检查内存使用量 (>20GB)
                memory_gb = process.memory_info().rss / (1024 * 1024 * 1024)
                if memory_gb > 20:
                    if not HighMemoryProcess.objects.filter(pid=pid, is_active=True).exists():
                        thread = threading.Thread(
                            target=monitor_process,
                            args=(pid, 'memory'),
                            daemon=True
                        )
                        thread.start()
                        results['memory'].append({
                            'pid': pid,
                            'name': process.name(),
                            'memory_usage': memory_gb
                        })
                
                # 检查GPU使用率 (>50%)
                gpu_usage, gpu_memory = get_process_gpu_usage(pid)
                if gpu_usage > 50:
                    if not HighGPUProcess.objects.filter(pid=pid, is_active=True).exists():
                        thread = threading.Thread(
                            target=monitor_process,
                            args=(pid, 'gpu'),
                            daemon=True
                        )
                        thread.start()
                        results['gpu'].append({
                            'pid': pid,
                            'name': process.name(),
                            'gpu_usage': gpu_usage,
                            'gpu_memory': gpu_memory
                        })
                        
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue
        
        return JsonResponse({
            "message": "已开始监控检测到的高资源进程",
            "detected_processes": results
        })
        
    except Exception as e:
        return JsonResponse({"error": str(e)}, status=500)