commit ccf1f45fe1dd55cf4b5dd6191785ff6657950cd8 Author: wanjia Date: Tue Feb 18 19:40:58 2025 +0800 初始化项目:进程监控系统 diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..ce51e77 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,11 @@ +echo "*.pyc +__pycache__/ +.env +*.log +logs/ +.idea/ +.vscode/ +*.sqlite3 +db.sqlite3 +venv/ +.venv/" > .gitignore \ No newline at end of file diff --git a/.idea/automated_task_monitor.iml b/.idea/automated_task_monitor.iml new file mode 100644 index 0000000..337c034 --- /dev/null +++ b/.idea/automated_task_monitor.iml @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..dc4857c --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,93 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..6a77fe2 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..a36eaaa --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..df22ea7 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,96 @@ + + + + + + + + + + + { + "customColor": "", + "associatedIndex": 6 +} + + + + { + "keyToString": { + "RunOnceActivity.OpenDjangoStructureViewOnStart": "true", + "RunOnceActivity.OpenProjectViewOnStart": "true", + "RunOnceActivity.ShowReadmeOnStart": "true", + 
"last_opened_file_path": "D:/pythonProject/myproject/.venv/Scripts", + "node.js.detected.package.eslint": "true", + "node.js.detected.package.tslint": "true", + "node.js.selected.package.eslint": "(autodetect)", + "node.js.selected.package.tslint": "(autodetect)", + "nodejs_package_manager_path": "npm", + "vue.rearranger.settings.migration": "true" + } +} + + + + + + + + + + + + + + + + 1739240192225 + + + + + + \ No newline at end of file diff --git a/automated_task_monitor/__init__.py b/automated_task_monitor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/automated_task_monitor/__pycache__/__init__.cpython-310.pyc b/automated_task_monitor/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..d83c3a1 Binary files /dev/null and b/automated_task_monitor/__pycache__/__init__.cpython-310.pyc differ diff --git a/automated_task_monitor/__pycache__/settings.cpython-310.pyc b/automated_task_monitor/__pycache__/settings.cpython-310.pyc new file mode 100644 index 0000000..947c750 Binary files /dev/null and b/automated_task_monitor/__pycache__/settings.cpython-310.pyc differ diff --git a/automated_task_monitor/__pycache__/urls.cpython-310.pyc b/automated_task_monitor/__pycache__/urls.cpython-310.pyc new file mode 100644 index 0000000..3b5d635 Binary files /dev/null and b/automated_task_monitor/__pycache__/urls.cpython-310.pyc differ diff --git a/automated_task_monitor/__pycache__/wsgi.cpython-310.pyc b/automated_task_monitor/__pycache__/wsgi.cpython-310.pyc new file mode 100644 index 0000000..f82423a Binary files /dev/null and b/automated_task_monitor/__pycache__/wsgi.cpython-310.pyc differ diff --git a/automated_task_monitor/asgi.py b/automated_task_monitor/asgi.py new file mode 100644 index 0000000..9e7f58a --- /dev/null +++ b/automated_task_monitor/asgi.py @@ -0,0 +1,16 @@ +""" +ASGI config for automated_task_monitor project. + +It exposes the ASGI callable as a module-level variable named ``application``. 
+ +For more information on this file, see +https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/ +""" + +import os + +from django.core.asgi import get_asgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'automated_task_monitor.settings') + +application = get_asgi_application() diff --git a/automated_task_monitor/settings.py b/automated_task_monitor/settings.py new file mode 100644 index 0000000..0bb765c --- /dev/null +++ b/automated_task_monitor/settings.py @@ -0,0 +1,143 @@ +""" +Django settings for automated_task_monitor project. + +Generated by 'django-admin startproject' using Django 5.1.6. + +For more information on this file, see +https://docs.djangoproject.com/en/5.1/topics/settings/ + +For the full list of settings and their values, see +https://docs.djangoproject.com/en/5.1/ref/settings/ +""" + +from pathlib import Path +import os + +# Build paths inside the project like this: BASE_DIR / 'subdir'. +BASE_DIR = Path(__file__).resolve().parent.parent + + +# Quick-start development settings - unsuitable for production +# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/ + +# SECURITY WARNING: keep the secret key used in production secret! +SECRET_KEY = 'django-insecure-!1v8ca8ows01-*m5(&9)qgk5jc-my^q+4d+0)s_*^n&vne^&w9' + +# SECURITY WARNING: don't run with debug turned on in production! 
+DEBUG = True + +ALLOWED_HOSTS = [ + '127.0.0.1', # 本地开发 + 'localhost', # 本地开发 + '81.69.223.133', # 你的服务器IP +] + +# 监控配置 +MONITOR_INTERVAL = 60 # 监控间隔(秒) + +# Application definition + +INSTALLED_APPS = [ + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.messages', + 'django.contrib.staticfiles', + 'monitor.apps.MonitorConfig', +] + +MIDDLEWARE = [ + 'django.middleware.security.SecurityMiddleware', + 'django.contrib.sessions.middleware.SessionMiddleware', + 'django.middleware.common.CommonMiddleware', + 'django.middleware.csrf.CsrfViewMiddleware', + 'django.contrib.auth.middleware.AuthenticationMiddleware', + 'django.contrib.messages.middleware.MessageMiddleware', + 'django.middleware.clickjacking.XFrameOptionsMiddleware', +] + +ROOT_URLCONF = 'automated_task_monitor.urls' + +TEMPLATES = [ + { + 'BACKEND': 'django.template.backends.django.DjangoTemplates', + 'DIRS': [BASE_DIR / 'templates'] + , + 'APP_DIRS': True, + 'OPTIONS': { + 'context_processors': [ + 'django.template.context_processors.debug', + 'django.template.context_processors.request', + 'django.contrib.auth.context_processors.auth', + 'django.contrib.messages.context_processors.messages', + ], + }, + }, +] + +WSGI_APPLICATION = 'automated_task_monitor.wsgi.application' + + +# Database +# https://docs.djangoproject.com/en/5.1/ref/settings/#databases + +DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.mysql', + 'NAME': 'monitoring_db', + 'USER': 'root', + 'PASSWORD': '123456', + 'HOST': 'localhost', + 'PORT': '3306', + } +} + + +# Password validation +# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators + +AUTH_PASSWORD_VALIDATORS = [ + { + 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', + }, + { + 'NAME': 
'django.contrib.auth.password_validation.CommonPasswordValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', + }, +] + + +# Internationalization +# https://docs.djangoproject.com/en/5.1/topics/i18n/ + +LANGUAGE_CODE = 'en-us' + +TIME_ZONE = 'Asia/Shanghai' + +USE_I18N = True + +USE_TZ = True + + +# Static files (CSS, JavaScript, Images) +# https://docs.djangoproject.com/en/5.1/howto/static-files/ + +STATIC_URL = 'static/' + +# Default primary key field type +# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field + +DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' + +# 修改日志路径为项目根目录下的 logs 文件夹 +LOG_DIR = os.path.join(BASE_DIR, 'logs', 'process_monitor') + +# 确保日志目录存在 +os.makedirs(LOG_DIR, exist_ok=True) + diff --git a/automated_task_monitor/urls.py b/automated_task_monitor/urls.py new file mode 100644 index 0000000..02cc03f --- /dev/null +++ b/automated_task_monitor/urls.py @@ -0,0 +1,23 @@ +""" +URL configuration for automated_task_monitor project. + +The `urlpatterns` list routes URLs to views. For more information please see: + https://docs.djangoproject.com/en/5.1/topics/http/urls/ +Examples: +Function views + 1. Add an import: from my_app import views + 2. Add a URL to urlpatterns: path('', views.home, name='home') +Class-based views + 1. Add an import: from other_app.views import Home + 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') +Including another URLconf + 1. Import the include() function: from django.urls import include, path + 2. 
Add a URL to urlpatterns: path('blog/', include('blog.urls')) +""" +from django.contrib import admin +from django.urls import path, include + +urlpatterns = [ + path('admin/', admin.site.urls), + path('monitor/', include('monitor.urls')), +] diff --git a/automated_task_monitor/wsgi.py b/automated_task_monitor/wsgi.py new file mode 100644 index 0000000..3e1b2aa --- /dev/null +++ b/automated_task_monitor/wsgi.py @@ -0,0 +1,16 @@ +""" +WSGI config for automated_task_monitor project. + +It exposes the WSGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/ +""" + +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'automated_task_monitor.settings') + +application = get_wsgi_application() diff --git a/logs/process_monitor/gpu/process_22572_20250218.log b/logs/process_monitor/gpu/process_22572_20250218.log new file mode 100644 index 0000000..dbbe7ea --- /dev/null +++ b/logs/process_monitor/gpu/process_22572_20250218.log @@ -0,0 +1,67 @@ +2025-02-18 17:35:58,750 - INFO - ʼؽ: + : Lark.exe + ID: 22572 + : gpu + ؼ: 60 + ʼʱ: 2025-02-18 17:35:58 +2025-02-18 17:35:58,755 - INFO - µļؼ¼ +2025-02-18 17:35:58,765 - INFO - ״̬: + ״̬: 1 + ״̬: running + ״̬: Ծ + ʱ: 7:43:55.278315 +2025-02-18 17:35:59,119 - INFO - GPUϢ + ״̬: + ʹ: 19.0% + Դʹ: 1499.0MB +2025-02-18 17:35:59,134 - INFO - Դʹ - 2025-02-18 17:35:59: + CPUϢ + ʹ: 0.0% + û̬ʱ: 4.9s + ں̬ʱ: 5.2s + CPU: 16 + CPUƵ: 4001.0MHz + л: 1470919/0 + ڴϢ + ڴ: 145.8MB (0.9%) + ڴ: 138.7MB + ڴӳ: 99 + ϵͳڴʹ: 79.8% + ռʹ: 10.0% + GPUϢ + ״̬: + ʹ: 19.0% + Դʹ: 1499.0MB + IOϢ + ȡ: 112.8MB (135217) + +2025-02-18 17:36:59,149 - INFO - ״̬: + ״̬: 1 + ״̬: running + ״̬: Ծ + ʱ: 7:44:55.662074 +2025-02-18 17:36:59,272 - INFO - GPUϢ + ״̬: + ʹ: 1.0% + Դʹ: 1486.0MB +2025-02-18 17:36:59,284 - INFO - Դʹ - 2025-02-18 17:36:59: + CPUϢ + ʹ: 0.0% + û̬ʱ: 4.9s + ں̬ʱ: 5.2s + CPU: 16 + CPUƵ: 4001.0MHz + 
л: 1473027/0 + ڴϢ + ڴ: 145.8MB (0.9%) + ڴ: 138.5MB + ڴӳ: 99 + ϵͳڴʹ: 80.1% + ռʹ: 10.0% + GPUϢ + ״̬: + ʹ: 1.0% + Դʹ: 1486.0MB + IOϢ + ȡ: 112.9MB (135263) + diff --git a/logs/process_monitor/memory/process_40128_20250218.log b/logs/process_monitor/memory/process_40128_20250218.log new file mode 100644 index 0000000..176c19a --- /dev/null +++ b/logs/process_monitor/memory/process_40128_20250218.log @@ -0,0 +1,15 @@ +2025-02-18 15:49:36,173 - INFO - ʼmemory pycharm64.exe (PID: 40128) +2025-02-18 15:49:37,621 - INFO - ڴʹ: + - ڴ: 2.05GB + - ڴ: 1.87GB + - ռ: 1.45GB + - CPU: 1.5% + - GPU: 0.0% +2025-02-18 15:50:38,966 - INFO - ڴʹ: + - ڴ: 2.03GB + - ڴ: 1.85GB + - ռ: 1.45GB + - CPU: 0.0% + - GPU: 0.0% +2025-02-18 15:51:38,972 - ERROR - 40128 ֹ +2025-02-18 15:51:38,975 - INFO - ѽ diff --git a/manage.py b/manage.py new file mode 100644 index 0000000..b0affcd --- /dev/null +++ b/manage.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +"""Django's command-line utility for administrative tasks.""" +import os +import sys + + +def main(): + """Run administrative tasks.""" + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'automated_task_monitor.settings') + try: + from django.core.management import execute_from_command_line + except ImportError as exc: + raise ImportError( + "Couldn't import Django. Are you sure it's installed and " + "available on your PYTHONPATH environment variable? Did you " + "forget to activate a virtual environment?" 
+ ) from exc + execute_from_command_line(sys.argv) + + +if __name__ == '__main__': + main() diff --git a/monitor/__init__.py b/monitor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/monitor/__pycache__/__init__.cpython-310.pyc b/monitor/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..ff00a8a Binary files /dev/null and b/monitor/__pycache__/__init__.cpython-310.pyc differ diff --git a/monitor/__pycache__/admin.cpython-310.pyc b/monitor/__pycache__/admin.cpython-310.pyc new file mode 100644 index 0000000..bb0eceb Binary files /dev/null and b/monitor/__pycache__/admin.cpython-310.pyc differ diff --git a/monitor/__pycache__/apps.cpython-310.pyc b/monitor/__pycache__/apps.cpython-310.pyc new file mode 100644 index 0000000..adce87d Binary files /dev/null and b/monitor/__pycache__/apps.cpython-310.pyc differ diff --git a/monitor/__pycache__/models.cpython-310.pyc b/monitor/__pycache__/models.cpython-310.pyc new file mode 100644 index 0000000..407fc0a Binary files /dev/null and b/monitor/__pycache__/models.cpython-310.pyc differ diff --git a/monitor/__pycache__/tasks.cpython-310.pyc b/monitor/__pycache__/tasks.cpython-310.pyc new file mode 100644 index 0000000..b3d4af7 Binary files /dev/null and b/monitor/__pycache__/tasks.cpython-310.pyc differ diff --git a/monitor/__pycache__/urls.cpython-310.pyc b/monitor/__pycache__/urls.cpython-310.pyc new file mode 100644 index 0000000..399de14 Binary files /dev/null and b/monitor/__pycache__/urls.cpython-310.pyc differ diff --git a/monitor/__pycache__/views.cpython-310.pyc b/monitor/__pycache__/views.cpython-310.pyc new file mode 100644 index 0000000..3d3ee21 Binary files /dev/null and b/monitor/__pycache__/views.cpython-310.pyc differ diff --git a/monitor/admin.py b/monitor/admin.py new file mode 100644 index 0000000..8c38f3f --- /dev/null +++ b/monitor/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. 
diff --git a/monitor/apps.py b/monitor/apps.py new file mode 100644 index 0000000..7f7c4a0 --- /dev/null +++ b/monitor/apps.py @@ -0,0 +1,9 @@ +from django.apps import AppConfig + + +class MonitorConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'monitor' + + def ready(self): + pass diff --git a/monitor/migrations/0001_initial.py b/monitor/migrations/0001_initial.py new file mode 100644 index 0000000..e8d07ce --- /dev/null +++ b/monitor/migrations/0001_initial.py @@ -0,0 +1,83 @@ +# Generated by Django 5.1.6 on 2025-02-18 07:09 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='HighCPUProcess', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('pid', models.IntegerField(verbose_name='进程ID')), + ('process_name', models.CharField(max_length=255, verbose_name='进程名称')), + ('log_path', models.CharField(max_length=255, verbose_name='日志路径')), + ('is_active', models.BooleanField(default=True, verbose_name='是否活跃')), + ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')), + ('updated_at', models.DateTimeField(auto_now=True, verbose_name='更新时间')), + ('status', models.IntegerField(default=1, help_text='1:运行中, 0:已停止', verbose_name='进程状态')), + ('cpu_usage', models.FloatField(default=0, verbose_name='CPU使用率(%)')), + ('memory_usage', models.FloatField(default=0, verbose_name='内存使用量(GB)')), + ('gpu_usage', models.FloatField(default=0, verbose_name='GPU使用率(%)')), + ('gpu_memory', models.FloatField(default=0, verbose_name='GPU显存使用量(MB)')), + ('virtual_memory', models.FloatField(default=0, verbose_name='虚拟内存使用量(GB)')), + ('cpu_cores', models.IntegerField(default=0, verbose_name='使用的CPU核心数')), + ], + options={ + 'verbose_name': '高CPU进程', + 'verbose_name_plural': '高CPU进程', + }, + ), + migrations.CreateModel( + name='HighGPUProcess', 
+ fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('pid', models.IntegerField(verbose_name='进程ID')), + ('process_name', models.CharField(max_length=255, verbose_name='进程名称')), + ('log_path', models.CharField(max_length=255, verbose_name='日志路径')), + ('is_active', models.BooleanField(default=True, verbose_name='是否活跃')), + ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')), + ('updated_at', models.DateTimeField(auto_now=True, verbose_name='更新时间')), + ('status', models.IntegerField(default=1, help_text='1:运行中, 0:已停止', verbose_name='进程状态')), + ('cpu_usage', models.FloatField(default=0, verbose_name='CPU使用率(%)')), + ('memory_usage', models.FloatField(default=0, verbose_name='内存使用量(GB)')), + ('gpu_usage', models.FloatField(default=0, verbose_name='GPU使用率(%)')), + ('gpu_memory', models.FloatField(default=0, verbose_name='GPU显存使用量(MB)')), + ('virtual_memory', models.FloatField(default=0, verbose_name='虚拟内存使用量(GB)')), + ('gpu_index', models.IntegerField(default=0, verbose_name='GPU设备索引')), + ], + options={ + 'verbose_name': '高GPU进程', + 'verbose_name_plural': '高GPU进程', + }, + ), + migrations.CreateModel( + name='HighMemoryProcess', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('pid', models.IntegerField(verbose_name='进程ID')), + ('process_name', models.CharField(max_length=255, verbose_name='进程名称')), + ('log_path', models.CharField(max_length=255, verbose_name='日志路径')), + ('is_active', models.BooleanField(default=True, verbose_name='是否活跃')), + ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')), + ('updated_at', models.DateTimeField(auto_now=True, verbose_name='更新时间')), + ('status', models.IntegerField(default=1, help_text='1:运行中, 0:已停止', verbose_name='进程状态')), + ('cpu_usage', models.FloatField(default=0, verbose_name='CPU使用率(%)')), + ('memory_usage', models.FloatField(default=0, 
verbose_name='内存使用量(GB)')), + ('gpu_usage', models.FloatField(default=0, verbose_name='GPU使用率(%)')), + ('gpu_memory', models.FloatField(default=0, verbose_name='GPU显存使用量(MB)')), + ('virtual_memory', models.FloatField(default=0, verbose_name='虚拟内存使用量(GB)')), + ('swap_usage', models.FloatField(default=0, verbose_name='交换空间使用量(GB)')), + ], + options={ + 'verbose_name': '高内存进程', + 'verbose_name_plural': '高内存进程', + }, + ), + ] diff --git a/monitor/migrations/__init__.py b/monitor/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/monitor/migrations/__pycache__/0001_initial.cpython-310.pyc b/monitor/migrations/__pycache__/0001_initial.cpython-310.pyc new file mode 100644 index 0000000..1177709 Binary files /dev/null and b/monitor/migrations/__pycache__/0001_initial.cpython-310.pyc differ diff --git a/monitor/migrations/__pycache__/0002_processmonitor_is_active_processmonitor_log_path_and_more.cpython-310.pyc b/monitor/migrations/__pycache__/0002_processmonitor_is_active_processmonitor_log_path_and_more.cpython-310.pyc new file mode 100644 index 0000000..11e9935 Binary files /dev/null and b/monitor/migrations/__pycache__/0002_processmonitor_is_active_processmonitor_log_path_and_more.cpython-310.pyc differ diff --git a/monitor/migrations/__pycache__/__init__.cpython-310.pyc b/monitor/migrations/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..579cf8a Binary files /dev/null and b/monitor/migrations/__pycache__/__init__.cpython-310.pyc differ diff --git a/monitor/models.py b/monitor/models.py new file mode 100644 index 0000000..178c12a --- /dev/null +++ b/monitor/models.py @@ -0,0 +1,45 @@ +from django.db import models + +class BaseProcessMonitor(models.Model): + """进程监控基类""" + pid = models.IntegerField(verbose_name="进程ID") + process_name = models.CharField(max_length=255, verbose_name="进程名称") + log_path = models.CharField(max_length=255, verbose_name="日志路径") + is_active = models.BooleanField(default=True, 
verbose_name="是否活跃") + created_at = models.DateTimeField(auto_now_add=True, verbose_name="创建时间") + updated_at = models.DateTimeField(auto_now=True, verbose_name="更新时间") + status = models.IntegerField(default=1, verbose_name="进程状态", help_text="1:运行中, 0:已停止") + + # 基本资源信息 + cpu_usage = models.FloatField(default=0, verbose_name="CPU使用率(%)") + memory_usage = models.FloatField(default=0, verbose_name="内存使用量(GB)") + gpu_usage = models.FloatField(default=0, verbose_name="GPU使用率(%)") + gpu_memory = models.FloatField(default=0, verbose_name="GPU显存使用量(MB)") + virtual_memory = models.FloatField(default=0, verbose_name="虚拟内存使用量(GB)") + + class Meta: + abstract = True + +class HighCPUProcess(BaseProcessMonitor): + """高CPU使用进程监控""" + cpu_cores = models.IntegerField(default=0, verbose_name="使用的CPU核心数") + + class Meta: + verbose_name = "高CPU进程" + verbose_name_plural = "高CPU进程" + +class HighGPUProcess(BaseProcessMonitor): + """高GPU使用进程监控""" + gpu_index = models.IntegerField(default=0, verbose_name="GPU设备索引") + + class Meta: + verbose_name = "高GPU进程" + verbose_name_plural = "高GPU进程" + +class HighMemoryProcess(BaseProcessMonitor): + """高内存使用进程监控""" + swap_usage = models.FloatField(default=0, verbose_name="交换空间使用量(GB)") + + class Meta: + verbose_name = "高内存进程" + verbose_name_plural = "高内存进程" diff --git a/monitor/tasks.py b/monitor/tasks.py new file mode 100644 index 0000000..6d86101 --- /dev/null +++ b/monitor/tasks.py @@ -0,0 +1,406 @@ +import psutil +import time +from .models import HighCPUProcess, HighGPUProcess, HighMemoryProcess +import GPUtil # 需要安装 gputil +from django.utils import timezone +import os +import logging +from datetime import datetime +from django.db import models +from django.conf import settings + +def setup_logger(pid, resource_type): + """为每个进程设置独立的日志记录器""" + from django.conf import settings + import os + + # 构建完整的日志路径 + log_dir = os.path.join(settings.BASE_DIR, 'logs', 'process_monitor', resource_type) + os.makedirs(log_dir, exist_ok=True) + + log_file = 
os.path.abspath(os.path.join(log_dir, f'process_{pid}_{datetime.now().strftime("%Y%m%d")}.log')) + + logger = logging.getLogger(f'{resource_type}_process_{pid}') + logger.setLevel(logging.INFO) + + if not logger.handlers: + # 指定 utf-8 编码 + handler = logging.FileHandler(log_file, encoding='utf-8') + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + + return logger, log_file + +def get_process_gpu_usage(pid): + """获取进程的GPU使用情况""" + try: + import subprocess + import logging + + logger = logging.getLogger('gpu_check') + + # 检查 NVIDIA GPU + try: + # 先检查 nvidia-smi 是否可用 + gpu_info = subprocess.check_output(['which', 'nvidia-smi']).decode('utf-8').strip() + logger.info(f"找到 nvidia-smi: {gpu_info}") + + # 获取 GPU 列表 + gpu_list = subprocess.check_output(['nvidia-smi', '-L']).decode('utf-8').strip() + logger.info(f"检测到的GPU:\n{gpu_list}") + + if not gpu_list: + logger.warning("未检测到NVIDIA GPU") + return 0, 0, "无GPU" + + except (subprocess.SubprocessError, FileNotFoundError) as e: + logger.error(f"nvidia-smi 命令错误: {str(e)}") + return 0, 0, "驱动未安装" + + try: + # 获取所有 GPU 的基本信息 + cmd = ['nvidia-smi', '--format=csv,noheader,nounits', '--query-gpu=index,utilization.gpu,memory.used'] + gpu_output = subprocess.check_output(cmd, timeout=5).decode('utf-8').strip() + logger.info(f"GPU基本信息:\n{gpu_output}") + + # 获取进程的 GPU 使用情况 + cmd_process = ['nvidia-smi', '--format=csv,noheader,nounits', '--query-compute-apps=gpu_uuid,pid,used_memory'] + process_output = subprocess.check_output(cmd_process, timeout=5).decode('utf-8').strip() + logger.info(f"GPU进程信息:\n{process_output}") + + # 如果没有任何进程使用 GPU + if not process_output: + logger.info(f"进程 {pid} 未使用GPU") + return 0, 0, "未使用GPU" + + # 解析进程的 GPU 使用情况 + total_memory = 0 + gpu_util = 0 + + # 从进程输出中查找指定 PID + for line in process_output.split('\n'): + if str(pid) in line: + try: + _, _, memory = line.split(',') + total_memory += float(memory.strip()) + except 
(ValueError, IndexError) as e: + logger.error(f"解析GPU进程数据错误: {str(e)}, 数据: {line}") + continue + + # 如果找到了进程的 GPU 使用 + if total_memory > 0: + # 获取最大的 GPU 使用率 + for line in gpu_output.split('\n'): + try: + _, util, _ = line.split(',') + gpu_util = max(gpu_util, float(util.strip())) + except (ValueError, IndexError) as e: + logger.error(f"解析GPU使用率错误: {str(e)}, 数据: {line}") + continue + + logger.info( + f"进程 {pid} GPU使用情况:\n" + f"├─ GPU使用率: {gpu_util:.1f}%\n" + f"└─ 显存使用: {total_memory:.1f}MB" + ) + return gpu_util, total_memory, "正常" + else: + logger.info(f"进程 {pid} 未使用GPU") + return 0, 0, "未使用GPU" + + except subprocess.TimeoutExpired as e: + logger.error(f"GPU命令超时: {str(e)}") + return 0, 0, "获取超时" + except subprocess.CalledProcessError as e: + logger.error(f"GPU命令执行错误: {str(e)}, 输出: {e.output.decode('utf-8') if e.output else 'None'}") + return 0, 0, "命令错误" + except Exception as e: + logger.error(f"获取GPU信息时发生错误: {str(e)}") + return 0, 0, "获取错误" + + except Exception as e: + logger.error(f"GPU检测失败: {str(e)}") + return 0, 0, "检测失败" + +def monitor_process(pid, resource_type): + """监控进程资源使用情况""" + # 从 Django 设置中获取监控间隔,默认为 60 秒 + from django.conf import settings + MONITOR_INTERVAL = getattr(settings, 'MONITOR_INTERVAL', 60) # 单位:秒 + + logger, log_file = setup_logger(pid, resource_type) + monitor = None + + try: + process = psutil.Process(pid) + logger.info( + f"开始监控进程:\n" + f"├─ 进程名称: {process.name()}\n" + f"├─ 进程ID: {pid}\n" + f"├─ 监控类型: {resource_type}\n" + f"├─ 监控间隔: {MONITOR_INTERVAL}秒\n" + f"└─ 开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" + ) + + # 根据资源类型选择模型 + ModelClass = { + 'cpu': HighCPUProcess, + 'gpu': HighGPUProcess, + 'memory': HighMemoryProcess + }.get(resource_type) + + if not ModelClass: + logger.error(f"未知的资源类型: {resource_type}") + return + + # 创建并保存监控记录 + monitor = ModelClass.objects.create( + pid=pid, + process_name=process.name(), + log_path=log_file, + status=1, + is_active=True + ) + logger.info("创建新的监控记录") + + # 设置线程名称 + import threading + 
current_thread = threading.current_thread() + current_thread.name = f'monitor_{pid}_{resource_type}' + current_thread.do_run = True + + while current_thread.do_run: + try: + # 检查监控记录是否被手动停止 + try: + monitor.refresh_from_db() + if not monitor.is_active: + logger.info("监控被手动停止") + break + except ModelClass.DoesNotExist: + logger.error("监控记录已被删除") + break + + # 检查进程状态 + if not process.is_running(): + logger.warning(f"进程 {pid} 已终止") + monitor.status = 0 + monitor.is_active = False + monitor.save() + break + + process_status = process.status() + status_map = { + psutil.STATUS_RUNNING: 1, # 运行中 + psutil.STATUS_SLEEPING: 1, # 休眠中(正常) + psutil.STATUS_DISK_SLEEP: 1, # 磁盘休眠(正常) + psutil.STATUS_STOPPED: 0, # 已停止 + psutil.STATUS_TRACING_STOP: 0, # 跟踪停止 + psutil.STATUS_ZOMBIE: 0, # 僵尸进程 + psutil.STATUS_DEAD: 0, # 已死亡 + psutil.STATUS_WAKING: 1, # 唤醒中 + psutil.STATUS_IDLE: 1, # 空闲(正常) + }.get(process_status, 1) # 默认为1(运行中) + + # 更新监控记录 + monitor.status = status_map + + # 记录进程状态 + logger.info( + f"进程状态:\n" + f"├─ 状态码: {status_map}\n" + f"├─ 状态描述: {process_status}\n" + f"├─ 监控状态: {'活跃' if monitor.is_active else '已停止'}\n" + f"└─ 运行时长: {datetime.now() - datetime.fromtimestamp(process.create_time())}" + ) + + # 获取资源使用情况 + with process.oneshot(): + # CPU信息 + cpu_percent = process.cpu_percent() + cpu_times = process.cpu_times() + cpu_num = psutil.cpu_count() + cpu_freq = psutil.cpu_freq() + + # 内存信息 + memory_info = process.memory_info() + memory_percent = process.memory_percent() + memory_maps = len(process.memory_maps()) + virtual_memory = psutil.virtual_memory() + swap_memory = psutil.swap_memory() + + # GPU信息 + try: + gpu_usage, gpu_memory, gpu_status = get_process_gpu_usage(pid) + gpu_status_map = { + "无GPU": "未检测到GPU", + "驱动未安装": "GPU驱动未安装", + "未使用GPU": "进程未使用GPU", + "获取超时": "GPU信息获取超时", + "命令错误": "GPU命令执行错误", + "获取错误": "GPU信息获取错误", + "检测失败": "GPU检测失败", + "正常": "正常" + } + gpu_status_text = gpu_status_map.get(gpu_status, "未知状态") + except Exception as e: + logger.error(f"获取GPU信息失败: 
{str(e)}") + gpu_usage, gpu_memory, gpu_status_text = 0, 0, "异常" + + # 在日志中记录GPU状态 + logger.info( + f"GPU信息\n" + f"├─ 状态: {gpu_status_text}\n" + f"├─ 使用率: {gpu_usage:.1f}%\n" + f"└─ 显存使用: {gpu_memory:.1f}MB" + ) + + # IO信息 + try: + io_counters = process.io_counters() + disk_io = psutil.disk_io_counters() + except (psutil.AccessDenied, AttributeError): + io_counters = None + disk_io = None + + # 网络信息 + try: + net_connections = len(process.connections()) + net_io = psutil.net_io_counters() + except (psutil.AccessDenied, AttributeError): + net_connections = 0 + net_io = None + + # 其他系统信息 + num_threads = process.num_threads() + num_fds = process.num_fds() if hasattr(process, 'num_fds') else 0 + ctx_switches = process.num_ctx_switches() + + # 更新监控记录 + monitor.cpu_usage = cpu_percent + monitor.memory_usage = memory_info.rss / (1024 * 1024 * 1024) # GB + monitor.virtual_memory = memory_info.vms / (1024 * 1024 * 1024) # GB + monitor.gpu_usage = gpu_usage + monitor.gpu_memory = gpu_memory + + # 记录详细的资源使用情况 + logger.info( + f"资源使用情况 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}:\n" + f"├─ CPU信息\n" + f"│ ├─ 使用率: {cpu_percent:.1f}%\n" + f"│ ├─ 用户态时间: {cpu_times.user:.1f}s\n" + f"│ ├─ 内核态时间: {cpu_times.system:.1f}s\n" + f"│ ├─ CPU核心数: {cpu_num}\n" + f"│ ├─ CPU频率: {cpu_freq.current:.1f}MHz\n" + f"│ └─ 上下文切换: {ctx_switches.voluntary}/{ctx_switches.involuntary}\n" + f"├─ 内存信息\n" + f"│ ├─ 物理内存: {memory_info.rss/1024/1024:.1f}MB ({memory_percent:.1f}%)\n" + f"│ ├─ 虚拟内存: {memory_info.vms/1024/1024:.1f}MB\n" + f"│ ├─ 内存映射: {memory_maps}个\n" + f"│ ├─ 系统内存使用: {virtual_memory.percent:.1f}%\n" + f"│ └─ 交换空间使用: {swap_memory.percent:.1f}%\n" + f"├─ GPU信息\n" + f"│ ├─ 状态: {gpu_status_text}\n" + f"│ ├─ 使用率: {gpu_usage:.1f}%\n" + f"│ └─ 显存使用: {gpu_memory:.1f}MB\n" + f"├─ IO信息\n" + f"│ ├─ 读取: {io_counters.read_bytes/1024/1024:.1f}MB ({io_counters.read_count}次)\n" if io_counters else "│ ├─ 读取: 无法获取\n" + f"│ └─ 写入: {io_counters.write_bytes/1024/1024:.1f}MB ({io_counters.write_count}次)\n" if 
io_counters else "│ └─ 写入: 无法获取\n" + f"├─ 网络信息\n" + f"│ ├─ 连接数: {net_connections}\n" + f"│ ├─ 发送: {net_io.bytes_sent/1024/1024:.1f}MB\n" if net_io else "│ ├─ 发送: 无法获取\n" + f"│ └─ 接收: {net_io.bytes_recv/1024/1024:.1f}MB\n" if net_io else "│ └─ 接收: 无法获取\n" + f"└─ 其他信息\n" + f" ├─ 线程数: {num_threads}\n" + f" ├─ 文件描述符: {num_fds}\n" + f" └─ 子进程数: {len(process.children())}" + ) + + # 如果进程已经变为非活跃状态,更新状态并退出 + if status_map == 0: + monitor.is_active = False + monitor.save() + logger.info(f"进程状态变为 {process_status},停止监控") + break + + monitor.save() + time.sleep(MONITOR_INTERVAL) # 使用配置的间隔时间 + + except Exception as e: + logger.error(f"监控出错: {str(e)}") + logger.exception("详细错误信息:") + time.sleep(5) # 错误后短暂等待 + continue + + except Exception as e: + logger.error(f"监控初始化失败: {str(e)}") + logger.exception("详细错误信息:") + finally: + if monitor: + try: + monitor.refresh_from_db() + monitor.is_active = False + if not process.is_running(): + monitor.status = 0 + monitor.save() + except (ModelClass.DoesNotExist, psutil.NoSuchProcess): + pass + + logger.info( + f"监控结束:\n" + f"├─ 进程名称: {monitor.process_name}\n" + f"├─ 进程ID: {monitor.pid}\n" + f"├─ 监控类型: {resource_type}\n" + f"├─ 进程状态: {'运行中' if monitor.status == 1 else '已终止'}\n" + f"├─ 监控状态: 已停止\n" + f"├─ 开始时间: {monitor.created_at}\n" + f"└─ 结束时间: {monitor.updated_at}" + ) + +def get_high_resource_processes(): + """获取高资源占用的进程""" + high_resource_procs = { + 'cpu': [], + 'gpu': [], + 'memory': [] + } + + for proc in psutil.process_iter(['pid', 'name']): + try: + process = psutil.Process(proc.info['pid']) + + # 检查CPU使用率 + cpu_percent = process.cpu_percent(interval=1.0) + if cpu_percent > 200: # 使用超过2个核心 + high_resource_procs['cpu'].append({ + 'pid': proc.info['pid'], + 'name': proc.info['name'], + 'cpu_usage': cpu_percent, + 'cpu_cores': cpu_percent / 100 + }) + + # 检查内存使用量 + memory_gb = process.memory_info().rss / (1024 * 1024 * 1024) + if memory_gb > 20: # 使用超过20GB内存 + high_resource_procs['memory'].append({ + 'pid': proc.info['pid'], + 'name': 
def setup_logger(pid):
    """Return the file logger dedicated to one process and its log path.

    The logger is named ``process_<pid>`` and writes INFO-and-above records
    to ``<LOG_DIR>/process_<pid>_<YYYYMMDD>.log``.

    ``logging.getLogger`` hands back the same logger object for the same
    name, so attaching a fresh ``FileHandler`` on every call (as the previous
    version did) duplicated each log line and leaked one open file per call.
    The handler is now attached only when the logger does not already write
    to the target file; file handlers pointing at a different file (for
    example yesterday's log) are closed and detached.

    Args:
        pid: Process ID used in both the logger name and the file name.

    Returns:
        tuple: ``(logger, log_file)``.
    """
    log_file = os.path.join(
        LOG_DIR,
        f'process_{pid}_{datetime.now().strftime("%Y%m%d")}.log'
    )
    logger = logging.getLogger(f'process_{pid}')
    logger.setLevel(logging.INFO)

    # FileHandler stores the absolute path in ``baseFilename``.
    target = os.path.abspath(log_file)
    already_attached = False
    for existing in list(logger.handlers):
        if not isinstance(existing, logging.FileHandler):
            continue
        if existing.baseFilename == target:
            already_attached = True
        else:
            # Stale handler for another file: release its descriptor.
            logger.removeHandler(existing)
            existing.close()

    if not already_attached:
        # Explicit UTF-8: the log messages contain non-ASCII text.
        handler = logging.FileHandler(log_file, encoding='utf-8')
        handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(handler)

    return logger, log_file
def get_process_by_name(process_name):
    """Return the PIDs of running processes whose name contains a string.

    The match is a case-insensitive substring test.

    Args:
        process_name: Text to look for inside each process name.

    Returns:
        list: PIDs of the matching processes (possibly empty).
    """
    needle = process_name.lower()
    return [
        proc.info['pid']
        for proc in psutil.process_iter(['pid', 'name'])
        # ``name`` is None when psutil was not allowed to read it; treat that
        # as an empty name instead of crashing on ``None.lower()``.
        if needle in (proc.info.get('name') or '').lower()
    ]


def get_process_gpu_usage(pid):
    """Return ``(gpu_usage, gpu_memory_mb)`` for one process via NVML.

    Every GPU is searched for a compute process with the given PID. For the
    first match the function returns the utilisation rate of that *device*
    (NVML reports utilisation per GPU, not per process) and the GPU memory
    used by the process, in MB.

    This is a best-effort probe: if NVML cannot be initialised or any query
    fails, ``(0, 0)` is returned rather than raising.

    Note: this definition shadows the ``get_process_gpu_usage`` imported from
    ``.tasks`` at the top of the module.

    Args:
        pid: Process ID to look up.

    Returns:
        tuple: ``(gpu_usage, gpu_memory)``; ``(0, 0)`` when the process is not
        using a GPU or the information is unavailable.
    """
    log = logging.getLogger(__name__)

    try:
        nvidia_smi.nvmlInit()
    except Exception:
        # No driver / no GPU / bindings unavailable.
        log.debug("NVML initialisation failed", exc_info=True)
        return 0, 0

    try:
        for index in range(nvidia_smi.nvmlDeviceGetCount()):
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(index)
            for gpu_proc in nvidia_smi.nvmlDeviceGetComputeRunningProcesses(handle):
                if gpu_proc.pid != pid:
                    continue
                # ``usedGpuMemory`` can be None when the driver does not
                # expose it; report 0 MB instead of discarding the
                # utilisation reading as well.
                gpu_memory = (gpu_proc.usedGpuMemory or 0) / 1024 / 1024  # MB
                gpu_usage = nvidia_smi.nvmlDeviceGetUtilizationRates(handle).gpu
                return gpu_usage, gpu_memory
        return 0, 0
    except Exception:
        log.debug("NVML query failed for pid %s", pid, exc_info=True)
        return 0, 0
    finally:
        try:
            nvidia_smi.nvmlShutdown()
        except Exception:
            log.debug("NVML shutdown failed", exc_info=True)


def get_high_resource_processes():
    """Return the processes whose GPU usage exceeds 50 %.

    Returns:
        list: One dict per matching process with the keys ``pid``, ``name``,
        ``memory_gb`` (resident memory, rounded to 2 decimals),
        ``gpu_usage`` and ``gpu_memory`` (MB, rounded to 2 decimals).
    """
    high_resource_pids = []
    for proc in psutil.process_iter(['pid', 'name']):
        pid = proc.info['pid']
        try:
            # ``proc`` already is a psutil.Process; no need to build another.
            memory_gb = proc.memory_info().rss / (1024 * 1024 * 1024)
            gpu_usage, gpu_memory = get_process_gpu_usage(pid)
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            continue

        if gpu_usage > 50:
            high_resource_pids.append({
                'pid': pid,
                'name': proc.info['name'],
                'memory_gb': round(memory_gb, 2),
                'gpu_usage': gpu_usage,
                'gpu_memory': round(gpu_memory, 2)  # MB
            })
    return high_resource_pids
def auto_detect_high_resource_processes():
    """Poll forever for new GPU-heavy processes and start monitoring them.

    Meant to run in a background thread: it never returns. Each pass asks
    ``get_high_resource_processes`` (which reports processes above 50 % GPU
    usage) for candidates and starts a ``'gpu'`` monitor thread for every
    PID that has no active ``HighGPUProcess`` record. It then sleeps five
    minutes, or one minute after an error.

    NOTE(review): the previous version queried and created ``ProcessMonitor``
    rows, but this module imports no such model, so every pass failed with
    NameError. It also passed a logger as the second argument of
    ``monitor_process``, where every other caller passes the resource type.
    Record creation is now left to ``monitor_process``, as in the views
    below -- confirm that it persists its own record.
    """
    while True:
        try:
            existing_pids = set(
                HighGPUProcess.objects.filter(is_active=True)
                .values_list('pid', flat=True)
            )

            for proc in get_high_resource_processes():
                if proc['pid'] in existing_pids:
                    continue

                threading.Thread(
                    target=monitor_process,
                    args=(proc['pid'], 'gpu'),
                    daemon=True  # consistent with the other monitor threads
                ).start()

                print(f"发现新的高资源进程: {proc['name']} (PID: {proc['pid']})")

            # Scan every five minutes.
            time.sleep(300)

        except Exception as e:
            print(f"自动检测出错: {str(e)}")
            time.sleep(60)  # back off for one minute after a failure


def _scan_high_resource_processes():
    """Return ``{'cpu'|'gpu'|'memory': [(pid, name), ...]}`` for heavy processes.

    CPU usage is measured over one shared one-second window for all
    processes instead of one second per process.
    """
    found = {'cpu': [], 'gpu': [], 'memory': []}
    unreadable = (psutil.NoSuchProcess, psutil.AccessDenied)

    # Prime the per-process CPU counters.
    candidates = []
    for proc in psutil.process_iter(['pid', 'name']):
        try:
            proc.cpu_percent(interval=None)
        except unreadable:
            continue
        candidates.append(proc)

    time.sleep(1.0)

    for proc in candidates:
        entry = (proc.info['pid'], proc.info['name'])
        try:
            # > 200 % means more than two cores.
            if proc.cpu_percent(interval=None) > 200:
                found['cpu'].append(entry)

            # > 20 GB resident memory.
            if proc.memory_info().rss / (1024 * 1024 * 1024) > 20:
                found['memory'].append(entry)

            # > 50 % GPU usage.
            gpu_usage, _gpu_memory = get_process_gpu_usage(proc.info['pid'])
            if gpu_usage > 50:
                found['gpu'].append(entry)
        except unreadable:
            continue

    return found


def start_monitor(request):
    """Start monitoring one process, or every detected high-resource process.

    Query parameters:
        pid:  Optional process ID. When given, only that process is handled.
        type: ``cpu``, ``gpu``, ``memory`` or ``all`` (default). Only used
              together with ``pid``.

    With ``pid``: starts a ``monitor_process`` thread for each requested
    resource type that has no active record yet.
    Responses: 200 with ``message``/``results``; 400 for an invalid PID, an
    unsupported type, or a single type that is already monitored; 404 when
    the process does not exist.

    Without ``pid``: scans all processes (CPU > 200 %, memory > 20 GB,
    GPU > 50 %) and starts a monitor per process and category unless an
    active record of that category already exists. Responds 200 with
    ``message``/``processes``.

    Any unexpected failure yields 500 with the error text.
    """
    raw_pid = request.GET.get('pid')
    resource_type = request.GET.get('type', 'all')  # cpu, gpu, memory, all

    models = {
        'cpu': HighCPUProcess,
        'gpu': HighGPUProcess,
        'memory': HighMemoryProcess
    }

    def launch(target_pid, rtype):
        # One daemon thread per (process, resource type).
        threading.Thread(
            target=monitor_process,
            args=(target_pid, rtype),
            daemon=True
        ).start()

    try:
        if raw_pid:
            # Reject unknown types up front; previously an unknown type was
            # treated as "not monitored" and a monitor was started for it.
            if resource_type != 'all' and resource_type not in models:
                return JsonResponse(
                    {"error": f"不支持的资源类型: {resource_type}"}, status=400
                )

            try:
                pid = int(raw_pid)
                process = psutil.Process(pid)
                # Read the name now so a vanished process is reported before
                # any thread has been started.
                process_name = process.name()
            except ValueError:
                return JsonResponse({"error": f"无效的PID: {raw_pid}"}, status=400)
            except psutil.NoSuchProcess:
                return JsonResponse({"error": f"进程 {raw_pid} 不存在"}, status=404)

            # Which resource types are already being monitored for this PID.
            monitors = {
                rtype: model.objects.filter(pid=pid, is_active=True).exists()
                for rtype, model in models.items()
            }

            results = []
            if resource_type == 'all':
                for rtype, is_monitored in monitors.items():
                    if is_monitored:
                        results.append(f"{rtype}监控已在运行")
                    else:
                        launch(pid, rtype)
                        results.append(f"已启动{rtype}监控")
            else:
                if monitors[resource_type]:
                    return JsonResponse(
                        {"error": f"进程 {pid} 已在{resource_type}监控中"},
                        status=400
                    )
                launch(pid, resource_type)
                results.append(f"已启动{resource_type}监控")

            return JsonResponse({
                "message": f"开始监控进程 {process_name} (PID: {pid})",
                "results": results
            })

        # No PID given: detect heavy processes automatically.
        results = {
            'cpu': [],
            'gpu': [],
            'memory': []
        }

        for rtype, procs in _scan_high_resource_processes().items():
            for proc_pid, proc_name in procs:
                # Check the matching category only, as the single-PID branch
                # does, so a process already monitored for CPU can still get
                # a memory or GPU monitor.
                if models[rtype].objects.filter(pid=proc_pid, is_active=True).exists():
                    continue
                launch(proc_pid, rtype)
                results[rtype].append({
                    'pid': proc_pid,
                    'name': proc_name
                })

        return JsonResponse({
            "message": "开始监控高资源进程",
            "processes": results
        })

    except Exception as e:
        return JsonResponse({"error": str(e)}, status=500)
@csrf_exempt
@require_http_methods(["POST"])
def stop_monitor(request, pid):
    """Stop monitoring a process (POST only, CSRF-exempt).

    Marks the active monitor records of the given PID as inactive. The
    process itself and its recorded ``status`` are left untouched.

    Args:
        request: The HTTP request. The query parameter ``type`` selects
            ``cpu``, ``gpu``, ``memory`` or ``all`` (default).
        pid: Process ID taken from the URL.

    Returns:
        JsonResponse: 200 with ``message``, ``stopped_monitors`` and
        ``process_status``; 400 for an unsupported type; 404 when no active
        record exists; 500 with the error text on unexpected failure.
    """
    resource_type = request.GET.get('type', 'all')

    models_by_type = {
        'cpu': HighCPUProcess,
        'gpu': HighGPUProcess,
        'memory': HighMemoryProcess
    }

    try:
        # Pick the record types to stop.
        if resource_type == 'all':
            selected_models = list(models_by_type.values())
        elif resource_type in models_by_type:
            selected_models = [models_by_type[resource_type]]
        else:
            return JsonResponse({
                "error": f"不支持的资源类型: {resource_type}"
            }, status=400)

        monitors = []
        for model in selected_models:
            monitors.extend(model.objects.filter(pid=pid, is_active=True))

        if not monitors:
            return JsonResponse({
                "error": f"未找到进程 {pid} 的{resource_type}监控记录"
            }, status=404)

        for monitor in monitors:
            # Only the monitoring flag changes; the process status is kept.
            monitor.is_active = False
            monitor.save()

            # Write the manual stop to the logger named after the record type.
            logger = logging.getLogger(f'{monitor.__class__.__name__.lower()}_{pid}')
            logger.info(
                f"手动停止监控:\n"
                f"├─ 进程ID: {pid}\n"
                f"├─ 监控类型: {monitor.__class__.__name__}\n"
                f"├─ 进程状态: {'运行中' if monitor.status == 1 else '已终止'}\n"
                f"├─ 开始时间: {monitor.created_at}\n"
                f"└─ 停止时间: {timezone.now()}"
            )

        # Cooperative stop flag for threads named ``monitor_<pid>...``.
        # NOTE(review): the threads started in this module are created
        # without a name, so this loop matches nothing unless
        # monitor_process renames its own thread and polls ``do_run`` --
        # confirm, or rely on the ``is_active`` flag saved above.
        for thread in threading.enumerate():
            if thread.name.startswith(f'monitor_{pid}'):
                thread.do_run = False

        return JsonResponse({
            "message": f"已停止对进程 {pid} 的监控",
            "stopped_monitors": len(monitors),
            "process_status": "运行中" if monitors[0].status == 1 else "已终止"
        })

    except Exception as e:
        return JsonResponse({
            "error": f"停止监控失败: {str(e)}"
        }, status=500)
def get_process_metrics(request, pid):
    """Return the stored monitoring metrics of a process.

    Args:
        request: The HTTP request. The query parameter ``type`` selects
            ``cpu``, ``gpu``, ``memory`` or ``all`` (default).
        pid: Process ID taken from the URL.

    Returns:
        JsonResponse: 200 with ``{"pid": ..., "metrics": {<type>: {...}}}``.
        Each entry holds the common fields (``status``, ``cpu_usage``,
        ``memory_usage``, ``gpu_usage``, ``gpu_memory``, ``virtual_memory``)
        plus one type-specific field: ``cpu_cores``, ``gpu_index`` or
        ``swap_usage``. 404 when nothing was found (including an unknown
        ``type``); 500 with the error text on unexpected failure.
    """
    resource_type = request.GET.get('type', 'all')

    monitors = {
        'cpu': HighCPUProcess,
        'gpu': HighGPUProcess,
        'memory': HighMemoryProcess
    }
    # The extra attribute reported for each record type.
    extra_field = {
        'cpu': 'cpu_cores',
        'gpu': 'gpu_index',
        'memory': 'swap_usage'
    }
    common_fields = (
        'status',
        'cpu_usage',
        'memory_usage',
        'gpu_usage',
        'gpu_memory',
        'virtual_memory',
    )

    try:
        if resource_type == 'all':
            wanted = list(monitors)
        elif resource_type in monitors:
            wanted = [resource_type]
        else:
            wanted = []  # unknown type: falls through to the 404 below

        results = {}
        for rtype in wanted:
            # ``.get(pid=pid)`` raised MultipleObjectsReturned (surfacing as
            # a 500) once a PID had more than one record; use the most
            # recently created row instead.
            monitor = monitors[rtype].objects.filter(pid=pid).order_by('-pk').first()
            if monitor is None:
                continue

            metrics = {name: getattr(monitor, name) for name in common_fields}
            metrics[extra_field[rtype]] = getattr(monitor, extra_field[rtype])
            results[rtype] = metrics

        if not results:
            return JsonResponse({"error": f"未找到PID为{pid}的监控记录"}, status=404)

        return JsonResponse({
            "pid": pid,
            "metrics": results
        })

    except Exception as e:
        return JsonResponse({"error": str(e)}, status=500)
def auto_detect_monitor(request):
    """Detect high-resource processes and start monitoring them.

    Steps:
      1. Deactivate records that are still flagged active although their
         process status is 0.
      2. Sample CPU usage of all processes over a one-second window.
      3. For each process, start a ``monitor_process`` thread per category
         whose threshold is exceeded (CPU > 200 %, memory > 20 GB,
         GPU > 50 %), unless an active record of that category exists.

    Returns:
        JsonResponse: 200 with ``message`` and ``detected_processes`` (lists
        keyed by ``cpu``, ``gpu`` and ``memory``); 500 with the error text on
        unexpected failure.
    """
    def launch(target_pid, resource_type):
        # One daemon thread per (process, resource type).
        threading.Thread(
            target=monitor_process,
            args=(target_pid, resource_type),
            daemon=True
        ).start()

    try:
        # Clean up monitors whose process has already stopped.
        for model in (HighCPUProcess, HighGPUProcess, HighMemoryProcess):
            model.objects.filter(is_active=True, status=0).update(is_active=False)

        results = {
            'cpu': [],
            'gpu': [],
            'memory': []
        }
        attrs = ['pid', 'name', 'cpu_percent']

        # First pass primes the CPU counters; the second pass, one second
        # later, yields the usage over that window.
        baseline_pids = {proc.info['pid'] for proc in psutil.process_iter(attrs)}
        time.sleep(1)

        for proc in psutil.process_iter(attrs):
            pid = proc.info['pid']
            if pid not in baseline_pids:
                # Started during the window: no baseline, no valid reading.
                continue

            name = proc.info['name']
            # psutil stores None when it may not read the value; the original
            # ``None > 200`` raised TypeError and failed the whole request.
            cpu_percent = proc.info['cpu_percent'] or 0.0

            try:
                # CPU usage (> 200 % means more than two cores).
                if cpu_percent > 200:
                    if not HighCPUProcess.objects.filter(pid=pid, is_active=True).exists():
                        launch(pid, 'cpu')
                        results['cpu'].append({
                            'pid': pid,
                            'name': name,
                            'cpu_usage': cpu_percent
                        })

                # Resident memory (> 20 GB).
                memory_gb = proc.memory_info().rss / (1024 * 1024 * 1024)
                if memory_gb > 20:
                    if not HighMemoryProcess.objects.filter(pid=pid, is_active=True).exists():
                        launch(pid, 'memory')
                        results['memory'].append({
                            'pid': pid,
                            'name': name,
                            'memory_usage': memory_gb
                        })

                # GPU usage (> 50 %).
                gpu_usage, gpu_memory = get_process_gpu_usage(pid)
                if gpu_usage > 50:
                    if not HighGPUProcess.objects.filter(pid=pid, is_active=True).exists():
                        launch(pid, 'gpu')
                        results['gpu'].append({
                            'pid': pid,
                            'name': name,
                            'gpu_usage': gpu_usage,
                            'gpu_memory': gpu_memory
                        })

            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue

        return JsonResponse({
            "message": "已开始监控检测到的高资源进程",
            "detected_processes": results
        })

    except Exception as e:
        return JsonResponse({"error": str(e)}, status=500)