初始化项目:进程监控系统

This commit is contained in:
wanjia 2025-02-18 19:40:58 +08:00
commit ccf1f45fe1
40 changed files with 1580 additions and 0 deletions

11
.idea/.gitignore generated vendored Normal file
View File

@ -0,0 +1,11 @@
*.pyc
__pycache__/
.env
*.log
logs/
.idea/
.vscode/
*.sqlite3
db.sqlite3
venv/
.venv/

30
.idea/automated_task_monitor.iml generated Normal file
View File

@ -0,0 +1,30 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="FacetManager">
<facet type="django" name="Django">
<configuration>
<option name="rootFolder" value="$MODULE_DIR$" />
<option name="settingsModule" value="automated_task_monitor/settings.py" />
<option name="manageScript" value="$MODULE_DIR$/manage.py" />
<option name="environment" value="&lt;map/&gt;" />
<option name="doNotUseTestRunner" value="false" />
<option name="trackFilePattern" value="migrations" />
</configuration>
</facet>
</component>
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/.venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TemplatesService">
<option name="TEMPLATE_CONFIGURATION" value="Django" />
<option name="TEMPLATE_FOLDERS">
<list>
<option value="$MODULE_DIR$/../automated_task_monitor\templates" />
</list>
</option>
</component>
</module>

View File

@ -0,0 +1,93 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="HtmlUnknownAttribute" enabled="true" level="WARNING" enabled_by_default="true">
<option name="myValues">
<value>
<list size="6">
<item index="0" class="java.lang.String" itemvalue="folder" />
<item index="1" class="java.lang.String" itemvalue="selected_folder" />
<item index="2" class="java.lang.String" itemvalue="%}selected{%" />
<item index="3" class="java.lang.String" itemvalue="{%" />
<item index="4" class="java.lang.String" itemvalue="endif" />
<item index="5" class="java.lang.String" itemvalue="%}" />
</list>
</value>
</option>
<option name="myCustomValuesEnabled" value="true" />
</inspection_tool>
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="50">
<item index="0" class="java.lang.String" itemvalue="tensorflow" />
<item index="1" class="java.lang.String" itemvalue="ntplib" />
<item index="2" class="java.lang.String" itemvalue="sklearn" />
<item index="3" class="java.lang.String" itemvalue="mysqlclient" />
<item index="4" class="java.lang.String" itemvalue="async-timeout" />
<item index="5" class="java.lang.String" itemvalue="cffi" />
<item index="6" class="java.lang.String" itemvalue="python-dotenv" />
<item index="7" class="java.lang.String" itemvalue="pycparser" />
<item index="8" class="java.lang.String" itemvalue="alibabacloud-openapi-util" />
<item index="9" class="java.lang.String" itemvalue="frozenlist" />
<item index="10" class="java.lang.String" itemvalue="alibabacloud-credentials" />
<item index="11" class="java.lang.String" itemvalue="rest-framework-simplejwt" />
<item index="12" class="java.lang.String" itemvalue="alibabacloud-tea-console" />
<item index="13" class="java.lang.String" itemvalue="certifi" />
<item index="14" class="java.lang.String" itemvalue="urllib3" />
<item index="15" class="java.lang.String" itemvalue="alibabacloud-market20151101" />
<item index="16" class="java.lang.String" itemvalue="django-cors-headers" />
<item index="17" class="java.lang.String" itemvalue="alibabacloud-tea-util" />
<item index="18" class="java.lang.String" itemvalue="alibabacloud-endpoint-util" />
<item index="19" class="java.lang.String" itemvalue="tzdata" />
<item index="20" class="java.lang.String" itemvalue="cryptography" />
<item index="21" class="java.lang.String" itemvalue="alibabacloud-darabonba-env" />
<item index="22" class="java.lang.String" itemvalue="alibabacloud-tea-openapi" />
<item index="23" class="java.lang.String" itemvalue="attrs" />
<item index="24" class="java.lang.String" itemvalue="jmespath" />
<item index="25" class="java.lang.String" itemvalue="alibabacloud-tea" />
<item index="26" class="java.lang.String" itemvalue="alibabacloud-gateway-spi" />
<item index="27" class="java.lang.String" itemvalue="django-rest-framework" />
<item index="28" class="java.lang.String" itemvalue="Django" />
<item index="29" class="java.lang.String" itemvalue="typing_extensions" />
<item index="30" class="java.lang.String" itemvalue="alibabacloud-tea-xml" />
<item index="31" class="java.lang.String" itemvalue="aiohttp" />
<item index="32" class="java.lang.String" itemvalue="multidict" />
<item index="33" class="java.lang.String" itemvalue="yarl" />
<item index="34" class="java.lang.String" itemvalue="aiosignal" />
<item index="35" class="java.lang.String" itemvalue="idna" />
<item index="36" class="java.lang.String" itemvalue="PyJWT" />
<item index="37" class="java.lang.String" itemvalue="rsa" />
<item index="38" class="java.lang.String" itemvalue="msal" />
<item index="39" class="java.lang.String" itemvalue="django-sslserver" />
<item index="40" class="java.lang.String" itemvalue="six" />
<item index="41" class="java.lang.String" itemvalue="asgiref" />
<item index="42" class="java.lang.String" itemvalue="ecdsa" />
<item index="43" class="java.lang.String" itemvalue="pyasn1" />
<item index="44" class="java.lang.String" itemvalue="requests" />
<item index="45" class="java.lang.String" itemvalue="sqlparse" />
<item index="46" class="java.lang.String" itemvalue="charset-normalizer" />
<item index="47" class="java.lang.String" itemvalue="pytz" />
<item index="48" class="java.lang.String" itemvalue="djangorestframework" />
<item index="49" class="java.lang.String" itemvalue="python-jose" />
</list>
</value>
</option>
</inspection_tool>
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N806" />
</list>
</option>
</inspection_tool>
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredIdentifiers">
<list>
<option value="MarKet.urls.Market" />
</list>
</option>
</inspection_tool>
</profile>
</component>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

7
.idea/misc.xml generated Normal file
View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.10 (automated_task_monitor)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (automated_task_monitor)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml generated Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/automated_task_monitor.iml" filepath="$PROJECT_DIR$/.idea/automated_task_monitor.iml" />
</modules>
</component>
</project>

96
.idea/workspace.xml generated Normal file
View File

@ -0,0 +1,96 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="AutoImportSettings">
<option name="autoReloadType" value="SELECTIVE" />
</component>
<component name="ChangeListManager">
<list default="true" id="12eb891e-5234-4edf-9697-982a38ae2f63" name="更改" comment="" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="ProjectColorInfo">{
&quot;customColor&quot;: &quot;&quot;,
&quot;associatedIndex&quot;: 6
}</component>
<component name="ProjectId" id="2ssNC7QAMi8oN4Kv7IADr6QUnp8" />
<component name="ProjectViewState">
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent">{
&quot;keyToString&quot;: {
&quot;RunOnceActivity.OpenDjangoStructureViewOnStart&quot;: &quot;true&quot;,
&quot;RunOnceActivity.OpenProjectViewOnStart&quot;: &quot;true&quot;,
&quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
&quot;last_opened_file_path&quot;: &quot;D:/pythonProject/myproject/.venv/Scripts&quot;,
&quot;node.js.detected.package.eslint&quot;: &quot;true&quot;,
&quot;node.js.detected.package.tslint&quot;: &quot;true&quot;,
&quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;,
&quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;,
&quot;nodejs_package_manager_path&quot;: &quot;npm&quot;,
&quot;vue.rearranger.settings.migration&quot;: &quot;true&quot;
}
}</component>
<component name="RunManager">
<configuration name="automated_task_monitor" type="Python.DjangoServer" factoryName="Django server">
<module name="automated_task_monitor" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="launchJavascriptDebuger" value="false" />
<option name="port" value="8000" />
<option name="host" value="localhost" />
<option name="additionalOptions" value="" />
<option name="browserUrl" value="" />
<option name="runTestServer" value="false" />
<option name="runNoReload" value="false" />
<option name="useCustomRunCommand" value="false" />
<option name="customRunCommand" value="" />
<method v="2" />
</configuration>
</component>
<component name="SharedIndexes">
<attachedChunks>
<set>
<option value="bundled-python-sdk-09665e90c3a7-b11f5e8da5ad-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-233.15026.15" />
</set>
</attachedChunks>
</component>
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="应用程序级" UseSingleDictionary="true" transferred="true" />
<component name="TaskManager">
<task active="true" id="Default" summary="默认任务">
<changelist id="12eb891e-5234-4edf-9697-982a38ae2f63" name="更改" comment="" />
<created>1739240192225</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1739240192225</updated>
<workItem from="1739240213323" duration="67000" />
<workItem from="1739240288664" duration="1163000" />
<workItem from="1739774523802" duration="384000" />
<workItem from="1739775058347" duration="988000" />
<workItem from="1739864902383" duration="147000" />
<workItem from="1739869240489" duration="361000" />
</task>
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="3" />
</component>
</project>

View File

View File

@ -0,0 +1,16 @@
"""
ASGI config for automated_task_monitor project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
# Select this project's settings module unless DJANGO_SETTINGS_MODULE is
# already set in the environment (setdefault never overwrites).
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'automated_task_monitor.settings')
# Module-level ASGI callable, exposed under the name ``application``.
application = get_asgi_application()

View File

@ -0,0 +1,143 @@
"""
Django settings for automated_task_monitor project.
Generated by 'django-admin startproject' using Django 5.1.6.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.1/ref/settings/
"""
from pathlib import Path
import os

# Deployment-sensitive values below (secret key, debug flag, database
# credentials) can be overridden through environment variables. When a
# variable is unset, the value originally committed to this file is used, so
# existing development setups keep working unchanged.

# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# Set DJANGO_SECRET_KEY in production; the fallback is the development key.
SECRET_KEY = os.environ.get(
    'DJANGO_SECRET_KEY',
    'django-insecure-!1v8ca8ows01-*m5(&9)qgk5jc-my^q+4d+0)s_*^n&vne^&w9',
)

# SECURITY WARNING: don't run with debug turned on in production!
# Set DJANGO_DEBUG to 0/false/no/off to disable; defaults to enabled.
DEBUG = os.environ.get('DJANGO_DEBUG', 'True').strip().lower() in (
    '1', 'true', 'yes', 'on',
)

ALLOWED_HOSTS = [
    '127.0.0.1',  # local development
    'localhost',  # local development
    '81.69.223.133',  # server IP
]

# Monitoring configuration
MONITOR_INTERVAL = 60  # polling interval in seconds

# Application definition
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'monitor.apps.MonitorConfig',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

ROOT_URLCONF = 'automated_task_monitor.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [BASE_DIR / 'templates'],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'automated_task_monitor.wsgi.application'

# Database
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
# Credentials come from DJANGO_DB_* environment variables when present so
# that real passwords do not have to live in version control.
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.mysql',
        'NAME': os.environ.get('DJANGO_DB_NAME', 'monitoring_db'),
        'USER': os.environ.get('DJANGO_DB_USER', 'root'),
        'PASSWORD': os.environ.get('DJANGO_DB_PASSWORD', '123456'),
        'HOST': os.environ.get('DJANGO_DB_HOST', 'localhost'),
        'PORT': os.environ.get('DJANGO_DB_PORT', '3306'),
    }
}

# Password validation
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]

# Internationalization
# https://docs.djangoproject.com/en/5.1/topics/i18n/
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'Asia/Shanghai'
USE_I18N = True
USE_TZ = True

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.1/howto/static-files/
STATIC_URL = 'static/'

# Default primary key field type
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'

# Process-monitor logs live in the logs/ folder under the project root.
LOG_DIR = os.path.join(BASE_DIR, 'logs', 'process_monitor')
# Make sure the log directory exists as soon as settings are loaded.
os.makedirs(LOG_DIR, exist_ok=True)

View File

@ -0,0 +1,23 @@
"""
URL configuration for automated_task_monitor project.
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/5.1/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path, include
# Project-level URL routing table.
urlpatterns = [
    # Django admin site.
    path('admin/', admin.site.urls),
    # Everything under monitor/ is delegated to the monitor app's URLconf.
    path('monitor/', include('monitor.urls')),
]

View File

@ -0,0 +1,16 @@
"""
WSGI config for automated_task_monitor project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
# Select this project's settings module unless DJANGO_SETTINGS_MODULE is
# already set in the environment (setdefault never overwrites).
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'automated_task_monitor.settings')
# Module-level WSGI callable, exposed under the name ``application``.
application = get_wsgi_application()

View File

@ -0,0 +1,67 @@
2025-02-18 17:35:58,750 - INFO - 开始监控进程:
├─ 进程名称: Lark.exe
├─ 进程ID: 22572
├─ 监控类型: gpu
├─ 监控间隔: 60秒
└─ 开始时间: 2025-02-18 17:35:58
2025-02-18 17:35:58,755 - INFO - 创建新的监控记录
2025-02-18 17:35:58,765 - INFO - 进程状态:
├─ 状态码: 1
├─ 状态描述: running
├─ 监控状态: 活跃
└─ 运行时长: 7:43:55.278315
2025-02-18 17:35:59,119 - INFO - GPU信息
├─ 状态: 正常
├─ 使用率: 19.0%
└─ 显存使用: 1499.0MB
2025-02-18 17:35:59,134 - INFO - 资源使用情况 - 2025-02-18 17:35:59:
├─ CPU信息
│ ├─ 使用率: 0.0%
│ ├─ 用户态时间: 4.9s
│ ├─ 内核态时间: 5.2s
│ ├─ CPU核心数: 16
│ ├─ CPU频率: 4001.0MHz
│ └─ 上下文切换: 1470919/0
├─ 内存信息
│ ├─ 物理内存: 145.8MB (0.9%)
│ ├─ 虚拟内存: 138.7MB
│ ├─ 内存映射: 99个
│ ├─ 系统内存使用: 79.8%
│ └─ 交换空间使用: 10.0%
├─ GPU信息
│ ├─ 状态: 正常
│ ├─ 使用率: 19.0%
│ └─ 显存使用: 1499.0MB
├─ IO信息
│ ├─ 读取: 112.8MB (135217次)
2025-02-18 17:36:59,149 - INFO - 进程状态:
├─ 状态码: 1
├─ 状态描述: running
├─ 监控状态: 活跃
└─ 运行时长: 7:44:55.662074
2025-02-18 17:36:59,272 - INFO - GPU信息
├─ 状态: 正常
├─ 使用率: 1.0%
└─ 显存使用: 1486.0MB
2025-02-18 17:36:59,284 - INFO - 资源使用情况 - 2025-02-18 17:36:59:
├─ CPU信息
│ ├─ 使用率: 0.0%
│ ├─ 用户态时间: 4.9s
│ ├─ 内核态时间: 5.2s
│ ├─ CPU核心数: 16
│ ├─ CPU频率: 4001.0MHz
│ └─ 上下文切换: 1473027/0
├─ 内存信息
│ ├─ 物理内存: 145.8MB (0.9%)
│ ├─ 虚拟内存: 138.5MB
│ ├─ 内存映射: 99个
│ ├─ 系统内存使用: 80.1%
│ └─ 交换空间使用: 10.0%
├─ GPU信息
│ ├─ 状态: 正常
│ ├─ 使用率: 1.0%
│ └─ 显存使用: 1486.0MB
├─ IO信息
│ ├─ 读取: 112.9MB (135263次)

View File

@ -0,0 +1,15 @@
2025-02-18 15:49:36,173 - INFO - 开始监控memory进程 pycharm64.exe (PID: 40128)
2025-02-18 15:49:37,621 - INFO - 内存使用情况:
- 物理内存: 2.05GB
- 虚拟内存: 1.87GB
- 交换空间: 1.45GB
- CPU: 1.5%
- GPU: 0.0%
2025-02-18 15:50:38,966 - INFO - 内存使用情况:
- 物理内存: 2.03GB
- 虚拟内存: 1.85GB
- 交换空间: 1.45GB
- CPU: 0.0%
- GPU: 0.0%
2025-02-18 15:51:38,972 - ERROR - 进程 40128 已终止
2025-02-18 15:51:38,975 - INFO - 监控已结束

22
manage.py Normal file
View File

@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys
def main():
    """Dispatch ``sys.argv`` to Django's management command runner.

    Raises:
        ImportError: If Django cannot be imported; the original import
            failure is chained as the cause.
    """
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'automated_task_monitor.settings')

    # Shown when Django itself is missing from the active environment.
    missing_django_hint = (
        "Couldn't import Django. Are you sure it's installed and "
        "available on your PYTHONPATH environment variable? Did you "
        "forget to activate a virtual environment?"
    )

    try:
        from django.core.management import execute_from_command_line
    except ImportError as import_failure:
        raise ImportError(missing_django_hint) from import_failure

    execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()

0
monitor/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

3
monitor/admin.py Normal file
View File

@ -0,0 +1,3 @@
from django.contrib import admin
# Register your models here.

9
monitor/apps.py Normal file
View File

@ -0,0 +1,9 @@
from django.apps import AppConfig
class MonitorConfig(AppConfig):
    """Application configuration for the ``monitor`` app."""

    name = 'monitor'
    default_auto_field = 'django.db.models.BigAutoField'

    def ready(self):
        """Startup hook; currently performs no work."""
        return None

View File

@ -0,0 +1,83 @@
# Generated by Django 5.1.6 on 2025-02-18 07:09
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration: creates the three process-monitoring models.

    HighCPUProcess, HighGPUProcess and HighMemoryProcess share the same
    thirteen common fields; each adds one field of its own (cpu_cores,
    gpu_index and swap_usage respectively).
    """

    # First migration of the app, so it depends on nothing.
    initial = True

    dependencies = [
    ]

    operations = [
        # Records for processes monitored for CPU usage.
        migrations.CreateModel(
            name='HighCPUProcess',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('pid', models.IntegerField(verbose_name='进程ID')),
                ('process_name', models.CharField(max_length=255, verbose_name='进程名称')),
                ('log_path', models.CharField(max_length=255, verbose_name='日志路径')),
                ('is_active', models.BooleanField(default=True, verbose_name='是否活跃')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
                ('updated_at', models.DateTimeField(auto_now=True, verbose_name='更新时间')),
                ('status', models.IntegerField(default=1, help_text='1:运行中, 0:已停止', verbose_name='进程状态')),
                ('cpu_usage', models.FloatField(default=0, verbose_name='CPU使用率(%)')),
                ('memory_usage', models.FloatField(default=0, verbose_name='内存使用量(GB)')),
                ('gpu_usage', models.FloatField(default=0, verbose_name='GPU使用率(%)')),
                ('gpu_memory', models.FloatField(default=0, verbose_name='GPU显存使用量(MB)')),
                ('virtual_memory', models.FloatField(default=0, verbose_name='虚拟内存使用量(GB)')),
                # Model-specific field.
                ('cpu_cores', models.IntegerField(default=0, verbose_name='使用的CPU核心数')),
            ],
            options={
                'verbose_name': '高CPU进程',
                'verbose_name_plural': '高CPU进程',
            },
        ),
        # Records for processes monitored for GPU usage.
        migrations.CreateModel(
            name='HighGPUProcess',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('pid', models.IntegerField(verbose_name='进程ID')),
                ('process_name', models.CharField(max_length=255, verbose_name='进程名称')),
                ('log_path', models.CharField(max_length=255, verbose_name='日志路径')),
                ('is_active', models.BooleanField(default=True, verbose_name='是否活跃')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
                ('updated_at', models.DateTimeField(auto_now=True, verbose_name='更新时间')),
                ('status', models.IntegerField(default=1, help_text='1:运行中, 0:已停止', verbose_name='进程状态')),
                ('cpu_usage', models.FloatField(default=0, verbose_name='CPU使用率(%)')),
                ('memory_usage', models.FloatField(default=0, verbose_name='内存使用量(GB)')),
                ('gpu_usage', models.FloatField(default=0, verbose_name='GPU使用率(%)')),
                ('gpu_memory', models.FloatField(default=0, verbose_name='GPU显存使用量(MB)')),
                ('virtual_memory', models.FloatField(default=0, verbose_name='虚拟内存使用量(GB)')),
                # Model-specific field.
                ('gpu_index', models.IntegerField(default=0, verbose_name='GPU设备索引')),
            ],
            options={
                'verbose_name': '高GPU进程',
                'verbose_name_plural': '高GPU进程',
            },
        ),
        # Records for processes monitored for memory usage.
        migrations.CreateModel(
            name='HighMemoryProcess',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('pid', models.IntegerField(verbose_name='进程ID')),
                ('process_name', models.CharField(max_length=255, verbose_name='进程名称')),
                ('log_path', models.CharField(max_length=255, verbose_name='日志路径')),
                ('is_active', models.BooleanField(default=True, verbose_name='是否活跃')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
                ('updated_at', models.DateTimeField(auto_now=True, verbose_name='更新时间')),
                ('status', models.IntegerField(default=1, help_text='1:运行中, 0:已停止', verbose_name='进程状态')),
                ('cpu_usage', models.FloatField(default=0, verbose_name='CPU使用率(%)')),
                ('memory_usage', models.FloatField(default=0, verbose_name='内存使用量(GB)')),
                ('gpu_usage', models.FloatField(default=0, verbose_name='GPU使用率(%)')),
                ('gpu_memory', models.FloatField(default=0, verbose_name='GPU显存使用量(MB)')),
                ('virtual_memory', models.FloatField(default=0, verbose_name='虚拟内存使用量(GB)')),
                # Model-specific field.
                ('swap_usage', models.FloatField(default=0, verbose_name='交换空间使用量(GB)')),
            ],
            options={
                'verbose_name': '高内存进程',
                'verbose_name_plural': '高内存进程',
            },
        ),
    ]

View File

45
monitor/models.py Normal file
View File

@ -0,0 +1,45 @@
from django.db import models
class BaseProcessMonitor(models.Model):
    """Abstract base model with the fields common to every monitoring record."""

    # Identity of the monitored process and where its log is written.
    pid = models.IntegerField("进程ID")
    process_name = models.CharField("进程名称", max_length=255)
    log_path = models.CharField("日志路径", max_length=255)

    # Monitoring lifecycle.
    is_active = models.BooleanField("是否活跃", default=True)
    created_at = models.DateTimeField("创建时间", auto_now_add=True)
    updated_at = models.DateTimeField("更新时间", auto_now=True)
    status = models.IntegerField("进程状态", default=1, help_text="1:运行中, 0:已停止")

    # Basic resource figures.
    cpu_usage = models.FloatField("CPU使用率(%)", default=0)
    memory_usage = models.FloatField("内存使用量(GB)", default=0)
    gpu_usage = models.FloatField("GPU使用率(%)", default=0)
    gpu_memory = models.FloatField("GPU显存使用量(MB)", default=0)
    virtual_memory = models.FloatField("虚拟内存使用量(GB)", default=0)

    class Meta:
        abstract = True
class HighCPUProcess(BaseProcessMonitor):
    """Monitoring record for a process tracked for high CPU usage."""

    cpu_cores = models.IntegerField("使用的CPU核心数", default=0)

    class Meta:
        verbose_name = "高CPU进程"
        verbose_name_plural = "高CPU进程"
class HighGPUProcess(BaseProcessMonitor):
    """Monitoring record for a process tracked for high GPU usage."""

    gpu_index = models.IntegerField("GPU设备索引", default=0)

    class Meta:
        verbose_name = "高GPU进程"
        verbose_name_plural = "高GPU进程"
class HighMemoryProcess(BaseProcessMonitor):
    """Monitoring record for a process tracked for high memory usage."""

    swap_usage = models.FloatField("交换空间使用量(GB)", default=0)

    class Meta:
        verbose_name = "高内存进程"
        verbose_name_plural = "高内存进程"

406
monitor/tasks.py Normal file
View File

@ -0,0 +1,406 @@
import psutil
import time
from .models import HighCPUProcess, HighGPUProcess, HighMemoryProcess
import GPUtil # 需要安装 gputil
from django.utils import timezone
import os
import logging
from datetime import datetime
from django.db import models
from django.conf import settings
def setup_logger(pid, resource_type):
    """Return a dedicated file logger for one monitored process.

    The log file is
    ``<BASE_DIR>/logs/process_monitor/<resource_type>/process_<pid>_<YYYYMMDD>.log``
    and is written as UTF-8. The directory is created when missing.

    Args:
        pid: ID of the monitored process; used in the logger and file names.
        resource_type: Monitoring category; used as the sub-directory name
            and as the logger-name prefix.

    Returns:
        tuple: ``(logger, log_file)`` where ``log_file`` is the absolute path
        the logger writes to.
    """
    log_dir = os.path.join(settings.BASE_DIR, 'logs', 'process_monitor', resource_type)
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.abspath(
        os.path.join(log_dir, f'process_{pid}_{datetime.now().strftime("%Y%m%d")}.log')
    )

    logger = logging.getLogger(f'{resource_type}_process_{pid}')
    logger.setLevel(logging.INFO)

    # logging caches loggers by name, so a logger created on an earlier day
    # still carries a handler for that day's file. Drop such handlers so the
    # returned path is always the file that is actually written.
    for stale_handler in list(logger.handlers):
        if (isinstance(stale_handler, logging.FileHandler)
                and stale_handler.baseFilename != log_file):
            logger.removeHandler(stale_handler)
            stale_handler.close()

    if not logger.handlers:
        # UTF-8 is required because the log messages contain non-ASCII text.
        handler = logging.FileHandler(log_file, encoding='utf-8')
        handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(handler)

    return logger, log_file
def get_process_gpu_usage(pid):
    """Query ``nvidia-smi`` for the GPU usage attributed to one process.

    Args:
        pid: ID of the process to look up among the GPU compute apps.

    Returns:
        tuple: ``(gpu_util, gpu_memory, status)``.

        * ``gpu_util`` - highest ``utilization.gpu`` value across all GPUs.
          This is a device-level figure, not a per-process one; it is only
          reported when the process holds GPU memory.
        * ``gpu_memory`` - sum of ``used_memory`` reported for the process.
        * ``status`` - one of "正常", "未使用GPU", "无GPU", "驱动未安装",
          "获取超时", "命令错误", "获取错误", "检测失败".

        On every non-"正常" status the two numbers are 0. The function never
        raises; failures are logged to the ``gpu_check`` logger.
    """
    import logging
    import shutil
    import subprocess

    # Bound before the try so every except clause can log safely.
    logger = logging.getLogger('gpu_check')
    try:
        # shutil.which works on every platform; the external `which`
        # command does not exist on Windows.
        smi_path = shutil.which('nvidia-smi')
        if smi_path is None:
            logger.error("nvidia-smi 命令错误: 未找到 nvidia-smi")
            return 0, 0, "驱动未安装"
        logger.info(f"找到 nvidia-smi: {smi_path}")

        # List the GPUs.
        try:
            gpu_list = subprocess.check_output(
                [smi_path, '-L'], timeout=5
            ).decode('utf-8').strip()
        except subprocess.TimeoutExpired as e:
            logger.error(f"GPU命令超时: {str(e)}")
            return 0, 0, "获取超时"
        except (subprocess.SubprocessError, FileNotFoundError) as e:
            logger.error(f"nvidia-smi 命令错误: {str(e)}")
            return 0, 0, "驱动未安装"

        logger.info(f"检测到的GPU:\n{gpu_list}")
        if not gpu_list:
            logger.warning("未检测到NVIDIA GPU")
            return 0, 0, "无GPU"

        try:
            csv_format = '--format=csv,noheader,nounits'

            # Per-GPU figures.
            gpu_output = subprocess.check_output(
                [smi_path, csv_format, '--query-gpu=index,utilization.gpu,memory.used'],
                timeout=5,
            ).decode('utf-8').strip()
            logger.info(f"GPU基本信息:\n{gpu_output}")

            # Per-process figures.
            process_output = subprocess.check_output(
                [smi_path, csv_format, '--query-compute-apps=gpu_uuid,pid,used_memory'],
                timeout=5,
            ).decode('utf-8').strip()
            logger.info(f"GPU进程信息:\n{process_output}")

            # No process is using any GPU.
            if not process_output:
                logger.info(f"进程 {pid} 未使用GPU")
                return 0, 0, "未使用GPU"

            # Sum the memory of rows whose pid column equals the requested
            # PID exactly; a substring test would let 12 match 3123.
            wanted_pid = str(pid)
            total_memory = 0
            for line in process_output.split('\n'):
                fields = [field.strip() for field in line.split(',')]
                if len(fields) != 3:
                    if line.strip():
                        logger.error(f"解析GPU进程数据错误: 字段数量不符, 数据: {line}")
                    continue
                if fields[1] != wanted_pid:
                    continue
                try:
                    total_memory += float(fields[2])
                except ValueError as e:
                    logger.error(f"解析GPU进程数据错误: {str(e)}, 数据: {line}")

            if total_memory <= 0:
                logger.info(f"进程 {pid} 未使用GPU")
                return 0, 0, "未使用GPU"

            # The process holds GPU memory: report the busiest GPU.
            gpu_util = 0
            for line in gpu_output.split('\n'):
                try:
                    _, util, _ = line.split(',')
                    gpu_util = max(gpu_util, float(util.strip()))
                except (ValueError, IndexError) as e:
                    logger.error(f"解析GPU使用率错误: {str(e)}, 数据: {line}")

            logger.info(
                f"进程 {pid} GPU使用情况:\n"
                f"├─ GPU使用率: {gpu_util:.1f}%\n"
                f"└─ 显存使用: {total_memory:.1f}MB"
            )
            return gpu_util, total_memory, "正常"
        except subprocess.TimeoutExpired as e:
            logger.error(f"GPU命令超时: {str(e)}")
            return 0, 0, "获取超时"
        except subprocess.CalledProcessError as e:
            logger.error(f"GPU命令执行错误: {str(e)}, 输出: {e.output.decode('utf-8') if e.output else 'None'}")
            return 0, 0, "命令错误"
        except Exception as e:
            logger.error(f"获取GPU信息时发生错误: {str(e)}")
            return 0, 0, "获取错误"
    except Exception as e:
        # Last-resort guard so a monitoring loop is never broken by GPU probing.
        logger.error(f"GPU检测失败: {str(e)}")
        return 0, 0, "检测失败"
def monitor_process(pid, resource_type):
    """Poll one process and record its resource usage until monitoring stops.

    A record of the model matching ``resource_type`` is created, then updated
    and saved once per polling interval (``settings.MONITOR_INTERVAL``,
    default 60 seconds). Each cycle is also written to the per-process log
    returned by ``setup_logger``.

    The loop ends when any of these holds:

    * the record's ``is_active`` flag was cleared in the database,
    * the record was deleted,
    * the process is no longer running or reports a stopped-like status,
    * the current thread's ``do_run`` attribute was set to a false value.

    Errors inside a cycle are logged and retried after 5 seconds. On exit the
    record is marked inactive.

    Args:
        pid: ID of the process to monitor.
        resource_type: ``'cpu'``, ``'gpu'`` or ``'memory'``; selects the model.
            Any other value is logged as an error and the function returns.

    Returns:
        None.
    """
    import threading

    monitor_interval = getattr(settings, 'MONITOR_INTERVAL', 60)  # seconds
    logger, log_file = setup_logger(pid, resource_type)
    monitor = None
    try:
        process = psutil.Process(pid)
        logger.info(
            f"开始监控进程:\n"
            f"├─ 进程名称: {process.name()}\n"
            f"├─ 进程ID: {pid}\n"
            f"├─ 监控类型: {resource_type}\n"
            f"├─ 监控间隔: {monitor_interval}\n"
            f"└─ 开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        )

        # Pick the model for this resource type.
        ModelClass = {
            'cpu': HighCPUProcess,
            'gpu': HighGPUProcess,
            'memory': HighMemoryProcess
        }.get(resource_type)
        if not ModelClass:
            logger.error(f"未知的资源类型: {resource_type}")
            return

        # Create and persist the monitoring record.
        monitor = ModelClass.objects.create(
            pid=pid,
            process_name=process.name(),
            log_path=log_file,
            status=1,
            is_active=True
        )
        logger.info("创建新的监控记录")

        # Name the thread and give it a do_run flag that ends the loop
        # once it is set to a false value.
        current_thread = threading.current_thread()
        current_thread.name = f'monitor_{pid}_{resource_type}'
        current_thread.do_run = True

        # psutil status -> stored status code (1 = running, 0 = stopped).
        # Statuses not listed here are treated as running.
        status_codes = {
            psutil.STATUS_RUNNING: 1,
            psutil.STATUS_SLEEPING: 1,
            psutil.STATUS_DISK_SLEEP: 1,
            psutil.STATUS_STOPPED: 0,
            psutil.STATUS_TRACING_STOP: 0,
            psutil.STATUS_ZOMBIE: 0,
            psutil.STATUS_DEAD: 0,
            psutil.STATUS_WAKING: 1,
            psutil.STATUS_IDLE: 1,
        }
        # Short status from get_process_gpu_usage -> text written to the log.
        gpu_status_map = {
            "无GPU": "未检测到GPU",
            "驱动未安装": "GPU驱动未安装",
            "未使用GPU": "进程未使用GPU",
            "获取超时": "GPU信息获取超时",
            "命令错误": "GPU命令执行错误",
            "获取错误": "GPU信息获取错误",
            "检测失败": "GPU检测失败",
            "正常": "正常"
        }

        while current_thread.do_run:
            try:
                # Has monitoring been stopped manually?
                try:
                    monitor.refresh_from_db()
                    if not monitor.is_active:
                        logger.info("监控被手动停止")
                        break
                except ModelClass.DoesNotExist:
                    logger.error("监控记录已被删除")
                    break

                # Is the process still alive?
                if not process.is_running():
                    logger.warning(f"进程 {pid} 已终止")
                    monitor.status = 0
                    monitor.is_active = False
                    monitor.save()
                    break

                process_status = process.status()
                status_code = status_codes.get(process_status, 1)
                monitor.status = status_code

                logger.info(
                    f"进程状态:\n"
                    f"├─ 状态码: {status_code}\n"
                    f"├─ 状态描述: {process_status}\n"
                    f"├─ 监控状态: {'活跃' if monitor.is_active else '已停止'}\n"
                    f"└─ 运行时长: {datetime.now() - datetime.fromtimestamp(process.create_time())}"
                )

                # Collect resource usage.
                with process.oneshot():
                    # CPU
                    cpu_percent = process.cpu_percent()
                    cpu_times = process.cpu_times()
                    cpu_num = psutil.cpu_count()
                    # psutil.cpu_freq() may return None where the frequency
                    # cannot be determined.
                    cpu_freq = psutil.cpu_freq()

                    # Memory
                    memory_info = process.memory_info()
                    memory_percent = process.memory_percent()
                    # memory_maps() is not available everywhere and can be
                    # denied; that must not abort the whole cycle.
                    try:
                        memory_maps = len(process.memory_maps())
                    except (psutil.AccessDenied, AttributeError, NotImplementedError):
                        memory_maps = None
                    virtual_memory = psutil.virtual_memory()
                    swap_memory = psutil.swap_memory()

                    # GPU
                    try:
                        gpu_usage, gpu_memory, gpu_status = get_process_gpu_usage(pid)
                        gpu_status_text = gpu_status_map.get(gpu_status, "未知状态")
                    except Exception as e:
                        logger.error(f"获取GPU信息失败: {str(e)}")
                        gpu_usage, gpu_memory, gpu_status_text = 0, 0, "异常"

                    logger.info(
                        f"GPU信息\n"
                        f"├─ 状态: {gpu_status_text}\n"
                        f"├─ 使用率: {gpu_usage:.1f}%\n"
                        f"└─ 显存使用: {gpu_memory:.1f}MB"
                    )

                    # IO
                    try:
                        io_counters = process.io_counters()
                    except (psutil.AccessDenied, AttributeError):
                        io_counters = None

                    # Network
                    try:
                        net_connections = len(process.connections())
                        net_io = psutil.net_io_counters()
                    except (psutil.AccessDenied, AttributeError):
                        net_connections = 0
                        net_io = None

                    # Other process figures
                    num_threads = process.num_threads()
                    num_fds = process.num_fds() if hasattr(process, 'num_fds') else 0
                    ctx_switches = process.num_ctx_switches()

                # Update the monitoring record.
                monitor.cpu_usage = cpu_percent
                monitor.memory_usage = memory_info.rss / (1024 * 1024 * 1024)  # GB
                monitor.virtual_memory = memory_info.vms / (1024 * 1024 * 1024)  # GB
                monitor.gpu_usage = gpu_usage
                monitor.gpu_memory = gpu_memory

                # Build the report line by line. Mixing implicit string
                # concatenation with `x if c else y` would make the
                # conditional swallow every preceding line, truncating it.
                unavailable = "无法获取"
                report_lines = [
                    f"资源使用情况 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}:",
                    "├─ CPU信息",
                    f"│ ├─ 使用率: {cpu_percent:.1f}%",
                    f"│ ├─ 用户态时间: {cpu_times.user:.1f}s",
                    f"│ ├─ 内核态时间: {cpu_times.system:.1f}s",
                    f"│ ├─ CPU核心数: {cpu_num}",
                    (f"│ ├─ CPU频率: {cpu_freq.current:.1f}MHz" if cpu_freq
                     else f"│ ├─ CPU频率: {unavailable}"),
                    f"│ └─ 上下文切换: {ctx_switches.voluntary}/{ctx_switches.involuntary}",
                    "├─ 内存信息",
                    f"│ ├─ 物理内存: {memory_info.rss/1024/1024:.1f}MB ({memory_percent:.1f}%)",
                    f"│ ├─ 虚拟内存: {memory_info.vms/1024/1024:.1f}MB",
                    f"│ ├─ 内存映射: {memory_maps if memory_maps is not None else unavailable}",
                    f"│ ├─ 系统内存使用: {virtual_memory.percent:.1f}%",
                    f"│ └─ 交换空间使用: {swap_memory.percent:.1f}%",
                    "├─ GPU信息",
                    f"│ ├─ 状态: {gpu_status_text}",
                    f"│ ├─ 使用率: {gpu_usage:.1f}%",
                    f"│ └─ 显存使用: {gpu_memory:.1f}MB",
                    "├─ IO信息",
                    (f"│ ├─ 读取: {io_counters.read_bytes/1024/1024:.1f}MB ({io_counters.read_count}次)"
                     if io_counters else "│ ├─ 读取: 无法获取"),
                    (f"│ └─ 写入: {io_counters.write_bytes/1024/1024:.1f}MB ({io_counters.write_count}次)"
                     if io_counters else "│ └─ 写入: 无法获取"),
                    "├─ 网络信息",
                    f"│ ├─ 连接数: {net_connections}",
                    (f"│ ├─ 发送: {net_io.bytes_sent/1024/1024:.1f}MB"
                     if net_io else "│ ├─ 发送: 无法获取"),
                    (f"│ └─ 接收: {net_io.bytes_recv/1024/1024:.1f}MB"
                     if net_io else "│ └─ 接收: 无法获取"),
                    "└─ 其他信息",
                    f" ├─ 线程数: {num_threads}",
                    f" ├─ 文件描述符: {num_fds}",
                    f" └─ 子进程数: {len(process.children())}",
                ]
                logger.info("\n".join(report_lines))

                # A stopped-like status ends monitoring.
                if status_code == 0:
                    monitor.is_active = False
                    monitor.save()
                    logger.info(f"进程状态变为 {process_status},停止监控")
                    break

                monitor.save()
                time.sleep(monitor_interval)
            except Exception as e:
                logger.error(f"监控出错: {str(e)}")
                logger.exception("详细错误信息:")
                time.sleep(5)  # brief back-off before retrying
                continue
    except Exception as e:
        logger.error(f"监控初始化失败: {str(e)}")
        logger.exception("详细错误信息:")
    finally:
        if monitor:
            try:
                monitor.refresh_from_db()
                monitor.is_active = False
                if not process.is_running():
                    monitor.status = 0
                monitor.save()
            except (ModelClass.DoesNotExist, psutil.NoSuchProcess):
                # Best effort: the record or the process is already gone.
                pass
            logger.info(
                f"监控结束:\n"
                f"├─ 进程名称: {monitor.process_name}\n"
                f"├─ 进程ID: {monitor.pid}\n"
                f"├─ 监控类型: {resource_type}\n"
                f"├─ 进程状态: {'运行中' if monitor.status == 1 else '已终止'}\n"
                f"├─ 监控状态: 已停止\n"
                f"├─ 开始时间: {monitor.created_at}\n"
                f"└─ 结束时间: {monitor.updated_at}"
            )
def get_high_resource_processes():
    """Scan all running processes and group the resource-heavy ones.

    A process is reported under 'cpu' when its CPU usage exceeds 200%,
    under 'memory' when its resident set size exceeds 20 GB, and under
    'gpu' when the GPU usage reported by get_process_gpu_usage exceeds 50%.
    The same process may appear under several keys.

    Returns:
        dict: keys 'cpu', 'gpu' and 'memory', each mapping to a list of dicts
        describing the processes that crossed that threshold.
    """
    high_resource_procs = {
        'cpu': [],
        'gpu': [],
        'memory': []
    }
    for proc in psutil.process_iter(['pid', 'name']):
        try:
            process = psutil.Process(proc.info['pid'])
            # Check CPU usage.
            # NOTE: interval=1.0 is a blocking sample, so the scan waits about
            # one second for every process it visits.
            cpu_percent = process.cpu_percent(interval=1.0)
            if cpu_percent > 200:  # more than two cores in use
                high_resource_procs['cpu'].append({
                    'pid': proc.info['pid'],
                    'name': proc.info['name'],
                    'cpu_usage': cpu_percent,
                    'cpu_cores': cpu_percent / 100
                })
            # Check memory usage (resident set size, converted to GB).
            memory_gb = process.memory_info().rss / (1024 * 1024 * 1024)
            if memory_gb > 20:  # more than 20 GB of memory in use
                high_resource_procs['memory'].append({
                    'pid': proc.info['pid'],
                    'name': proc.info['name'],
                    'memory_usage': memory_gb
                })
            # Check GPU usage.
            gpu_index, gpu_usage, gpu_status = get_process_gpu_usage(proc.info['pid'])
            if gpu_usage > 50:  # GPU usage above 50%
                high_resource_procs['gpu'].append({
                    'pid': proc.info['pid'],
                    'name': proc.info['name'],
                    'gpu_usage': gpu_usage,
                    # NOTE(review): `gpu_memory` is never assigned in this
                    # function, so reaching this line raises NameError, which
                    # the except clause below does not catch. Presumably one of
                    # the values returned by get_process_gpu_usage was meant --
                    # confirm against that helper's return tuple.
                    'gpu_memory': gpu_memory,
                    'gpu_index': gpu_index
                })
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            # The process vanished or cannot be inspected; skip it.
            continue
    return high_resource_procs

3
monitor/tests.py Normal file
View File

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

9
monitor/urls.py Normal file
View File

@ -0,0 +1,9 @@
from django.urls import path
from . import views
# URL routes of the monitor app. Each route maps to the function of the same
# name in the sibling `views` module; the two `<int:pid>` routes pass the
# process ID from the URL to the view as an integer.
urlpatterns = [
    path('start_monitor/', views.start_monitor, name='start_monitor'),
    path('stop_monitor/<int:pid>/', views.stop_monitor, name='stop_monitor'),
    path('metrics/<int:pid>/', views.get_process_metrics, name='get_process_metrics'),
    path('auto_detect/', views.auto_detect_monitor, name='auto_detect_monitor'),
]

469
monitor/views.py Normal file
View File

@ -0,0 +1,469 @@
from django.http import JsonResponse
from .tasks import monitor_process, get_process_gpu_usage
import threading
import psutil
from .models import HighCPUProcess, HighGPUProcess, HighMemoryProcess
import logging
import os
from datetime import datetime
import time
import nvidia_smi
from django.utils import timezone
from django.views.decorators.http import require_http_methods
from django.views.decorators.csrf import csrf_exempt
# Logging configuration: every monitored process gets its own log file
# under this directory, which is created at import time if missing.
LOG_DIR = 'logs/process_monitor'
os.makedirs(LOG_DIR, exist_ok=True)


def setup_logger(pid):
    """Return a dedicated file logger for the process identified by ``pid``.

    The logger is named ``process_<pid>`` and writes to a file in LOG_DIR
    whose name carries the pid and the current date (``YYYYMMDD``).

    ``logging.getLogger`` returns the same logger object for the same name,
    so a file handler is attached only when the logger does not already
    have one for today's file. Without that check every call would add
    another handler, duplicating each log line and leaking a file handle.

    Args:
        pid: Process ID used in both the logger name and the file name.

    Returns:
        tuple: ``(logger, log_file)`` where ``log_file`` is the path of the
        log file the logger writes to.
    """
    log_file = os.path.join(LOG_DIR, f'process_{pid}_{datetime.now().strftime("%Y%m%d")}.log')
    logger = logging.getLogger(f'process_{pid}')
    logger.setLevel(logging.INFO)

    # FileHandler stores its target as an absolute path in `baseFilename`.
    target = os.path.abspath(log_file)
    already_attached = any(
        isinstance(existing, logging.FileHandler) and existing.baseFilename == target
        for existing in logger.handlers
    )
    if not already_attached:
        handler = logging.FileHandler(log_file)
        handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logger.addHandler(handler)
    return logger, log_file
def get_process_by_name(process_name):
    """Return the PIDs of all processes whose name contains ``process_name``.

    The match is a case-insensitive substring test.

    Args:
        process_name: Text to look for in each process name.

    Returns:
        list: PIDs of the matching processes (empty when nothing matches).
    """
    needle = process_name.lower()
    pids = []
    for proc in psutil.process_iter(['pid', 'name']):
        try:
            # process_iter() fills in None for attributes it is not allowed
            # to read, so a process may have no usable name; skip those
            # instead of failing on None.lower().
            name = proc.info['name']
            if not name:
                continue
            if needle in name.lower():
                pids.append(proc.info['pid'])
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    return pids
def get_process_gpu_usage(pid):
    """Return ``(gpu_usage, gpu_memory)`` for the process ``pid``.

    Every GPU is inspected through NVML. For the first GPU that lists
    ``pid`` among its compute processes the function returns:

    * ``gpu_usage`` -- the utilisation rate of that GPU as reported for the
      whole device (it is read from the device handle, not per process);
    * ``gpu_memory`` -- the GPU memory used by the process, in MB.

    NOTE: this definition shadows the ``get_process_gpu_usage`` imported
    from ``.tasks`` at the top of the module.

    Args:
        pid: Process ID to look for.

    Returns:
        tuple: ``(gpu_usage, gpu_memory)``; ``(0, 0)`` when the process is
        not using any GPU or when NVML cannot be queried.
    """
    try:
        nvidia_smi.nvmlInit()
        device_count = nvidia_smi.nvmlDeviceGetCount()
        for index in range(device_count):
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(index)
            for gpu_proc in nvidia_smi.nvmlDeviceGetComputeRunningProcesses(handle):
                if gpu_proc.pid != pid:
                    continue
                # NVML reports None when the per-process memory figure is
                # unavailable; treat that as 0 so the utilisation reading
                # is still returned.
                used_bytes = gpu_proc.usedGpuMemory or 0
                gpu_memory = used_bytes / 1024 / 1024  # bytes -> MB
                gpu_usage = nvidia_smi.nvmlDeviceGetUtilizationRates(handle).gpu
                return gpu_usage, gpu_memory
        return 0, 0
    except Exception:
        # Best effort: a missing driver or GPU must not break the caller.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return 0, 0
    finally:
        try:
            nvidia_smi.nvmlShutdown()
        except Exception:
            # Shutdown fails when nvmlInit() itself failed; nothing to release.
            pass
def get_high_resource_processes():
    """List the processes whose GPU usage is above 50%.

    Returns:
        list: one dict per matching process with the keys 'pid', 'name',
        'memory_gb' (resident memory in GB, rounded to 2 decimals),
        'gpu_usage' and 'gpu_memory' (GPU memory in MB, rounded to 2
        decimals).
    """
    matches = []
    for candidate in psutil.process_iter(['pid', 'name']):
        try:
            # Resident memory of the process, converted to GB.
            handle = psutil.Process(candidate.info['pid'])
            rss_gb = handle.memory_info().rss / (1024 * 1024 * 1024)
            # GPU figures for the same process.
            gpu_usage, gpu_memory = get_process_gpu_usage(candidate.info['pid'])
            # Only processes above the 50% GPU threshold are reported.
            if not gpu_usage > 50:
                continue
            entry = {
                'pid': candidate.info['pid'],
                'name': candidate.info['name'],
                'memory_gb': round(rss_gb, 2),
                'gpu_usage': gpu_usage,
                'gpu_memory': round(gpu_memory, 2),  # GPU memory in use (MB)
            }
            matches.append(entry)
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            continue
    return matches
def auto_detect_high_resource_processes():
    """Periodically detect new resource-heavy processes and start monitoring them.

    Runs forever: each pass compares the processes returned by
    get_high_resource_processes() with the PIDs already recorded as active,
    creates a database record and a monitor thread for every new one, then
    sleeps for five minutes. Any exception is printed and the loop retries
    after one minute. The function never returns.
    """
    while True:
        try:
            # NOTE(review): `ProcessMonitor` is not among the models imported
            # at the top of this module (HighCPUProcess, HighGPUProcess,
            # HighMemoryProcess). Unless it is defined elsewhere, this line
            # raises NameError on every pass, which the except clause below
            # turns into a printed error and a 60-second retry -- confirm.
            existing_pids = set(ProcessMonitor.objects.filter(is_active=True).values_list('pid', flat=True))
            high_resource_procs = get_high_resource_processes()
            for proc in high_resource_procs:
                if proc['pid'] not in existing_pids:
                    logger, log_file = setup_logger(proc['pid'])
                    # Record the process in the database.
                    monitor = ProcessMonitor.objects.create(
                        pid=proc['pid'],
                        process_name=proc['name'],
                        cpu_usage=0,
                        memory_usage=proc['memory_gb'],
                        network_usage=0,
                        log_path=log_file
                    )
                    # Start the monitoring thread.
                    # NOTE(review): the other callers in this module pass
                    # (pid, resource_type) to monitor_process, while this call
                    # passes (pid, logger) -- verify against the signature of
                    # monitor_process in tasks.py.
                    threading.Thread(
                        target=monitor_process,
                        args=(proc['pid'], logger)
                    ).start()
                    print(f"发现新的高资源进程: {proc['name']} (PID: {proc['pid']})")
            # Run the detection once every 5 minutes.
            time.sleep(300)
        except Exception as e:
            print(f"自动检测出错: {str(e)}")
            time.sleep(60)  # wait 1 minute after an error before retrying
def start_monitor(request):
    """Start monitoring one process, or auto-detect and monitor heavy processes.

    Query parameters:
        pid: Optional process ID. When present only that process is handled.
            When absent every running process is scanned and those above the
            CPU (>200%), memory (>20 GB) or GPU (>50%) thresholds get a
            monitor thread, unless the process already has an active monitor
            of any type.
        type: Resource to monitor for ``pid``: ``cpu``, ``gpu``, ``memory``
            or ``all`` (default). Ignored when ``pid`` is absent.

    Each monitor runs ``monitor_process(pid, resource_type)`` in a daemon
    thread.

    Returns:
        JsonResponse: 200 with a summary of what was started; 400 for an
        invalid ``pid``, an unsupported ``type`` or a monitor of the
        requested type that is already running; 404 when ``pid`` does not
        exist; 500 for any other failure.
    """
    pid = request.GET.get('pid')
    resource_type = request.GET.get('type', 'all')  # cpu, gpu, memory, all
    models_by_type = {
        'cpu': HighCPUProcess,
        'gpu': HighGPUProcess,
        'memory': HighMemoryProcess,
    }
    try:
        if pid:
            # Reject malformed input up front instead of reporting it as a
            # server error.
            try:
                pid = int(pid)
            except ValueError:
                return JsonResponse({"error": f"无效的PID: {pid}"}, status=400)
            if pid < 0:
                return JsonResponse({"error": f"无效的PID: {pid}"}, status=400)
            # An unknown type must not reach monitor_process.
            if resource_type != 'all' and resource_type not in models_by_type:
                return JsonResponse({
                    "error": f"不支持的资源类型: {resource_type}"
                }, status=400)
            try:
                process = psutil.Process(pid)
                process_name = process.name()
            except psutil.NoSuchProcess:
                return JsonResponse({"error": f"进程 {pid} 不存在"}, status=404)

            # Which monitor types are already active for this process.
            monitors = {
                rtype: model.objects.filter(pid=pid, is_active=True).exists()
                for rtype, model in models_by_type.items()
            }

            # Start the requested monitor(s).
            results = []
            if resource_type == 'all':
                for rtype, is_monitored in monitors.items():
                    if is_monitored:
                        results.append(f"{rtype}监控已在运行")
                        continue
                    threading.Thread(
                        target=monitor_process,
                        args=(pid, rtype),
                        daemon=True
                    ).start()
                    results.append(f"已启动{rtype}监控")
            elif monitors[resource_type]:
                return JsonResponse({"error": f"进程 {pid} 已在{resource_type}监控中"}, status=400)
            else:
                threading.Thread(
                    target=monitor_process,
                    args=(pid, resource_type),
                    daemon=True
                ).start()
                results.append(f"已启动{resource_type}监控")
            return JsonResponse({
                "message": f"开始监控进程 {process_name} (PID: {pid})",
                "results": results
            })

        # No pid given: auto-detect resource-heavy processes.
        high_resource_procs = {
            'cpu': [],
            'gpu': [],
            'memory': []
        }
        for proc in psutil.process_iter(['pid', 'name']):
            try:
                process = psutil.Process(proc.info['pid'])
                # CPU usage (>200% means more than two cores in use).
                # NOTE: interval=1.0 blocks about one second per process.
                cpu_percent = process.cpu_percent(interval=1.0)
                if cpu_percent > 200:
                    high_resource_procs['cpu'].append(process)
                # Memory usage (>20 GB resident).
                memory_gb = process.memory_info().rss / (1024 * 1024 * 1024)
                if memory_gb > 20:
                    high_resource_procs['memory'].append(process)
                # GPU usage (>50%).
                gpu_usage, gpu_memory = get_process_gpu_usage(process.pid)
                if gpu_usage > 50:
                    high_resource_procs['gpu'].append(process)
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue

        # Start a monitor for every detected process that has none yet.
        results = {
            'cpu': [],
            'gpu': [],
            'memory': []
        }
        for rtype, processes in high_resource_procs.items():
            for proc in processes:
                already_monitored = any(
                    model.objects.filter(pid=proc.pid, is_active=True).exists()
                    for model in models_by_type.values()
                )
                if already_monitored:
                    continue
                # Read the name first: if the process has exited since the
                # scan there is nothing left to monitor.
                try:
                    proc_name = proc.name()
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    continue
                threading.Thread(
                    target=monitor_process,
                    args=(proc.pid, rtype),
                    daemon=True
                ).start()
                results[rtype].append({
                    'pid': proc.pid,
                    'name': proc_name
                })
        return JsonResponse({
            "message": "开始监控高资源进程",
            "processes": results
        })
    except Exception as e:
        return JsonResponse({"error": str(e)}, status=500)
@csrf_exempt
@require_http_methods(["POST"])
def stop_monitor(request, pid):
    """Stop monitoring the process ``pid`` (POST only, CSRF-exempt).

    Marks the matching active monitor records as inactive. The recorded
    process status is left untouched: only the monitoring stops.

    Query parameters:
        type: ``cpu``, ``gpu``, ``memory`` or ``all`` (default) -- which
            monitor records to stop.

    Args:
        request: The incoming HTTP request.
        pid: Process ID taken from the URL.

    Returns:
        JsonResponse: 200 with the number of stopped monitors; 400 for an
        unsupported ``type``; 404 when no active monitor record matches;
        500 for any other failure.
    """
    resource_type = request.GET.get('type', 'all')  # resource type from the query string
    models_by_type = {
        'cpu': HighCPUProcess,
        'gpu': HighGPUProcess,
        'memory': HighMemoryProcess,
    }
    try:
        # Pick the models to stop according to the resource type.
        if resource_type == 'all':
            selected_models = list(models_by_type.values())
        elif resource_type in models_by_type:
            selected_models = [models_by_type[resource_type]]
        else:
            return JsonResponse({
                "error": f"不支持的资源类型: {resource_type}"
            }, status=400)

        monitors = []
        for model in selected_models:
            monitors.extend(model.objects.filter(pid=pid, is_active=True))
        if not monitors:
            return JsonResponse({
                "error": f"未找到进程 {pid}{resource_type}监控记录"
            }, status=404)

        # Update every matching record.
        for monitor in monitors:
            # Only the monitoring flag changes, not the process status.
            monitor.is_active = False
            monitor.save()
            # Record the stop in the per-monitor logger.
            logger = logging.getLogger(f'{monitor.__class__.__name__.lower()}_{pid}')
            logger.info(
                f"手动停止监控:\n"
                f"├─ 进程ID: {pid}\n"
                f"├─ 监控类型: {monitor.__class__.__name__}\n"
                f"├─ 进程状态: {'运行中' if monitor.status == 1 else '已终止'}\n"
                f"├─ 开始时间: {monitor.created_at}\n"
                f"└─ 停止时间: {timezone.now()}"
            )

        # Ask the related monitor threads to stop. One scan after the loop
        # is enough; it does not depend on the individual record.
        # NOTE(review): the threads started in this module are created
        # without a name, so none of them match this prefix here -- confirm
        # whether monitor threads are named `monitor_<pid>` anywhere.
        thread_prefix = f'monitor_{pid}'
        for thread in threading.enumerate():
            if thread.name.startswith(thread_prefix):
                thread.do_run = False

        return JsonResponse({
            "message": f"已停止对进程 {pid} 的监控",
            "stopped_monitors": len(monitors),
            "process_status": "运行中" if monitors[0].status == 1 else "已终止"
        })
    except Exception as e:
        return JsonResponse({
            "error": f"停止监控失败: {str(e)}"
        }, status=500)
def get_process_metrics(request, pid):
    """Return the stored monitoring metrics of the process ``pid``.

    Query parameters:
        type: ``cpu``, ``gpu``, ``memory`` or ``all`` (default) -- which
            monitor records to read.

    Args:
        request: The incoming HTTP request.
        pid: Process ID taken from the URL.

    Returns:
        JsonResponse: 200 with ``{"pid": ..., "metrics": {...}}`` keyed by
        resource type; 404 when no record was found (this includes an
        unknown ``type``); 500 for any other failure.
    """
    resource_type = request.GET.get('type', 'all')
    try:
        models_by_type = {
            'cpu': HighCPUProcess,
            'gpu': HighGPUProcess,
            'memory': HighMemoryProcess
        }
        # Extra attribute reported only for the given resource type.
        extra_field_by_type = {
            'cpu': 'cpu_cores',
            'gpu': 'gpu_index',
            'memory': 'swap_usage'
        }

        # Resource types to look up for this request.
        if resource_type == 'all':
            wanted_types = list(models_by_type)
        elif resource_type in models_by_type:
            wanted_types = [resource_type]
        else:
            wanted_types = []

        metrics = {}
        for rtype in wanted_types:
            model = models_by_type[rtype]
            try:
                record = model.objects.get(pid=pid)
                entry = {
                    'status': record.status,
                    'cpu_usage': record.cpu_usage,
                    'memory_usage': record.memory_usage,
                    'gpu_usage': record.gpu_usage,
                    'gpu_memory': record.gpu_memory,
                    'virtual_memory': record.virtual_memory
                }
                # Add the metric specific to this resource type.
                extra_field = extra_field_by_type[rtype]
                entry[extra_field] = getattr(record, extra_field)
                metrics[rtype] = entry
            except model.DoesNotExist:
                continue

        if metrics:
            return JsonResponse({
                "pid": pid,
                "metrics": metrics
            })
        return JsonResponse({"error": f"未找到PID为{pid}的监控记录"}, status=404)
    except Exception as e:
        return JsonResponse({"error": str(e)}, status=500)
def _start_monitor_if_untracked(model, pid, resource_type):
    """Start a monitor thread unless ``model`` has an active record for ``pid``.

    Returns True when a new daemon thread running
    ``monitor_process(pid, resource_type)`` was started, False otherwise.
    """
    if model.objects.filter(pid=pid, is_active=True).exists():
        return False
    threading.Thread(
        target=monitor_process,
        args=(pid, resource_type),
        daemon=True
    ).start()
    return True


def auto_detect_monitor(request):
    """Detect resource-heavy processes and start monitoring the new ones.

    Steps:
        1. Deactivate monitor records whose status is 0 but which are still
           flagged active.
        2. Take a first pass over all processes, wait one second, then take
           a second pass that reads the CPU usage.
        3. For every process seen in both passes, start a monitor when it
           exceeds a threshold (CPU >200%, memory >20 GB, GPU >50%) and has
           no active monitor of that type yet.

    Args:
        request: The incoming HTTP request (not inspected).

    Returns:
        JsonResponse: 200 with the processes for which a monitor was
        started, grouped by resource type; 500 on failure.
    """
    try:
        # Clean up monitors that have already stopped.
        HighCPUProcess.objects.filter(is_active=True, status=0).update(is_active=False)
        HighGPUProcess.objects.filter(is_active=True, status=0).update(is_active=False)
        HighMemoryProcess.objects.filter(is_active=True, status=0).update(is_active=False)

        results = {
            'cpu': [],
            'gpu': [],
            'memory': []
        }

        # First pass: collect every process together with a CPU reading.
        processes = {}
        for proc in psutil.process_iter(['pid', 'name', 'cpu_percent']):
            try:
                processes[proc.info['pid']] = proc.info
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue

        # Wait one second so the second pass sees the change in CPU usage.
        time.sleep(1)

        # Second pass: detect the resource-heavy processes.
        for proc in psutil.process_iter(['pid', 'name', 'cpu_percent']):
            try:
                pid = proc.info['pid']
                if pid not in processes:
                    continue
                process = psutil.Process(pid)
                # process_iter() yields None for attributes it may not read;
                # comparing None with a number would raise TypeError and
                # fail the whole request, so treat it as no load.
                cpu_percent = proc.info['cpu_percent'] or 0

                # CPU usage (>200% means more than two cores in use).
                if cpu_percent > 200:
                    if _start_monitor_if_untracked(HighCPUProcess, pid, 'cpu'):
                        results['cpu'].append({
                            'pid': pid,
                            'name': process.name(),
                            'cpu_usage': cpu_percent
                        })

                # Memory usage (>20 GB resident).
                memory_gb = process.memory_info().rss / (1024 * 1024 * 1024)
                if memory_gb > 20:
                    if _start_monitor_if_untracked(HighMemoryProcess, pid, 'memory'):
                        results['memory'].append({
                            'pid': pid,
                            'name': process.name(),
                            'memory_usage': memory_gb
                        })

                # GPU usage (>50%).
                gpu_usage, gpu_memory = get_process_gpu_usage(pid)
                if gpu_usage > 50:
                    if _start_monitor_if_untracked(HighGPUProcess, pid, 'gpu'):
                        results['gpu'].append({
                            'pid': pid,
                            'name': process.name(),
                            'gpu_usage': gpu_usage,
                            'gpu_memory': gpu_memory
                        })
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue

        return JsonResponse({
            "message": "已开始监控检测到的高资源进程",
            "detected_processes": results
        })
    except Exception as e:
        return JsonResponse({"error": str(e)}, status=500)

BIN
requirements.txt Normal file

Binary file not shown.